Skip to content


apiRequest(url, token)

apiRequest To make api requests to the GitHub API

@url - the url for the API @token - GitHub API token

Source code in src/core/
def apiRequest(url, token):
    """
    apiRequest
    Send an authenticated GET request to the GitHub API.

    @url - the url for the API
    @token - GitHub API token

    Returns the response object from ``requests.get`` unchanged; the HTTP
    status code is not inspected here, so callers must check it themselves.
    """
    header = {'Authorization': f'token {token}'}
    return requests.get(url, headers=header)

classify_hunk(class_patch, class_buggy)

classify_hunk To classify a hunk

@class_patch @class_buggy

Source code in src/core/
def classify_hunk(class_patch, class_buggy):
    """
    classify_hunk
    Combine the patch-side and buggy-side classifications of a hunk.

    @class_patch - classification obtained for the patch side ('' if none)
    @class_buggy - classification obtained for the buggy side ('' if none)

    Returns the combined class as a string. When exactly one side is empty
    the other side's class is returned; when both are empty the result is
    'NA'. Combinations that are not listed in the table below yield ''.
    """
    # Missing classifications: fall back to whichever side is available.
    if class_patch == '' and class_buggy == '':
        return 'NA'
    if class_patch == '':
        return class_buggy
    if class_buggy == '':
        return class_patch

    # Keyed by (class_patch, class_buggy).
    # NOTE(review): ('MO', 'ED'), ('ED', 'ED') and ('MO', 'MO') are not
    # covered and therefore return '' - confirm that this is intended.
    combined = {
        ('ED', 'MO'): 'SP',
        ('MC', 'MO'): 'MO',
        ('ED', 'MC'): 'ED',
        ('MO', 'MC'): 'MO',
        ('MC', 'ED'): 'ED',
        ('MC', 'MC'): 'NA',
    }
    return combined.get((class_patch, class_buggy), '')


classify_patch To classify a patch based on the hunks

@hunk_classifications - the classifications for the different hunks in the .diff of a file changed in a PR

Source code in src/core/
def classify_patch(hunk_classifications):
    """
    classify_patch
    Classify a patch based on the classes of its hunks.

    @hunk_classifications - the classifications for the different hunks in
                            the .diff of a file changed in a PR

    Returns 'NA' when no hunk is classified as 'ED', 'MO' or 'SP' (this
    includes an empty input); otherwise one of 'ED', 'MO' or 'SP'.
    Labels other than 'ED', 'MO', 'NA' and 'SP' are ignored.
    """
    totals = {'ED': 0, 'MO': 0, 'NA': 0, 'SP': 0}
    for label in hunk_classifications:
        if label in totals:
            totals[label] += 1

    ed_total = totals['ED']
    mo_total = totals['MO']
    sp_total = totals['SP']

    if ed_total == 0 and mo_total == 0 and sp_total == 0:
        return 'NA'

    # Start from 'ED' and let the other classes take over when they have
    # at least as many (SP) or strictly more (MO) hunks.
    # NOTE(review): the starting class is assumed to be 'ED', mirroring
    # max_total = ED_total - confirm against the upstream implementation.
    final_class = 'ED'
    max_total = ed_total

    if max_total < mo_total:
        max_total = mo_total
        final_class = 'MO'
    elif max_total == mo_total:
        # Possible SPLIT case if ED == MO.
        # NOTE(review): this branch is assumed to select 'SP', as its
        # comment suggests - confirm against the upstream implementation.
        final_class = 'SP'

    # A tie with SP is resolved in favour of SP.
    if max_total <= sp_total:
        final_class = 'SP'

    return final_class

find_hunk_matches(match_items, _type, important_hashes, source_hashes)

find_hunk_matches To find the different matches between two hunks using the hashed values

@match_items @_type @important_hashes @source_hashes

Source code in src/core/
def find_hunk_matches(match_items, _type, important_hashes, source_hashes):
    """
    find_hunk_matches
    Find the matches between two hunks using the hashed values.

    @match_items - mapping: patch number -> sequence id -> {hash: matched}
    @_type - the hunk type being checked; 'MO' and 'ED' are classified,
             any other value leaves the class empty
    @important_hashes - not used by this variant
    @source_hashes - not used by this variant

    Returns a dict keyed by patch number. Each entry holds 'sequences'
    (per sequence: 'count' of truthy matches and 'hash_list' of its hash
    keys) and 'class', which is _type when every sequence seen so far has
    at least two matches and 'MC' otherwise.
    """
    seq_matches = {}
    for patch_nr, sequences in match_items.items():
        seq_matches[patch_nr] = {
            'sequences': {
                patch_seq: {
                    'count': sum(1 for matched in hashes.values() if matched),
                    'hash_list': list(hashes),
                }
                for patch_seq, hashes in sequences.items()
            },
            'class': '',
        }

    # NOTE(review): match_bool is never reset between patch numbers, so one
    # weak sequence also marks every later hunk as 'MC' - confirm intended.
    match_bool = True
    for entry in seq_matches.values():
        if any(seq['count'] < 2 for seq in entry['sequences'].values()):
            match_bool = False

        if _type in ('MO', 'ED'):
            entry['class'] = _type if match_bool else 'MC'
        else:
            entry['class'] = ''

    return seq_matches

find_hunk_matches_w_important_hash(match_items, _type, important_hashes, source_hashes)

find_hunk_matches_w_important_hash To find the different matches between two hunks using the hashed values and using the important hash feature

@match_items @_type @important_hashes @source_hashes

Source code in src/core/
def find_hunk_matches_w_important_hash(match_items, _type, important_hashes, source_hashes):
    """
    find_hunk_matches_w_important_hash
    Find the matches between two hunks using the hashed values and the
    important hash feature.

    @match_items - mapping: patch number -> sequence id -> {hash: matched}
    @_type - the hunk type being checked; 'MO' and 'ED' are classified,
             any other value leaves the class empty
    @important_hashes - nested iterable (lines -> line -> item) of important
                        items to look for inside the source n-grams
    @source_hashes - re-iterable of (ngram, hash_list) pairs

    Returns a dict keyed by patch number. Each entry holds 'sequences'
    (per sequence: 'count', 'hash_list' and an 'important' flag) and
    'class', which is _type on a match and 'MC' otherwise.
    """
    # Hash lists of every source n-gram that contains an important item.
    # NOTE(review): the collected value is assumed to be hash_list, because
    # sequences are later compared against this list - confirm upstream.
    important_hash_lists = []
    for lines in important_hashes:
        for line in lines:
            for each in line:
                for ngram, hash_list in source_hashes:
                    if each in ngram:
                        important_hash_lists.append(hash_list)

    seq_matches = {}
    for patch_nr, sequences in match_items.items():
        seq_matches[patch_nr] = {'sequences': {}, 'class': ''}
        for patch_seq, hashes in sequences.items():
            seq_hash_list = list(hashes)
            seq_matches[patch_nr]['sequences'][patch_seq] = {
                'count': sum(1 for matched in hashes.values() if matched),
                'hash_list': seq_hash_list,
                'important': seq_hash_list in important_hash_lists,
            }

    # With important hashes available a match must be proven by an important
    # sequence; without them every sequence needs at least two matches.
    use_important = bool(important_hash_lists)
    match_bool = not use_important

    # NOTE(review): match_bool carries over from one patch number to the
    # next instead of being reset - confirm that this is intended.
    for entry in seq_matches.values():
        for seq in entry['sequences'].values():
            if use_important:
                if seq['important'] and seq['count'] != 0:
                    match_bool = seq['count'] >= 2
            elif seq['count'] < 2:
                match_bool = False

        if _type in ('MO', 'ED'):
            entry['class'] = _type if match_bool else 'MC'
        else:
            entry['class'] = ''

    return seq_matches

getFileBeforePatch(repo_dir, mainline, sha, parent, pair_nr, pr_nr, file, fileDir, fileName, token)

getFileBeforePatch Extracts the buggy file using the GitHub API

@repo_dir - directory where to store the file @mainline - the source repository @sha - the commit sha-value that last changed the file @parent - the parent commit sha-value of the commit that last changed the file @pr_nr - the pull request number of the patch @file - the file path in the repository @fileDir - the sub directory where to store the file @fileName - a name to store the file @token - token needed for the GitHub API

Source code in src/core/
def getFileBeforePatch(repo_dir, mainline, sha, parent, pair_nr, pr_nr, file, fileDir, fileName, token):
    """
    getFileBeforePatch
    Extracts the buggy file using the GitHub API

    @repo_dir - directory where to store the file
    @mainline - the source repository
    @sha - the commit sha-value that last changed the file
    @parent - the parent commit sha-value of the commit that last changed the file
    @pair_nr - number of the repository pair; used as a path component
    @pr_nr - the pull request number of the patch
    @file - the file path in the repository
    @fileDir - the sub directory where to store the file
    @fileName - a name to store the file
    @token - token needed for the GitHub API

    Returns a tuple (storage directory + fileName, url the file was
    requested from).
    """
    fileBeforePatchDir = f'{repo_dir}{pair_nr}/{mainline}/{pr_nr}/{sha}/before_patch/{fileDir}'
    # The state before the patch is the file as it exists at the parent commit.
    beforePatch_url = f'{constant.GITHUB_RAW_URL}{mainline}/{parent}/{file}'
    fileBeforePatch = apiRequest(beforePatch_url, token)
    # NOTE(review): the response status is not checked before its content is
    # handed to commitloader.saveFile - confirm failed requests are handled.
    commitloader.saveFile(fileBeforePatch.content, fileBeforePatchDir, fileName)
    return fileBeforePatchDir + fileName, beforePatch_url


getFirstLastCommit Retrieve the first and the last commit of a pull request


Source code in src/core/
def getFirstLastCommit(pr_commits):
    """
    getFirstLastCommit
    Retrieve the first and the last commit of a pull request

    @pr_commits - iterable of mappings whose values are lists of commit
                  dicts; every commit dict needs a 'commit_date' entry

    Returns a tuple (first_commit, last_commit). Both are {} when no
    commits are found. Dates are compared with < and >, so they must be
    mutually comparable (presumably ISO-formatted strings - TODO confirm).
    """
    first_commit = {}
    last_commit = {}
    for files in pr_commits:
        for path in files:
            commits = list(files[path])
            if not commits:
                continue
            # Earliest and latest are tracked independently, so a single
            # commit is both first and last. min/max keep the first of
            # several commits that share the same date.
            # NOTE(review): the result reflects the last non-empty group
            # iterated, not a global min/max over all groups - confirm.
            first_commit = min(commits, key=lambda commit: commit['commit_date'])
            last_commit = max(commits, key=lambda commit: commit['commit_date'])
    return first_commit, last_commit


get_ext Extract the extension of a file

@file - the file from which to extract the extension

Source code in src/core/
def get_ext(file):
    """
    get_ext
    Extract the extension of a file

    @file - the file name or path from which to extract the extension

    Returns the text after the last '.'. A name without any '.' is
    returned unchanged, because splitting it yields a single element.
    """
    return file.split('.')[-1]

processPatch(patchPath, dstPath, typePatch)

processPatch To process a patch. This is done before being able to classify the patch

@patchPath - the path where the patch file is stored @dstPath - the path where the destination file is stored @typePatch - the kind of patch we are dealing with, buggy or fixed

Source code in src/core/
def processPatch(patchPath, dstPath, typePatch):
    """
    processPatch
    To process a patch
    This is done before being able to classify the patch

    @patchPath - the path where the patch file is stored
    @dstPath - the path where the destination file is stored
    @typePatch - the kind of patch we are dealing with, buggy or fixed

    Returns a tuple (patch loader, source loader), both after traversal.
    """
    patch = patchloader.PatchLoader()
    # The return values of both traverse calls are intentionally discarded;
    # callers work with the loader objects themselves.
    patch.traverse(patchPath, typePatch)

    source = sourceloader.SourceLoader()
    source.traverse(dstPath, patch)

    return patch, source

save_patch(storageDir, fileName, file, dup_count)

save_patch To save a patch file

@storageDir - The directory where to save the patch @fileName - The name of the file @file - The content of the file

Source code in src/core/
def save_patch(storageDir, fileName, file, dup_count):
    """
    save_patch
    To save a patch file

    @storageDir - The directory where to save the patch; it is joined with
                  fileName by plain concatenation, so it must end with a
                  path separator
    @fileName - The name of the file (without the '.patch' suffix)
    @file - The content of the file as a list of lines; the first two
            lines are skipped
    @dup_count - integer counter used to build unique names for duplicates

    Returns a tuple (path of the written patch, updated dup_count).
    """
    base_path = storageDir + fileName
    patch_path = base_path + '.patch'

    if not os.path.exists(storageDir):
        os.makedirs(storageDir)
    elif os.path.isfile(patch_path):
        # A patch with this name already exists: write to a numbered file
        # instead of overwriting it.
        # NOTE(review): the existence check now targets the '.patch' path
        # that is actually written - confirm overwriting was not relied on.
        patch_path = f'{base_path}_{dup_count}.patch'
        dup_count += 1

    # NOTE(review): the two skipped lines are presumably the '---'/'+++'
    # header lines of a unified diff - confirm against the callers.
    with open(patch_path, 'w') as patch_file:
        patch_file.writelines(file[2:])

    return patch_path, dup_count

unified_diff(before, after)

unified_diff To create a unified diff file

@before - The state of the file before changes @after - The state of the file after the changes

Source code in src/core/
def unified_diff(before, after):
    """
    unified_diff
    To create a unified diff file

    @before - Path of the file holding the state before the changes
    @after - Path of the file holding the state after the changes

    Returns the unified diff as a list of lines, including the leading
    '---' and '+++' header lines. The list is empty for identical files.
    """
    # Context managers make sure both handles are closed again.
    with open(before) as before_file:
        before_lines = before_file.readlines()
    with open(after) as after_file:
        after_lines = after_file.readlines()

    return list(difflib.unified_diff(before_lines, after_lines))