From 64ec0819b3aa31fb27b34f9129f0b953a74939f8 Mon Sep 17 00:00:00 2001
From: shanmukh5
Date: Fri, 6 Dec 2024 10:11:57 -0600
Subject: [PATCH] fixes title standardization issue, contents having tokens issue and adds logic for file path addition

---
 .../fix_title_contents_add_filepath.py        | 113 +++++++++++++++++++
 .../internal/repo_level_wrappers.py           |  18 +++--
 .../ray/src/repo_level_order_transform.py     |   9 +++
 3 files changed, 135 insertions(+), 5 deletions(-)
 create mode 100644 transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/fix_title_contents_add_filepath.py

diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/fix_title_contents_add_filepath.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/fix_title_contents_add_filepath.py
new file mode 100644
index 000000000..26e0a2c5c
--- /dev/null
+++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/fix_title_contents_add_filepath.py
@@ -0,0 +1,113 @@
+import pandas as pd
+
+# NOTE(review): the tag literals were empty strings in the reviewed revision,
+# which made every line match and emptied each document. The values below are
+# the StarCoder metadata tokens - confirm against the dataset before merging.
+STARCODER_METADATA_TAGS = ("<reponame>", "<filename>", "<gh_stars>")
+
+
+def fix_title_contents(title_column_name='title',
+                       repo_column_name='repo_name',
+                       dataset_column_name='dataset',
+                       contents_column_name='contents'):
+    """Build a table transform that fixes titles and contents.
+
+    The returned function takes a pandas DataFrame, or a table exposing
+    ``schema``/``to_pandas``/``from_pandas`` (presumably a pyarrow Table -
+    confirm against the caller), and returns a table of the same kind.
+    """
+
+    def fix_title_contents_func(table):
+        # fix_title/fix_contents rely on pandas accessors (.str, .loc), so
+        # any other table type is converted and converted back afterwards.
+        is_frame = isinstance(table, pd.DataFrame)
+        if not is_frame:
+            table_type, schema = type(table), table.schema
+            table = table.to_pandas()
+        table = fix_title(table, title_column_name, repo_column_name, dataset_column_name)
+        table = fix_contents(table, contents_column_name, dataset_column_name)
+        if not is_frame:
+            table = table_type.from_pandas(table, schema=schema)
+        return table
+
+    return fix_title_contents_func
+
+
+def fix_bluepile_github_title(title: str, repo_name: str):
+    """Drop the first path component of ``title`` when it starts with the repo's last name component.
+
+    Example - Filepath - dive-in-mall-master/mall-mini-program/components/hot-list/index.js, Repo Name - liuhuiAndroid/dive-in-mall
+    """
+    # Strip trailing slashes before splitting so "org/repo/" still yields
+    # "repo"; an empty name would match every title via startswith("").
+    actual_repo_name = repo_name.rstrip("/").split("/")[-1]
+    if actual_repo_name and title.startswith(actual_repo_name):
+        title = '/'.join(title.split("/")[1:])
+    return title
+
+
+def fix_abap_title(title: str, repo_name: str):
+    """Remove a leading ``repo_name`` plus the one character after it from ``title``."""
+    # Guard the empty name: startswith("") is always true and would drop
+    # the first character of every title.
+    if repo_name and title.startswith(repo_name):
+        title = title[len(repo_name)+1:]
+    return title
+
+
+def fix_title(table: pd.DataFrame, title_column_name: str, repo_column_name: str, dataset_column_name: str):
+    """Strip leading slashes, then apply the per-dataset title fixers.
+
+    Mutates ``table`` and returns it.
+    """
+    # Remove forward slash from start of file - also removes forward slash of abap dataset
+    table[title_column_name] = table[title_column_name].str.lstrip("/")
+    # Remove repo name, org name and branch name from title
+    bg_idx = (table[dataset_column_name] == "bluepile_github")
+    if bg_idx.any():
+        table.loc[bg_idx, title_column_name] = table.loc[bg_idx].apply(
+            lambda row: fix_bluepile_github_title(row[title_column_name], row[repo_column_name]), axis=1)
+    abap_idx = (table[dataset_column_name] == "abap")
+    if abap_idx.any():
+        table.loc[abap_idx, title_column_name] = table.loc[abap_idx].apply(
+            lambda row: fix_abap_title(row[title_column_name], row[repo_column_name]), axis=1)
+    return table
+
+
+def fix_startcoder_contents(contents: str):
+    """Return ``contents`` without the lines that start with a metadata tag."""
+    def is_tag_absent(line: str):
+        # str.startswith accepts a tuple, so one call checks every tag.
+        return not line.startswith(STARCODER_METADATA_TAGS)
+
+    contents_lines = contents.splitlines()
+    contents_lines = [ele for ele in contents_lines if is_tag_absent(ele)]
+    return '\n'.join(contents_lines)
+
+
+def fix_contents(table, contents_column_name, dataset_column_name):
+    """Apply fix_startcoder_contents to the rows whose dataset is "starcoder"."""
+    sc_idx = (table[dataset_column_name] == "starcoder")
+    if sc_idx.any():
+        table.loc[sc_idx, contents_column_name] = table.loc[sc_idx, contents_column_name].apply(fix_startcoder_contents)
+    return table
+
+
+def prepend_filename_token_filepath(title, repo, contents):
+    """Return ``contents`` preceded by a line holding the repo's last name component, "/" and ``title``.
+
+    NOTE(review): the name suggests a filename token should precede the path,
+    but none is emitted here - confirm the intended output format.
+    """
+    file_path = f"{repo.split('/')[-1]}/{title}"
+    result = f"{file_path}\n{contents}"
+    return result
+
+
+if __name__ == "__main__":
+    # Ad-hoc smoke run: python fix_title_contents_add_filepath.py <csv_path>
+    import sys
+
+    df = pd.read_csv(sys.argv[1])
+    df = fix_title_contents(title_column_name="filepath")(df)
+    print(df)
diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py
index 1e9a24993..fd9eedf32 100644
--- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py
+++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py
@@ -18,15 +18,14 @@
 import pandas as pd
 import pyarrow as pa
 from dpk_repo_level_order.internal.check_languages import (
-    get_dominant_language_repo_packing,
-)
+    get_dominant_language_repo_packing,)
 from dpk_repo_level_order.internal.sorting.semantic_ordering import (
     check_and_update_title,
     sort_by_path,
     sort_sem,
 )
 from func_timeout.exceptions import FunctionTimedOut
-
+from dpk_repo_level_order.internal.fix_title_contents_add_filepath import prepend_filename_token_filepath
 
 SORT_BY_PATH = "SORT_BY_PATH"
 SORT_SEMANTIC = "SORT_SEMANTIC"
@@ -160,7 +159,14 @@ def lang_distribution(grouping_column):
             lang_dist[k.as_py()] = v.as_py()
         return lang_dist
 
+    title_column_name = "title"
     super_row = table.column("contents").to_pylist()
+    file_titles = table.column(title_column_name).to_pylist()
+    repos = table.column(repo_column_name).to_pylist()
+    # Prefix each document with the path line built from its repo and title.
+    super_row = [
+        prepend_filename_token_filepath(*doc) for doc in zip(file_titles, repos, super_row)
+    ]
     repo_doc_ids = table.column("document_id").to_pylist()
 
     lang_dist = lang_distribution(language_column_name)
@@ -189,7 +195,7 @@ def lang_distribution(grouping_column):
     return new_table
 
 
-def get_transforming_func(sorting_func=None, superrows_func=None, filename_func=None, language_column_name="language"):
+def get_transforming_func(sorting_func=None, superrows_func=None, filename_func=None, language_column_name="language", fix_title_contents_func=None):
     """
     This function takes three optional functions as input and returns a
     function that can be applied to a pyarrow table and file name.
@@ -214,8 +220,10 @@ def get_transforming_func(sorting_func=None, superrows_func=None, filename_func=
 
     def my_transform(table, file_name):
         out_table = table
+        if fix_title_contents_func:
+            out_table = fix_title_contents_func(out_table)
         if sorting_func:
-            out_table = sorting_func(table, file_name)
+            out_table = sorting_func(out_table, file_name)
         if filename_func:
             file_name = filename_func(table, file_name)
         if superrows_func:
diff --git a/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py b/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py
index a43feda87..3fbeaa1a8 100644
--- a/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py
+++ b/transforms/code/repo_level_ordering/ray/src/repo_level_order_transform.py
@@ -259,6 +259,15 @@ def _prepare_mapper_function(self):
             get_dominant_language_func,
             get_transforming_func,
         )
+        from dpk_repo_level_order.internal.fix_title_contents_add_filepath import fix_title_contents
+
+        # Column names must be plain strings; a trailing comma would bind a 1-tuple.
+        title_column_name = "title"
+        repo_column_name = "repo_name"
+        dataset_column_name = "dataset"
+        contents_column_name = "contents"
+        fix_title_contents_func = fix_title_contents(title_column_name, repo_column_name, dataset_column_name, contents_column_name)
+        mapper_function_params = mapper_function_params | {"fix_title_contents_func": fix_title_contents_func}
 
         if self.sorting_enabled:
             self.logger.info(f"Repo level sorting is enabled. Algo: {self.sorting_algo}")