diff --git a/scraping/scraping/data_agent_extraction-ReAct.py b/scraping/scraping/data_agent_extraction-ReAct.py index 9f9b3f8e..ff34f170 100644 --- a/scraping/scraping/data_agent_extraction-ReAct.py +++ b/scraping/scraping/data_agent_extraction-ReAct.py @@ -24,43 +24,59 @@ def extract_data(text: str) -> str: You are a highly precise extraction tool. I will provide you with a scientific text and you have to extract source code and sequencing data URLs, and accession codes for sequencing data from it. Your task: 1. Extract **only** explicitly stated sequencing data accession codes and their respective database names (e.g., GEO, ENA, SRA, etc.). Do **not** infer or guess accession codes if they are not explicitly mentioned in the text. - 2. Identify **all** source code URLs, including URLs from **GitHub** and **Zenodo**. For Zenodo, include any URL explicitly mentioned, regardless of whether it refers to sequencing data or source code. + 2. Identify **all** sequencing data URLs, including URLs from **Zenodo**. For Zenodo, include any URL explicitly mentioned. Sequencing data URLs are of hosting services where sequencing data in the field of Bioinformatics is uploaded. + 3. Identify **all** source code URLs, including URLs from **GitHub** and **Zenodo**. Source code URLs are those where custom code or custom scripts are uploaded. Return the results **only** in valid JSON format, with no additional text or explanation. Do **not** include any introductory or closing statements. The returned JSON should strictly follow the following format: {{ - "accession codes": {{ + "accession_codes": {{ "database_name_1": ["accession_code_1", "accession_code_2"], "database_name_2": ["accession_code_3"] }}, - "source code": ["GitHub_URL", "Zenodo_URL"] + "sequencing_data": ["sequencing_data_URL1", "sequencing_data_URL2"], + "source_code": ["GitHub_URL", "Zenodo_URL"] }} Example Input: <> Raw sequencing data were deposited at the GEO under accession numbers GSE12345 and GSE67890. Sequencing data is available on Zenodo at https://zenodo.org/record/12345 and https://doi.org/10.1010/zenodo.1234567. Additional analysis scripts are hosted on Zenodo at https://zenodo.org/record/67890 and GitHub at https://github.com/example/repo. The following open-source code and databases were used in this article: JaBbA (v.1.1) (https://github.com/mskilab/JaBbA), - gGnome (commit c390d80) (https://github.com/mskilab/gGnome), AmpliconArchitect (https://github.com/virajbdeshpande/AmpliconArchitect) <> + gGnome (commit c390d80) (https://github.com/mskilab/gGnome), AmpliconArchitect (https://github.com/virajbdeshpande/AmpliconArchitect). High-throughput sequencing (HTS) of T cell receptor β (TRB) and T cell receptor ⍺ (TRA) dataset are available from Adaptive Biotechnologies (http://clients.adaptivebiotech.com/login) <> Example Output: {{ - "accession codes": {{ + "accession_codes": {{ "GEO": ["GSE12345", "GSE67890"] }}, - "source code": [ - "https://github.com/example/repo", + "sequencing_data": [ "https://zenodo.org/record/12345", "https://doi.org/10.1010/zenodo.1234567", + "http://clients.adaptivebiotech.com/login" + ], + "source_code": [ + "https://github.com/example/repo", "https://zenodo.org/record/67890" ] }} Strict Rules: - - Only return the JSON structure as shown, with **no additional text or explanation**. - - If no accession codes are available, leave the "accession codes" section empty: "accession codes": {{}}. - - If no source code URLs are available, leave the "source code" section empty: "source code": []. - - Accession codes must strictly follow common formats (e.g., GSE followed by numbers for GEO, PRJ followed by alphanumeric strings for SRA, etc.). **Only include accession codes that match valid formats**. - - Include **all Zenodo URLs** explicitly mentioned in the text, regardless of their stated purpose, and ensure that **no other URLs** (except GitHub or Zenodo) are included in the "source code" section. Do **NOT** include URLs related to libraries or softwares used. + 1. accession_codes: Include ONLY valid formats: + - Accession codes must strictly follow common formats (e.g., GSE followed by numbers for GEO, PRJ followed by alphanumeric strings for SRA, etc.) + - ONLY include accession codes that match valid formats + - If no accession codes are available, leave the "accession codes" section empty: "accession_codes": {{}}. + + 2. sequencing_data: Include URLs for databases: + - ONLY include sequencing data URLs that are explicitly stated in the input text + - If no sequencing data URLs are available, leave the "sequencing data" section empty: "sequencing_data": []. + + 3. source_code: Include ONLY: + - Github URLs with code/scripts + - Zenodo URLs with code/scripts + - Do **NOT** add any database/software/tool repositories + - If no source code URLs are available, leave the "source code" section empty: "source_code": []. + + - ONLY return the JSON structure as shown, with **no additional text or explanation**. Input Text to Process: <> {text} <> @@ -72,10 +88,12 @@ def extract_data(text: str) -> str: # logger.info(result) # Validate structure - if not isinstance(result.get('source_code', []), list): - raise ValueError("source_code must be a list") if not isinstance(result.get('accession_codes', {}), dict): raise ValueError("accession_codes must be a dictionary") + if not isinstance(result.get('sequencing_data', []), list): + raise ValueError("sequencing_data must be a list") + if not isinstance(result.get('source_code', []), list): + raise ValueError("source_code must be a list") # logger.info(f"Extraction result: {result}") return json.dumps(result) @@ -83,8 +101,9 @@ def extract_data(text: str) -> str: except Exception as e: # logger.error(f"Extraction error: {e}") return json.dumps({ - "source_code": [], - "accession_codes": {} + "accession_codes": {}, + "sequencing_data": [], + "source_code": [] }) @@ -96,12 +115,18 @@ def review_data(text: str, extracted_data: str) -> str: {{ "is_valid": true, - "validation": {{ + "validation": + {{ + "accession_codes": {{ + "valid_count": 0, + "invalid_count": 0 + }}, + "source_code": {{ "valid_count": 0, "invalid_count": 0 }}, - "accession_codes": {{ + "sequencing_data": {{ "valid_count": 0, "invalid_count": 0 }} @@ -116,6 +141,9 @@ def review_data(text: str, extracted_data: str) -> str: - SRA: SRP*, SRR*, SRS*, PRJNA* - EGA: EGAS*, EGAD* - GenBank: MN*, NC_* + + Strict rules: + - Do **NOT** add anything to the output that is not explicitly mentioned in the extracted_data. Input Text to validate against: {text} @@ -125,10 +153,12 @@ def review_data(text: str, extracted_data: str) -> str: Return the results **only** in valid JSON format with **no additional text**. Follow this format exactly: {{ - "source_code": ["GitHub_URL", "Zenodo_URL"], "accession_codes": {{ - "database_name": ["accession_code1", "accession_code2"] - }} + "database_name_1": [], + "database_name_2": [] + }}, + "sequencing_data": [], + "source_code": [] }} """ @@ -147,11 +177,16 @@ def review_data(text: str, extracted_data: str) -> str: return json.dumps({ "is_valid": False, "validation": { + "accession_codes": { + "valid_count": 0, + "invalid_count": 0 + }, + "source_code": { "valid_count": 0, "invalid_count": 0 }, - "accession_codes": { + "sequencing_data": { "valid_count": 0, "invalid_count": 0 } @@ -159,6 +194,7 @@ def review_data(text: str, extracted_data: str) -> str: }) def get_agent(): + extract_tool = FunctionTool.from_defaults( fn=extract_data, name="extract_data", @@ -172,7 +208,7 @@ def get_agent(): ) - system_prompt = """You are designed to extract and validate source code URLs and accession codes from scientific text. + system_prompt = """You are designed to extract and validate source code URLs and sequencing data URLs, and accession codes from scientific text. ## Tools You have access to these tools: @@ -234,8 +270,7 @@ def get_agent(): react_system_prompt = PromptTemplate(system_prompt) agent.update_prompts({"agent_worker:system_prompt": react_system_prompt}) - # reset if do not want to maintain history between each JSON entry (faster) - agent.reset() + return agent def process_paper(text: str) -> Dict[str, Any]: @@ -243,12 +278,17 @@ def process_paper(text: str) -> Dict[str, Any]: try: agent = get_agent() response = agent.chat(text) + + # reset if do not want to maintain history between each JSON entry (faster) + agent.reset() + return json.loads(response.response) except Exception as e: # logger.error(f"Processing error: {e}") return { - "source_code": [], - "accession_codes": {} + "accession_codes": {}, + "sequencing_data": [], + "source_code": [] } @@ -257,17 +297,64 @@ def process_paper(text: str) -> Dict[str, Any]: # sample_text = "The whole-exome sequencing and whole genome sequencing datasets generated during this study are available at the Sequence Read Archive (SRA: PRJNA715377). The scRNA-seq and CITE-seq datasets generated during this study are available at the EGA European Genome-Phenome Archive (EGA: EGAS00001004837). High-throughput sequencing (HTS) of T cell receptor \u03b2 (TRB) and T cell receptor \u237a (TRA) dataset are available from Adaptive Biotechnologies (http://clients.adaptivebiotech.com/login; Email: beziat-review@adaptivebiotech.com ; Password: beziat2021review). Primary CD4+ naive T cell RNA-Seq datasets generated during this study are available at the gene expression omnibus: GEO: GSE139299. Lesions RNA-Seq datasets generated during this study are available at the GEO: GSE139259. The assembled genomes are available from GenBank under the accession numbers GenBank: MN605988 and MN605989 for HPV-2 (from P1) and HPV-4 (from P2 and P3), respectively. This study did not generate any unique code. Any other piece of data will be available upon reasonable request." - sample_text = { - "DOI": "https://doi.org/10.1186/s12943-023-01876-x", - "header": "Availability of data and materials", - "paragraph": "\nThe raw sequence data reported in this paper have been deposited in the Genome Sequence Archive in National Genomics Data Center, China National Center for Bioinformation / Beijing Institute of Genomics, Chinese Academy of Sciences (GSA-Human: HRA004998) that are publicly accessible at https://ngdc.cncb.ac.cn/gsa-human. The integrated ST data from 22 tissue slices and their corresponding scRNA-seq data have been deposited in Synapse under the accession code syn51758773 (https://www.synapse.org/#!Synapse:syn51758773/). The analysis and visualization of CAFs in pan-cancer can be performed at https://chenxisd.shinyapps.io/pancaf/. Reanalyzed publicly available scRNA-seq data can be accessed from the GEO database under accession codes: GSE176078 [97], GSE166555 [98], GSE149614 [99], GSE184880 [100], GSE203612 [101], GSE207422 [18], GSE215120 [19], GSE181919 [7]. The scRNA-seq data of PRAD from Chen et al. were downloaded from http://www.pradcellatlas.com/. The scRNA-seq data of BRCA patients receiving pembrolizumab from Bassez et al. were downloaded from https://lambrechtslab.sites.vib.be/en/single-cell [41]. Reanalyzed publicly available ST data can be accessed from the GEO database under accession codes: GSE176078 [97], GSE203612 [101]. The ST data of CRC from Wu et al. were downloaded from http://www.cancerdiversity.asia/scCRLM/ [79]. The ST data for LIHC1, LIHC2, LIHC3, and LIHC4 from Wu et al. were downloaded from http://lifeome.net/supp/livercancer-st/data.htm [102]. The ST data for OVCA_10x were downloaded from https://www.10xgenomics.com/resources/datasets/human-ovarian-cancer-1-standard. The ST data for PRAD1 were downloaded from https://www.10xgenomics.com/resources/datasets/human-prostate-cancer-acinar-cell-carcinoma-ffpe-1-standard. The ST data for PRAD2 were downloaded from https://www.10xgenomics.com/resources/datasets/human-prostate-cancer-adenocarcinoma-with-invasive-carcinoma-ffpe-1-standard-1-3-0. Reanalyzed publicly available RNA-seq data of melanoma patients undergoing immunotherapy (Riaz cohort) can be accessed from the GEO database under accession code GSE91061 [85]. Reanalyzed publicly available RNA-seq data of melanoma patients undergoing immunotherapy (Gide cohort [86] and Nathanson cohort [87]) were downloaded from TIDE database (http://tide.dfci.harvard.edu/) [88].\n", - "PubMed_ID": "77777777" - } + # sample_text = { + # "DOI": "https://doi.org/10.1186/s12943-023-01876-x", + # "header": "Availability of data and materials", + # "paragraph": "\nThe raw sequence data reported in this paper have been deposited in the Genome Sequence Archive in National Genomics Data Center, China National Center for Bioinformation / Beijing Institute of Genomics, Chinese Academy of Sciences (GSA-Human: HRA004998) that are publicly accessible at https://ngdc.cncb.ac.cn/gsa-human. The integrated ST data from 22 tissue slices and their corresponding scRNA-seq data have been deposited in Synapse under the accession code syn51758773 (https://www.synapse.org/#!Synapse:syn51758773/). The analysis and visualization of CAFs in pan-cancer can be performed at https://chenxisd.shinyapps.io/pancaf/. Reanalyzed publicly available scRNA-seq data can be accessed from the GEO database under accession codes: GSE176078 [97], GSE166555 [98], GSE149614 [99], GSE184880 [100], GSE203612 [101], GSE207422 [18], GSE215120 [19], GSE181919 [7]. The scRNA-seq data of PRAD from Chen et al. were downloaded from http://www.pradcellatlas.com/. The scRNA-seq data of BRCA patients receiving pembrolizumab from Bassez et al. were downloaded from https://lambrechtslab.sites.vib.be/en/single-cell [41]. Reanalyzed publicly available ST data can be accessed from the GEO database under accession codes: GSE176078 [97], GSE203612 [101]. The ST data of CRC from Wu et al. were downloaded from http://www.cancerdiversity.asia/scCRLM/ [79]. The ST data for LIHC1, LIHC2, LIHC3, and LIHC4 from Wu et al. were downloaded from http://lifeome.net/supp/livercancer-st/data.htm [102]. The ST data for OVCA_10x were downloaded from https://www.10xgenomics.com/resources/datasets/human-ovarian-cancer-1-standard. The ST data for PRAD1 were downloaded from https://www.10xgenomics.com/resources/datasets/human-prostate-cancer-acinar-cell-carcinoma-ffpe-1-standard. The ST data for PRAD2 were downloaded from https://www.10xgenomics.com/resources/datasets/human-prostate-cancer-adenocarcinoma-with-invasive-carcinoma-ffpe-1-standard-1-3-0. Reanalyzed publicly available RNA-seq data of melanoma patients undergoing immunotherapy (Riaz cohort) can be accessed from the GEO database under accession code GSE91061 [85]. Reanalyzed publicly available RNA-seq data of melanoma patients undergoing immunotherapy (Gide cohort [86] and Nathanson cohort [87]) were downloaded from TIDE database (http://tide.dfci.harvard.edu/) [88].\n", + # "PubMed_ID": "77777777" + # } + + # sample_text={ + # "DOI": "https://doi.org/10.1186/s12943-024-02017-8", + # "header": "Data availability", + # "paragraph": "\nNo datasets were generated or analysed during the current study.\n", + # "PubMed_ID": "33333333" + # } # sample_text = "All BAM files and associated sample information are deposited in dbGaP under accession phs001087.v4.p1. Single-cell RNA sequencing datasets from this study have been deposited in the Sequence Read Archive with the accession number SUB14118668 (BioProject PRJNA1061081). The analysis files from single-cell RNA sequencing, ecDNA amplicon reconstructions, Incucyte live-cell images, immunofluorescence pRPA and γH2AX foci images, and the according analysis files have been deposited into Zenodo https://doi.org/10.5281/zenodo.11121869129. The TCGA/PCAWG pan-cancer human cancer data22 used for CCND1 amplification analysis was obtained and modified from the supplementary information of that article22. Data for the CCND1 pan-cancer survival analysis was obtained from cBioPortal (https://bit.ly/4cjAYof). Source data are provided with this paper. The following open-source code and databases were used in this article: JaBbA (v.1.1) (https://github.com/mskilab/JaBbA), gGnome (commit c390d80) (https://github.com/mskilab/gGnome), AmpliconArchitect (https://github.com/virajbdeshpande/AmpliconArchitect), FishHook (commit 06e3927) (https://github.com/mskilab/fishHook), MutationTimeR (v.1.00.2) (https://github.com/gerstung-lab/MutationTimeR), deconstructSigs (v.1.9) (https://github.com/raerose01/deconstructSigs), SigProfilerClusters (v.1.1.2) (https://github.com/AlexandrovLab/SigProfilerClusters), Pileup (v.0.15.0) (https://github.com/pysam-developers/pysam), ShortAlignmentMarking (v.2.1) (https://github.com/nygenome/nygc-short-alignment-marking), BWA-MEM (v.0.7.15) (https://github.com/lh3/bwa), GATK (v.4.1.0) (https://github.com/broadinstitute/gatk), MuTect2 (v.4.0.5.1) (https://github.com/broadinstitute/gatk), Strelka2 (v.2.9.3) (https://github.com/Illumina/strelka), Lancet (v.1.0.7) (https://github.com/nygenome/lancet), Svaba (v.0.2.1) (https://github.com/walaj/svaba), Manta (v1.4.0) (https://github.com/Illumina/manta), Lumpy (v.0.2.13) (https://github.com/arq5x/lumpy-sv), SplazerS (v.1.1) (https://github.com/seqan/seqan/tree/master/apps/splazers), Ensembl (v.93) (https://www.ensembl.org), COSMIC (v.86) (https://cancer.sanger.ac.uk), COSMIC Cancer Gene Consensus (v.95) (https://cancer.sanger.ac.uk/census), ClinVar (201706) (https://www.ncbi.nlm.nih.gov/clinvar/), PolyPhen (v.2.2.2) (http://genetics.bwh.harvard.edu/pph2/index.shtml), SIFT (v.5.2.2) (http://sift-dna.org/sift4g), FATHMM (v.2.1) (http://fathmm.biocompute.org.uk), gnomAD (r.2.0.1) (https://gnomad.broadinstitute.org/), gnomAD-SV (v2.0.1) (https://gnomad.broadinstitute.org/, https://github.com/talkowski-lab/gnomad-sv-pipeline), dbSNP (v.150) (https://www.ncbi.nlm.nih.gov/snp/), Variant Effect Predictor (VEP) (v.93.2) (http://www.ensembl.org/vep), Database of Genomic Variants (DGV) (2020-02-25 release) (http://dgv.tcag.ca/), AscatNGS (v.4.2.1) (https://github.com/cancerit/ascatNgs), Sequenza (v.3.0.0) (http://www.cbs.dtu.dk/biotools/sequenza), LICHeE (v1.0) (https://github.com/viq854/lichee), fragCounter (https://github.com/mskilab/fragCounter), dryclean (commit bda8065) (https://github.com/mskilab/dryclean), RepeatMasker (created in 2010 with the original RepBase library from 2010-03-02 and RepeatMasker 3.0.1) (https://www.repeatmasker.org/species/hg.html). Scanpy (v.1.9.6) (https://github.com/scverse/scanpy), GSEApy (v.1.1.1) (https://github.com/zqfang/GSEApy), CycleViz (v.0.1.5) (https://github.com/AmpliconSuite/CycleViz) and CellRanger (v.7.1.0) (https://github.com/10XGenomics/cellranger). Custom analysis scripts and scripts to reproduce figures are available at GitHub (https://github.com/nygenome/UrothelialCancer_WGS_paper_figures). The JaBbA SV browser includes detailed interactive maps of our structure variant calls (https://urothelial-cancer-wcm-2023.nygenome.org/). Image Lab (Bio-Rad v6.1.0) (https://www.bio-rad.com/) was used for western blot image processing and analysis. CytoVision (v.7.3.1) (https://www.leicabiosystems.com/) was used for FISH imaging. Zeiss deconvolution software (Zen desk v.3.7) (https://www.zeiss.com/microscopy/en/products/software/zeiss-zen-desk.html), Fiji ImageJ (v.154f) (https://imagej.net/software/fiji/) and GraphPad Prism (v.10.2.0) (https://www.graphpad.com/) were used for immunofluorescence image processing and analysis. Incucyte software (2022B, Rev2) (https://www.sartorius.com) was used for competitive assays. FlowJo (v.10.10.0) (https://www.flowjo.com/) was used for the analysis of FACS data. R (v.4.0.0) software was used for statistical tests." - start_time = time.time() - result = process_paper(sample_text["paragraph"]) - end_time = time.time() - print(f"Extracted and validated data: {json.dumps(result, indent=2)}") - print("total execution time: ", end_time-start_time) + # sample_text = { + # "DOI": "https://doi.org/10.1126/scitranslmed.adg5252", + # "header": "Data and materials availability:", + # "paragraph": "All data associated with this study are in the paper or Supplementary Materials. Sequencing data have been deposited in Genome Expression Omnibus (GEO) under accession number GSE235743 (www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE235743). Scripts for reproducing the analysis can be accessed via Zenodo (DOI 10.5281/zenodo.8011746).", + # "PubMed_ID": "37878672", + # "journal": "Science Translational Medicine" + # } + + + files = ["test_file_input.json"] + + for file in files: + print("Filename:", file) + with open(file, 'r', encoding='utf-8') as f: + data = json.load(f) + + results = [] + for entry in data: + doi = entry.get("DOI", "") + sample_text = entry.get("paragraph", "") + pmid = entry.get("PubMed_ID", "") + + start_time = time.time() + result = process_paper(sample_text) + end_time = time.time() + + results.append({ + "DOI": doi, + "PubMed_ID": pmid, + "results": result + }) + + print(f"Extracted and validated data: {results}") + print("Total execution time: ", end_time-start_time) + + output_path = "test_file_output.json" + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(results, f, indent=4) + + + + # result = process_paper(sample_text["paragraph"]) + # end_time = time.time() + # print(f"Extracted and validated data: {json.dumps(result, indent=2)}") + # print("total execution time: ", end_time-start_time)