update examples

chaochungkuo · Feb 20, 2024 · e416a53 · e416a53
1 parent c0cbcbf
commit e416a53
Showing 4 changed files with 143 additions and 119 deletions.
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
@@ -1,80 +1,8 @@
-========
-Examples
-========
+==============
+Usage Examples
+==============
 
-Extract exon, intron, and intergenetic regions in BED format from a GTF file
-----------------------------------------------------------------------------
+Here is a list of some possible usage cases with GenomKit.
 
-``GTF_hg38`` is the path to the hg38 GTF file for annotation. Now you want to generate 3 BED files as below:
-
-- hg38_exons.bed
-- hg38_introns.bed
-- hg38_intergenic_regions.bed
-
-.. code-block:: python
-
-    from genomkit import GRegions
-    from genomkit import GAnnotation
-
-    gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")
-    genes = gtf.get_regions(element_type="gene")
-    exons = gtf.get_regions(element_type="exon")
-    introns = genes.subtract(exons, inplace=False)
-
-    chromosomes = GRegions(name="chromosomes")
-    chromosomes.get_chromosomes(organism="hg38")
-    intergenic_regions = chromosomes.subtract(genes, inplace=False)
-    exons.write(filename="hg38_exons.bed")
-    introns.write(filename="hg38_introns.bed")
-    intergenic_regions.write(filename="hg38_intergenic_regions.bed")
-
-
-Get all promoter regions in BED format from a GTF file
-------------------------------------------------------
-
-``GTF_hg38`` is the path to the hg38 GTF file for annotation. Now you want to generate 3 BED files as below:
-
-.. code-block:: python
-
-    from genomkit import GAnnotation
-
-    gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")
-    genes = gtf.get_regions(element_type="gene")
-    promoters = genes.resize(extend_upstream=2000,
-                            extend_downstream=0,
-                            center="5prime", inplace=False)
-    promoters.write(filename="hg38_promoters.bed")
-
-Extract the genes by their biotypes from a GTF file
----------------------------------------------------
-
-``GTF_hg38`` is the path to the hg38 GTF file for annotation. Now you want to generate BED files for the biotypes below:
-
-- protein_coding
-- lncRNA
-- snRNA
-- miRNA
-
-.. code-block:: python
-
-    from genomkit import GAnnotation
-
-    gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")
-    target_biotypes = ["protein_coding", "lncRNA", "snRNA", "miRNA"]
-    for biotype in target_biotypes:
-        genes = gtf.get_regions(element_type="gene",
-                                attribute="gene_type", value=biotype)
-        genes.write(filename="hg38_genes_"+biotype+".bed")
-
-Get the sequences in FASTA format from a BED file
--------------------------------------------------
-
-For example, you have a BED file for all the exons from hg38. Now you want to get their sequences in FASTA format with care of the orientation of the strands.
-
-.. code-block:: python
-
-    from genomkit import GRegions
-
-    exons = GRegions(name="exons", load="hg38_exons.bed")
-    exon_seqs = exons.get_GSequences(FASTA_file=FASTA_hg38)
-    exon_seqs.write_fasta(filename="hg38_exons.fasta")
+.. toctree::
+   examples_bed
diff --git a/docs/source/examples_bed.rst b/docs/source/examples_bed.rst
@@ -0,0 +1,80 @@
+=======================
+Examples with BED files
+=======================
+
+Extract exon, intron, and intergenetic regions in BED format from a GTF file
+----------------------------------------------------------------------------
+
+``GTF_hg38`` is the path to the hg38 GTF file for annotation. Now you want to generate 3 BED files as below:
+
+- hg38_exons.bed
+- hg38_introns.bed
+- hg38_intergenic_regions.bed
+
+.. code-block:: python
+
+    from genomkit import GRegions
+    from genomkit import GAnnotation
+
+    gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")
+    genes = gtf.get_regions(element_type="gene")
+    exons = gtf.get_regions(element_type="exon")
+    introns = genes.subtract(exons, inplace=False)
+
+    chromosomes = GRegions(name="chromosomes")
+    chromosomes.get_chromosomes(organism="hg38")
+    intergenic_regions = chromosomes.subtract(genes, inplace=False)
+    exons.write(filename="hg38_exons.bed")
+    introns.write(filename="hg38_introns.bed")
+    intergenic_regions.write(filename="hg38_intergenic_regions.bed")
+
+
+Get all promoter regions in BED format from a GTF file
+------------------------------------------------------
+
+``GTF_hg38`` is the path to the hg38 GTF file for annotation. Now you want to generate 3 BED files as below:
+
+.. code-block:: python
+
+    from genomkit import GAnnotation
+
+    gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")
+    genes = gtf.get_regions(element_type="gene")
+    promoters = genes.resize(extend_upstream=2000,
+                            extend_downstream=0,
+                            center="5prime", inplace=False)
+    promoters.write(filename="hg38_promoters.bed")
+
+Extract the genes by their biotypes from a GTF file
+---------------------------------------------------
+
+``GTF_hg38`` is the path to the hg38 GTF file for annotation. Now you want to generate BED files for the biotypes below:
+
+- protein_coding
+- lncRNA
+- snRNA
+- miRNA
+
+.. code-block:: python
+
+    from genomkit import GAnnotation
+
+    gtf = GAnnotation(file_path=GTF_hg38, file_format="gtf")
+    target_biotypes = ["protein_coding", "lncRNA", "snRNA", "miRNA"]
+    for biotype in target_biotypes:
+        genes = gtf.get_regions(element_type="gene",
+                                attribute="gene_type", value=biotype)
+        genes.write(filename="hg38_genes_"+biotype+".bed")
+
+Get the sequences in FASTA format from a BED file
+-------------------------------------------------
+
+For example, you have a BED file for all the exons from hg38. Now you want to get their sequences in FASTA format with care of the orientation of the strands.
+
+.. code-block:: python
+
+    from genomkit import GRegions
+
+    exons = GRegions(name="exons", load="hg38_exons.bed")
+    exon_seqs = exons.get_GSequences(FASTA_file=FASTA_hg38)
+    exon_seqs.write_fasta(filename="hg38_exons.fasta")
diff --git a/docs/source/examples_fastq.rst b/docs/source/examples_fastq.rst
@@ -0,0 +1,12 @@
+=========================
+Examples with FASTQ files
+=========================
+
+Trimming a FASTQ file for both sequences and quality
+----------------------------------------------------
+
+Sometimes you might want to trim the reads, for example, removing UMIs.
+
+.. code-block:: python
+
+    
diff --git a/genomkit/sequences/io.py b/genomkit/sequences/io.py
@@ -64,45 +64,49 @@ def load_FASTQ_from_file(file):
     current_sequence_id = None
     current_sequence = ""
     current_quality = ""
-    for line in file:
-        line = line.strip()
-        if line.startswith("#"):
-            continue
-        elif line.startswith("@"):  # FASTQ header line
-            # If there was a previously stored sequence, store it
-            if current_sequence_id is not None:
-                if len(current_sequence) != len(current_quality):
-                    raise ValueError("Invalid FASTQ file: Sequence and "
-                                     "quality lines do not match.")
-                elif len(current_quality) == len(current_sequence):
-                    infos = current_sequence_id.split()
-                    name = infos[0]
-                    data = infos[1:]
-                    res.add(GSequence(sequence=current_sequence,
-                                      quality=current_quality,
-                                      name=name, data=data))
-            # Extract the sequence ID
-            current_sequence_id = line[1:]
-            # Start new sequence and quality strings
-            current_sequence = ""
-            current_quality = ""
-        elif current_sequence_id and not current_sequence:
-            current_sequence = line
-        elif line.startswith("+"):  # FASTQ quality header line
-            continue
-        elif current_sequence_id and current_sequence and \
-                not current_quality:
-            current_quality = line
-    # Store the last sequence
-    if current_sequence_id is not None:
-        if len(current_sequence) != len(current_quality):
-            raise ValueError("Invalid FASTQ file: Sequence and quality "
-                             "lines do not match.")
-        elif len(current_quality) == len(current_sequence):
-            infos = current_sequence_id.split()
-            name = infos[0]
-            data = infos[1:]
-            res.add(GSequence(sequence=current_sequence,
-                              quality=current_quality,
-                              name=name, data=data))
+    total_lines = sum(1 for line in file if not line.startswith("#"))
+    file.seek(0)  # Reset file pointer to the beginning
+    with tqdm(total=total_lines) as pbar:
+        for line in file:
+            line = line.strip()
+            if line.startswith("#"):
+                continue
+            elif line.startswith("@"):  # FASTQ header line
+                # If there was a previously stored sequence, store it
+                if current_sequence_id is not None:
+                    if len(current_sequence) != len(current_quality):
+                        raise ValueError("Invalid FASTQ file: Sequence and "
+                                         "quality lines do not match.")
+                    elif len(current_quality) == len(current_sequence):
+                        infos = current_sequence_id.split()
+                        name = infos[0]
+                        data = infos[1:]
+                        res.add(GSequence(sequence=current_sequence,
+                                          quality=current_quality,
+                                          name=name, data=data))
+                # Extract the sequence ID
+                current_sequence_id = line[1:]
+                # Start new sequence and quality strings
+                current_sequence = ""
+                current_quality = ""
+            elif current_sequence_id and not current_sequence:
+                current_sequence = line
+            elif line.startswith("+"):  # FASTQ quality header line
+                continue
+            elif current_sequence_id and current_sequence and \
+                    not current_quality:
+                current_quality = line
+            pbar.update(1)
+        # Store the last sequence
+        if current_sequence_id is not None:
+            if len(current_sequence) != len(current_quality):
+                raise ValueError("Invalid FASTQ file: Sequence and quality "
+                                 "lines do not match.")
+            elif len(current_quality) == len(current_sequence):
+                infos = current_sequence_id.split()
+                name = infos[0]
+                data = infos[1:]
+                res.add(GSequence(sequence=current_sequence,
+                                  quality=current_quality,
+                                  name=name, data=data))
     return res