update examples

chaochungkuo · Feb 20, 2024 · 22817f7 · 22817f7
1 parent e416a53
commit 22817f7
Showing 4 changed files with 72 additions and 37 deletions.
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
@@ -5,4 +5,5 @@ Usage Examples
 Here is a list of some possible usage cases with GenomKit.
 
 .. toctree::
+   examples_fastq
    examples_bed
diff --git a/docs/source/examples_fastq.rst b/docs/source/examples_fastq.rst
@@ -5,8 +5,11 @@ Examples with FASTQ files
 Trimming a FASTQ file for both sequences and quality
 ----------------------------------------------------
 
-Sometimes you might want to trim the reads, for example, removing UMIs.
+Sometimes you might want to trim the reads, for example, removing UMIs and adaptor from the first 20 bp in R2.
 
 .. code-block:: python
 
-    
+    from genomkit import GSequences
+    fastq = GSequences(name="R2", load=FASTQ_R2)
+    fastq.trim(start=20, end=0)
+    fastq.write_FASTQ(filename="R2_trimmed.fastq", gz=False)
diff --git a/genomkit/sequences/gsequences.py b/genomkit/sequences/gsequences.py
@@ -1,5 +1,6 @@
 from .io import load_FASTA, load_FASTA_from_file, \
-                load_FASTQ, load_FASTQ_from_file
+                load_FASTQ, load_FASTQ_from_file, \
+                write_FASTA, write_FASTQ
 import gzip
 
 
@@ -128,13 +129,10 @@ def get_sequence(self, name, start, end):
             if seq.name == name:
                 return seq.slice_sequence(start, end)
 
-    def write_fasta(self, filename: str, data: bool = False):
-        with open(filename, "w") as fasta_file:
-            for seq in self.elements:
-                if data:
-                    fasta_file.write(">" + seq.name + " " +
-                                     " ".join(seq.data) + "\n")
-                else:
-                    fasta_file.write(f">{seq.name}\n")
-                for i in range(0, len(seq.sequence), 80):
-                    fasta_file.write(f"{seq.sequence[i:i+80]}\n")
+    def write_FASTA(self, filename: str, data: bool = False,
+                    gz: bool = True):
+        write_FASTA(seqs=self, filename=filename, data=data, gz=gz)
+
+    def write_FASTQ(self, filename: str, data: bool = False,
+                    gz: bool = True):
+        write_FASTQ(seqs=self, filename=filename, data=data, gz=gz)
diff --git a/genomkit/sequences/io.py b/genomkit/sequences/io.py
@@ -1,4 +1,5 @@
 from tqdm import tqdm
+import gzip
 
 
 ###########################################################################
@@ -64,49 +65,81 @@ def load_FASTQ_from_file(file):
     current_sequence_id = None
     current_sequence = ""
     current_quality = ""
-    total_lines = sum(1 for line in file if not line.startswith("#"))
+    total_records = sum(1 for _ in file) // 4  # Calculate total records
     file.seek(0)  # Reset file pointer to the beginning
-    with tqdm(total=total_lines) as pbar:
-        for line in file:
+
+    with tqdm(total=total_records) as pbar:
+        for line_num, line in enumerate(file):
             line = line.strip()
             if line.startswith("#"):
                 continue
-            elif line.startswith("@"):  # FASTQ header line
+            elif line_num % 4 == 0:  # Sequence header line
                 # If there was a previously stored sequence, store it
                 if current_sequence_id is not None:
                     if len(current_sequence) != len(current_quality):
                         raise ValueError("Invalid FASTQ file: Sequence and "
                                          "quality lines do not match.")
-                    elif len(current_quality) == len(current_sequence):
-                        infos = current_sequence_id.split()
-                        name = infos[0]
-                        data = infos[1:]
-                        res.add(GSequence(sequence=current_sequence,
-                                          quality=current_quality,
-                                          name=name, data=data))
+                    res.add(GSequence(sequence=current_sequence,
+                                      quality=current_quality,
+                                      name=current_sequence_id))
+                    pbar.update(1)  # Update progress bar
                 # Extract the sequence ID
                 current_sequence_id = line[1:]
                 # Start new sequence and quality strings
                 current_sequence = ""
                 current_quality = ""
-            elif current_sequence_id and not current_sequence:
+            elif line_num % 4 == 1:  # Sequence line
                 current_sequence = line
-            elif line.startswith("+"):  # FASTQ quality header line
-                continue
-            elif current_sequence_id and current_sequence and \
-                    not current_quality:
+            elif line_num % 4 == 3:  # Quality scores line
                 current_quality = line
-            pbar.update(1)
         # Store the last sequence
         if current_sequence_id is not None:
             if len(current_sequence) != len(current_quality):
                 raise ValueError("Invalid FASTQ file: Sequence and quality "
                                  "lines do not match.")
-            elif len(current_quality) == len(current_sequence):
-                infos = current_sequence_id.split()
-                name = infos[0]
-                data = infos[1:]
-                res.add(GSequence(sequence=current_sequence,
-                                  quality=current_quality,
-                                  name=name, data=data))
+            res.add(GSequence(sequence=current_sequence,
+                              quality=current_quality,
+                              name=current_sequence_id))
+            pbar.update(1)  # Update progress bar
     return res
+
+
+def write_FASTA(seqs, filename: str, data: bool = False,
+                gz: bool = True):
+    if gz:
+        with gzip.open(filename, "wt") as fasta_file:
+            write_fasta_content(seqs, fasta_file, data)
+    else:
+        with open(filename, "w") as fasta_file:
+            write_fasta_content(seqs, fasta_file, data)
+
+
+def write_fasta_content(seqs, fasta_file, data: bool):
+    for seq in seqs.elements:
+        if data:
+            fasta_file.write(">" + seq.name + " " +
+                             " ".join(seq.data) + "\n")
+        else:
+            fasta_file.write(f">{seq.name}\n")
+        for i in range(0, len(seq.sequence), 80):
+            fasta_file.write(f"{seq.sequence[i:i+80]}\n")
+
+
+def write_FASTQ(seqs, filename: str, data: bool = False, gz: bool = True):
+    if gz:
+        with gzip.open(filename, "wt") as fastq_file:
+            write_fastq_content(seqs, fastq_file, data)
+    else:
+        with open(filename, "w") as fastq_file:
+            write_fastq_content(seqs, fastq_file, data)
+
+
+def write_fastq_content(seqs, fastq_file, data: bool):
+    for seq in seqs.elements:
+        if data:
+            fastq_file.write(f"@{seq.name} {' '.join(seq.data)}\n")
+        else:
+            fastq_file.write(f"@{seq.name}\n")
+        fastq_file.write(f"{seq.sequence}\n")
+        fastq_file.write("+\n")
+        fastq_file.write(f"{seq.quality}\n")