Skip to content

Commit

Permalink
update examples
Browse files Browse the repository at this point in the history
chaochungkuo committed Feb 20, 2024
1 parent e416a53 commit 22817f7
Showing 4 changed files with 72 additions and 37 deletions.
1 change: 1 addition & 0 deletions docs/source/examples.rst
Original file line number Diff line number Diff line change
@@ -5,4 +5,5 @@ Usage Examples
Here is a list of some possible usage cases with GenomKit.

.. toctree::
examples_fastq
examples_bed
7 changes: 5 additions & 2 deletions docs/source/examples_fastq.rst
Original file line number Diff line number Diff line change
@@ -5,8 +5,11 @@ Examples with FASTQ files
Trimming a FASTQ file for both sequences and quality
----------------------------------------------------

Sometimes you might want to trim the reads, for example, removing UMIs.
Sometimes you might want to trim the reads, for example, removing UMIs and adaptor from the first 20 bp in R2.

.. code-block:: python
from genomkit import GSequences
fastq = GSequences(name="R2", load=FASTQ_R2)
fastq.trim(start=20, end=0)
fastq.write_FASTQ(filename="R2_trimmed.fastq", gz=False)
20 changes: 9 additions & 11 deletions genomkit/sequences/gsequences.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .io import load_FASTA, load_FASTA_from_file, \
load_FASTQ, load_FASTQ_from_file
load_FASTQ, load_FASTQ_from_file, \
write_FASTA, write_FASTQ
import gzip


@@ -128,13 +129,10 @@ def get_sequence(self, name, start, end):
if seq.name == name:
return seq.slice_sequence(start, end)

def write_fasta(self, filename: str, data: bool = False):
with open(filename, "w") as fasta_file:
for seq in self.elements:
if data:
fasta_file.write(">" + seq.name + " " +
" ".join(seq.data) + "\n")
else:
fasta_file.write(f">{seq.name}\n")
for i in range(0, len(seq.sequence), 80):
fasta_file.write(f"{seq.sequence[i:i+80]}\n")
def write_FASTA(self, filename: str, data: bool = False,
gz: bool = True):
write_FASTA(seqs=self, filename=filename, data=data, gz=gz)

def write_FASTQ(self, filename: str, data: bool = False,
gz: bool = True):
write_FASTQ(seqs=self, filename=filename, data=data, gz=gz)
81 changes: 57 additions & 24 deletions genomkit/sequences/io.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from tqdm import tqdm
import gzip


###########################################################################
@@ -64,49 +65,81 @@ def load_FASTQ_from_file(file):
current_sequence_id = None
current_sequence = ""
current_quality = ""
total_lines = sum(1 for line in file if not line.startswith("#"))
total_records = sum(1 for _ in file) // 4 # Calculate total records
file.seek(0) # Reset file pointer to the beginning
with tqdm(total=total_lines) as pbar:
for line in file:

with tqdm(total=total_records) as pbar:
for line_num, line in enumerate(file):
line = line.strip()
if line.startswith("#"):
continue
elif line.startswith("@"): # FASTQ header line
elif line_num % 4 == 0: # Sequence header line
# If there was a previously stored sequence, store it
if current_sequence_id is not None:
if len(current_sequence) != len(current_quality):
raise ValueError("Invalid FASTQ file: Sequence and "
"quality lines do not match.")
elif len(current_quality) == len(current_sequence):
infos = current_sequence_id.split()
name = infos[0]
data = infos[1:]
res.add(GSequence(sequence=current_sequence,
quality=current_quality,
name=name, data=data))
res.add(GSequence(sequence=current_sequence,
quality=current_quality,
name=current_sequence_id))
pbar.update(1) # Update progress bar
# Extract the sequence ID
current_sequence_id = line[1:]
# Start new sequence and quality strings
current_sequence = ""
current_quality = ""
elif current_sequence_id and not current_sequence:
elif line_num % 4 == 1: # Sequence line
current_sequence = line
elif line.startswith("+"): # FASTQ quality header line
continue
elif current_sequence_id and current_sequence and \
not current_quality:
elif line_num % 4 == 3: # Quality scores line
current_quality = line
pbar.update(1)
# Store the last sequence
if current_sequence_id is not None:
if len(current_sequence) != len(current_quality):
raise ValueError("Invalid FASTQ file: Sequence and quality "
"lines do not match.")
elif len(current_quality) == len(current_sequence):
infos = current_sequence_id.split()
name = infos[0]
data = infos[1:]
res.add(GSequence(sequence=current_sequence,
quality=current_quality,
name=name, data=data))
res.add(GSequence(sequence=current_sequence,
quality=current_quality,
name=current_sequence_id))
pbar.update(1) # Update progress bar
return res


def write_FASTA(seqs, filename: str, data: bool = False,
gz: bool = True):
if gz:
with gzip.open(filename, "wt") as fasta_file:
write_fasta_content(seqs, fasta_file, data)
else:
with open(filename, "w") as fasta_file:
write_fasta_content(seqs, fasta_file, data)


def write_fasta_content(seqs, fasta_file, data: bool):
for seq in seqs.elements:
if data:
fasta_file.write(">" + seq.name + " " +
" ".join(seq.data) + "\n")
else:
fasta_file.write(f">{seq.name}\n")
for i in range(0, len(seq.sequence), 80):
fasta_file.write(f"{seq.sequence[i:i+80]}\n")


def write_FASTQ(seqs, filename: str, data: bool = False, gz: bool = True):
if gz:
with gzip.open(filename, "wt") as fastq_file:
write_fastq_content(seqs, fastq_file, data)
else:
with open(filename, "w") as fastq_file:
write_fastq_content(seqs, fastq_file, data)


def write_fastq_content(seqs, fastq_file, data: bool):
for seq in seqs.elements:
if data:
fastq_file.write(f"@{seq.name} {' '.join(seq.data)}\n")
else:
fastq_file.write(f"@{seq.name}\n")
fastq_file.write(f"{seq.sequence}\n")
fastq_file.write("+\n")
fastq_file.write(f"{seq.quality}\n")

0 comments on commit 22817f7

Please sign in to comment.