-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfastq2count.cellline.GSE248558.txt
151 lines (124 loc) · 4.34 KB
/
fastq2count.cellline.GSE248558.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#Download 'GSE248558'
#first install anaconda if it in not (update the link for updated versions)-----------------
#wget https://repo.anaconda.com/archive/Anaconda3-2023.09-0-Linux-x86_64.sh
#bash Anaconda3-2023.09-0-Linux-x86_64.sh
#creat new env and name it---
#conda create -n env2023 python=3.11
conda activate env2023
#install if not --------------------
#conda install -c bioconda samtools=2.35
#conda install -c bioconda cutadapt=3.5
#conda install -c bioconda fastqc=0.12.1
#conda install -c bioconda star=2.5.2b
#conda install -c bioconda rsem=1.3.3 #don't
#conda install -c bioconda subread=2.0.1 #don't
#conda install -c bioconda multiqc=2.35
#fastqc-----------------------
conda activate env2023
cd data
mkdir FASTQC
output=FASTQC
for file in *.gz
do
fastqc -f fastq -o ${output} ${file}
echo ${file}
done
#multiQC-----------
cd
cd data
multiqc FASTQC/.
#cutadapt-------------------------
#https://cutadapt.readthedocs.io/en/v2.10/guide.html#filtering
conda activate env2023
cd data
mkdir Cutadapt
ls *_L003_R1.fastq.gz > samples.txt
cat samples.txt | rev | cut -c18- | rev > samples1.txt
cat samples1.txt > samples.txt
rm samples1.txt
samples=$(cat samples.txt)
output=Cutadapt/
for file in $samples
do
cutadapt -j 0 -u 12 -U 12 -q 20 -m 10:10 \
-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \
-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT \
-b CTGTCTCTTATACACATCT \
-B CTGTCTCTTATACACATCT \
-b AGATGTGTATAAGAGACAG \
-B AGATGTGTATAAGAGACAG \
-g AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT \
-G AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT \
-g GATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
-G GATCGGAAGAGCACACGTCTGAACTCCAGTCAC \
-pair-filter=both \
-o "${output}${file}".trim.R1.fastq \
-p "${output}${file}".trim.R2.fastq \
"${file}"_L003_R1.fastq.gz "${file}"_L003_R2.fastq.gz
done
#fastqc after trimming-----------------------
conda activate env2023
cd data
mkdir Cutadapt/FASTQCresults
samples=$(cat samples.txt)
output=Cutadapt/FASTQCresults/
for file in $samples
do
fastqc Cutadapt/"${file}".trim.R1.fastq Cutadapt/"${file}".trim.R2.fastq -o "${output}"
done
#multiQC-----------
conda activate env2023
cd data
multiqc Cutadapt/FASTQCresults/.
# mapping with STAR-------------------
#https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf
conda activate env2023
cd data
mkdir genome
cd genome
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh38.primary_assembly.genome.fa.gz
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.primary_assembly.annotation.gtf.gz
gunzip GRCh38.primary_assembly.genome.fa.gz
gunzip gencode.v44.primary_assembly.annotation.gtf.gz
mkdir -p genomeDir
cd ..
#Creating a genome index
STAR --runThreadN 12 \
--runMode genomeGenerate \
--genomeDir genome/genomeDir \
--genomeFastaFiles genome/GRCh38.primary_assembly.genome.fa \
--sjdbGTFfile genome/gencode.v44.primary_assembly.annotation.gtf \
--sjdbOverhang 138 #max lenght of reads -1
#Aligning reads--------------------------
#https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf
cd data
mkdir STAR
samples=$(cat samples.txt)
for file in $samples
do
STAR --genomeDir genome/genomeDir \
--runThreadN 12 \
--readFilesIn Cutadapt/${file}.trim.R1.fastq Cutadapt/${file}.trim.R2.fastq \
--outFileNamePrefix STAR/${file} \
--outSAMtype None \
--quantMode GeneCounts
done
#Make count data -------------------------------
cd data
sudo apt install moreutils
sudo apt install parallel
cd STAR
ls -1 *ReadsPerGene.out.tab | parallel 'cat {} | sed '1d' | cut -f2 {} > {/.}_clean.txt'
ls -1 *ReadsPerGene.out.tab | head -1 | xargs cut -f1 > genes.txt
paste genes.txt *_clean.txt > output_unstranded.txt
ls -1 *ReadsPerGene.out.tab | parallel 'cat {} | sed '1d' | cut -f3 {} > {/.}_clean.txt'
ls -1 *ReadsPerGene.out.tab | head -1 | xargs cut -f1 > genes.txt
paste genes.txt *_clean.txt > output_stranded.txt
ls -1 *ReadsPerGene.out.tab | parallel 'cat {} | sed '1d' | cut -f4 {} > {/.}_clean.txt'
ls -1 *ReadsPerGene.out.tab | head -1 | xargs cut -f1 > genes.txt
paste genes.txt *_clean.txt > output_Reverse_stranded.txt
ls -1 *ReadsPerGene.out.tab >Colnames.txt
#Quality check with MultiQC---------------
cd data
cd STAR
multiqc .