-
Notifications
You must be signed in to change notification settings - Fork 2
/
gene.expansion.codes.txt
204 lines (165 loc) · 6.59 KB
/
gene.expansion.codes.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#Install Maker2
#Register and download Maker2 from https://www.yandell-lab.org/software/maker.html.
# Go to directory “maker/src” of downloaded package
# Run the command below to configure
perl Build.PL
# Install maker2 with the following command
./Build install
#Install BUSCO
# Download the package
git clone https://gitlab.com/ezlab/busco.git
# Install
cd busco/
sudo python3 setup.py install
# Check installation
busco -h
#Install RepeatMasker
# download repeatMasker from http://www.repeatmasker.org
# download RepBase repeat libraries from https://www.girinst.org/
# Unpack the package of RepeatMasker
cp <RepeatMasker.tar.gz> <path_to_install_RepeatMasker>
cd <path_to_install_RepeatMasker>
gunzip <RepeatMasker.tar.gz>
tar xvf <RepeatMasker.tar>
# Unpack RepBase library
cp <RepBaseRepeat.tar.gz> <path_to_install_RepeatMasker>/RepeatMasker
cd <path_to_install_RepeatMasker>/RepeatMasker
gunzip <RepBaseRepeat.tar.gz>
tar xvf <RepBaseRepeat.tar>
# Set up by running the following command
perl ./configure
#Install RepeatModeler
# download the package from http://www.repeatmasker.org/RepeatModeler/
# unpack the package of RepeatModeler
tar -zxvf <RepeatModeler-open-#.#.#.tar.gz>
# go into the folder and configure it
perl ./configure
#Install CAFE5
# Download the package from https://github.com/hahnlab/CAFE5
# Go into the folder and run following commands to install
./configure
make
# Install GeneWise
# Download the package from ftp://ftp.ebi.ac.uk/pub/software/unix/wise2/
# Binaries are in src/bin after make
cd <path_wise2>/src
make all
#Install Apollo
# run the command below to install
git clone https://github.com/GMOD/Apollo.git Apollo
# See https://genomearchitect.readthedocs.io/en/latest/Apollo2Build.html for details
#Install STAR
# Download STAR from https://github.com/alexdobin/STAR
cd <path_STAR>
make STAR
# RepeatModeler: Construct species-specific repetitive elements
# Input: genomic sequences
# Output: file “consensi.fa.classified”, containing the repetitive sequences
# path_RM: path of RepeatModeler
# -pa N: how many cores to run
# -engine ncbi: refers to blast program for alignment
<path_RM>/BuildDatabase -name seqfiledb -engine ncbi <genome.fa>
<path_RM>/RepeatModeler -database seqfiledb -pa N > seqfile.out
#Augustus training
# Input: genome assembly
# Output: trained model
# --long: performs full optimization for Augustus training
python <directory_of_BUSCO>/BUSCO.py --cpu <number_thread> --in <genome_assembly>.fa --out <output_name> --lineage <directory_BUSCO_lineage_data> --mode genome --long
#SNAP training
# (a) Generate MAKER control files
# Generate three files with suffix “.ctl”, through which to provide user input
maker -CTL
# (b) Edit maker_opts.ctl file to provide input parameters
genome=<genome_assembly.fa>
organism_type=<eukaryotic|prokaryotic>
# Expressed sequence tags (ESTs) or assembled mRNA
est=<transcript_evidence.fa>
# Protein sequences from other organisms (e.g., UniProt)
protein=<protein.fa>
# Gene prediction method
# (1st round of training derives gene models from EST or protein evidence)
est2genome=1 | protein2genome=1
# (c) Run MAKER
# Run on a single processor by “maker” or on “N” processors by “mpirun -n”
maker | mpirun -n N maker
# (d) Collect annotation result and merge into a single file
cd <maker.output>
gff3_merge -d <genome_datastore_index.log> -g
# (e) Make a directory for the training
mkdir <snapTrain1>
cd <snapTrain1>
# (f) Generate files required for training
# Generate <genome.ann>, <genome.dna> required to train SNAP
maker2zff <../all.gff>
# “fathom” separates annotation into categories
# uni: single gene per sequence
# alt: genes with alternative splicing
# olp: genes overlap others
# err: genes with errors
# wrn: genes with warnings
fathom -categorize 1000 <genome.ann> <genome.dna>
# “fathom” exports the genes
# Generate export.aa, export.ann, export.dna, export.txt
fathom -export 1000 uni.ann uni.dna
# (g) Generate new parameters
mkdir params
cd params
forge ../export.ann ../export.dna
cd ..
# (h) Generate new HMM
hmm-assembler.pl <genome> params > <genome.hmm>
cd ..
# (i) Update maker_opts.ctl & retrain the model from step (c) to (h)
snaphmm=<genome.hmm>
est2genome=0
protein2genome=0
#Maker2: gene structure annotation
# (a) Generate MAKER control files
# Generate three files with suffix “.ctl”, through which to provide user input
maker -CTL
# (b) Edit maker_opts.ctl file to provide input parameters
genome=<genome_assembly.fa>
organism_type=<eukaryotic|prokaryotic>
# Expressed sequence tags (ESTs) or assembled mRNA
est=<transcript_evidence.fa>
# Protein sequences from other organisms (e.g., UniProt)
protein=<protein.fa>
# Gene prediction models
snaphmm=<SNAP_trained_model>
augustus_species=<augustus_trained_model>
# (c) Run MAKER
# Run on a single processor by “maker” or on “N” processors by “mpirun -n”
maker | mpirun -n N maker
# (d) Collect annotation result and merge into a single file
cd <maker.output>
gff3_merge -d <genome_datastore_index.log> -g
#BUSCO: measure completeness of genome assembly or annotated transcripts/proteins
# Input: genome sequence or protein sequence to be measured
# Output: completeness of the input with respect to near-universal single-copy orthologs
# -i: input file, either a nucleotide fasta file or a protein fasta file
# -l: lineage dataset
# -o: folder to save results
# -m: assessment mode (i.e., genome, protein, transcriptome)
busco -i <DNA.fa|protein.fa> -l <lineage> -o <output> -m <mode>
#bedtools: get genomic sequence around a predicted gene (used for GeneWise)
# Input: genome sequence, gene coordinate(±5kb) in bed format
# Output: genome sequence in gene coordinate(±5kb)
# -fi: the genomic sequence
# -bed: gene coordinate in bed format (extend to upstream/downstream 5kb)
# -s: force strandedness. Return reverse complement if the gene is on the antisense strand.
# -name: use “name” column in bed file as fasta headers of output
bedtools getfasta -fi <genomic.fa> -bed <geneCoordinate.bed> -s -name > genomic-region.fa
#GeneWise: identify pseudogenes
# Input: protein sequence of homolog, DNA sequence of genomic region of a predicted gene copy
# Output: report mutations that pseudogenized the gene copy
# -sum: show summary output
# -pretty: show pretty ascii output
# -pseudo: mark genes with frameshifts as pseudo genes
# -genes: show gene structure
# -cdna: show predicted cDNA sequence
# -trans: show protein translation
# -pep: show predicted peptide
# -para: show parameters
# -both: check both strands
# -quiet: no report on stderr
genewise <protein.fa> <genomic-region.fa> -sum -pretty -pseudo -genes -cdna -trans -pep -para -both -quiet > out.gw