-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconfig.yaml
214 lines (194 loc) · 9.95 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
---
# ============================================================================
# INPUT/OUTPUT RELATED PATHS - DEFAULT VALUES PROVIDED
# *** Note: Make sure these paths are within the cloned RCRUNCH repo
# OUTPUT
output_dir: 'results'
# directory that receives the output
local_log: 'logs/local_log'
cluster_log: 'logs/cluster_log'
# log locations (the names suggest local vs. cluster execution -
# confirm against the workflow)
#____________ SCHEMATIC OVERVIEW OF THE INPUT STRUCTURE_______________________
# experiment set
# --- experiment_1 (experiment specific info)
# -- foreground
# - sample_1 { sample specific info}
# ...
# - sample_n { sample specific info}
# -- background: sample_1 (sample_2 ...sample_n)
# - sample_1 { sample specific info}
# ...
# - sample_n { sample specific info}
# ...
# --- experiment_n (experiment specific info)
#_____________________________________________________________________________
# INPUT
# ----------------------------------------------------------------------------
# *** Note: Make sure these paths are within the cloned RCRUNCH repo
# *** Create soft-links within the input_files folder if you already
# have the files stored somewhere else in your system
# - DEFAULT VALUES PROVIDED
input_dir: 'input_files'
# directory holding the files to be used by RCRUNCH
# organism related data - REQUIRED - FILL-IN
genome: 'input_files/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa'
# path to the genome file (.fa)
genome_tag: 'hg38'
# organism tag - a label of your own choosing for the organism
#_____________________________________________________________________________
# transcriptomic approach (TR) related data - OPTIONAL
# *** Note: specify only if you use TR in "method_types"
gtf: 'input_files/Homo_sapiens.GRCh38.98.chr.gtf'
# path to the annotation file (.gtf)
transcriptome: 'input_files/Homo_sapiens.GRCh38.transcriptome.fa'
# path to the transcriptome file (.fa)
#_____________________________________________________________________________
EXPERIMENT_SET:
  - 'PUM2_K562_ENCSR661ICQ_2'
# experiment labels, chosen by the user
# every label listed here needs its own specification, as shown below
# RCRUNCH executes only the experiments listed here
#______________ Experiment specific info - REQUIRED___________________________
# ***Note: each experimental data set consists of foreground (CLIP)
# and background samples (e.g. RNA-seq, SMInput). You can use more
# than one foreground or background sample per experiment. These
# samples will be merged together and treated as one sample (this
# functionality is useful for treating technical replicates as one
# sample). Make sure that the samples you merge are of the same
# kind (e.g. paired, sense, dup_type).
# Results will be collected in folders whose names are composed from the
# experiment label names; use informative values as shown here.
PUM2_K562_ENCSR661ICQ_2:
  # experiment label name
  rbp: 'PUM2'
  # name of the RNA binding protein, e.g. PUM2
  # RCRUNCH searches ATtRACT for motifs under this name, so the
  # ATtRACT rbp naming conventions should be followed
  replicates:
    - 'ENCFF041KJT_ENCFF462SCV'
  # user-specified sample label(s) treated as foreground
  # if more than one is given, they are merged and treated as one sample
  # each sample has a specification below, tagged with its sample label
  smis:
    - 'ENCFF616FCF_ENCFF495ZPY'
  # user-specified sample label(s) treated as background
  # if more than one is given, they are merged and treated as one sample
  # each sample has a specification below, tagged with its sample label
  window_f: 300
  # size to split windows
  # should be at least double the fragment size
  window_b: 300
  # size to split background windows
  # a larger window than the foreground one can be used
  # if the background sample is sparse
  step_size: 150
  # step of the sliding windows
  # e.g. with window size 300 and step 150, two successive windows
  # overlap by half
  # with a step size equal to the window size, the windows do not overlap
  background_type: 'standard'
  # related to future functionality
  # leave as is
#______________ Sample specific info - REQUIRED_______________________________
ENCFF041KJT_ENCFF462SCV:
  mate1: 'ENCFF041KJT'
  # for a file named ENCFF041KJT.fastq.gz, enter ENCFF041KJT
  # files are assumed to be in fastq.gz format
  mate2: 'ENCFF462SCV'
  # for a file named ENCFF462SCV.fastq.gz, enter ENCFF462SCV
  # files are assumed to be in fastq.gz format
  paired: 2
  # leave as is; reserved for future support of single-end data
  sense: 2
  # whether mate 1 is sense (1) or antisense (2)
  # for paired-end reads the other mate is assumed to be the opposite
  format: 'encode'
  # leave as is if the fastq file comes from encode,
  # otherwise set to 'standard'
  dup_type: 'umis'
  # type of duplicates to be removed
  # options: "umis", "duplicates", "with_duplicates"
  # "with_duplicates" means that no deduplication is performed
  mate1_3p: 'input_files/mate1_3p.fasta'
  # fasta file with the 3p adapters of mate1 to be removed
  # multiple adapters are used here, as described in the analysis
  # of encode data
  # adjust accordingly for your test case
  # put 'XXXXXXX' if you do not want any adapter trimming
  mate1_5p: 'input_files/mate1_5p.fasta'
  # fasta file with the 5p adapters of mate1 to be removed
  mate2_3p: 'input_files/mate2_3p.fasta'
  # fasta file with the 3p adapters of mate2 to be removed
  mate2_5p: 'input_files/mate2_5p.fasta'
  # fasta file with the 5p adapters of mate2 to be removed
ENCFF616FCF_ENCFF495ZPY:
  # sample listed under "smis" (background) of PUM2_K562_ENCSR661ICQ_2
  # fastq.gz file names without the extension
  mate1: 'ENCFF616FCF'
  mate2: 'ENCFF495ZPY'
  paired: 2
  sense: 2
  format: 'encode'
  dup_type: 'umis'
  # fasta files with the adapters to be removed from each mate end
  mate1_3p: 'input_files/mate1_3p.fasta'
  mate1_5p: 'input_files/mate1_5p.fasta'
  mate2_3p: 'input_files/mate2_3p.fasta'
  mate2_5p: 'input_files/mate2_5p.fasta'
#_________RCRUNCH specific options - REQUIRED - DEFAULT VALUES SUGGESTED _____
method_types:
  - 'GN'
  - 'TR'
# options: GN, TR
# to run BOTH methods, list both: ['GN', 'TR']
# ['GN', 'TR'] creates both transcriptomic & genomic predictions
multimappers: 1
# change this to accept reads that map to multiple regions
# (the number is how many regions a read can map to at the same time)
FDR: '0.1'
# false discovery cutoff
# peaks with score above the cutoff will be
# included in the total_peaks.csv file
sjdbOverhang: 49
# maximum read length of the read mates, minus 1
fragment_size: 80
# estimated fragment size of the experiment
#___RCRUNCH specific options - DEFAULTS - Be careful of the values you use ___
seq_type: 'pe'
# paired end - only paired-end sequencing is currently allowed
motif_lengths:
  - 6
  - 8
# motif sizes to be considered
peak_center:
  - 'crosslink'
  - 'peak_center'
# center of the peak used when searching for motifs:
# 'crosslink' (position where most reads start) or 'peak_center'
# list both to use both: ["crosslink", "peak_center"]
runs: 2
# number of randomisations of the 'training' and 'test' sets
# in the cross-validation of motif enrichment
# more randomisations give a more reliable result,
# but too many runs will make RCRUNCH slower
# a number ~2-10 is suggested
random_sequences_per_peak: 20
# used in the background creation during the motif analysis step
#___________________________ADDITIONAL LIBRARIES______________________________
# ncRNA filtering based on RNAcentral annotation - OPTIONAL
# Leave empty if you haven't downloaded this library.
# ncRNAs: ""
# ncRNA_biotypes: []
ncRNAs: 'input_files/homo_sapiens.GRCh38.gff3'
# path to the ncRNA database (.gff3)
ncRNA_biotypes: []
# e.g. 'rRNA', 'snRNA'
# if empty, no filtering will take place
# if specified, ncRNAs must be specified as well
# Include ATtRACT known motifs in the analysis - OPTIONAL
# Leave empty if you haven't downloaded this library.
# wlib_names: ""
# wlib_pwms: ""
# wlib_organism: ""
wlib_names: 'input_files/ATtRACT/ATtRACT_db.txt'
# path to the ATtRACT database names file (.txt)
wlib_pwms: 'input_files/ATtRACT/pwm.txt'
# path to the ATtRACT database pwm file (.txt)
wlib_organism: 'Homo_sapiens'
# change for a different organism
# ============================================================================
...