---
name: MediaWords

### database settings. at least one database connection must be defined. the
### main "production" database should be the first one below.
database:

    # production
    - label : "LABEL"
      type  : "pg"
      host  : "localhost"
      port  : 5432
      db    : "mediacloud"
      user  : "mediaclouduser"
      pass  : "mediacloud"

    # unit tests
    - label : "test"
      type  : "pg"
      host  : "localhost"
      port  : 5432
      db    : "mediacloud_test"
      user  : "mediaclouduser"
      pass  : "mediacloud"
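# A quick way to verify the "production" credentials above from a shell,
# assuming the standard "psql" client is installed (it will prompt for the
# password; adjust host/port/user if you changed them):
#
#     psql -h localhost -p 5432 -U mediaclouduser mediacloud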
### Amazon S3 connection settings
#amazon_s3:

    ### Bucket for storing downloads
    #downloads:
        #access_key_id     : "AKIAIOSFODNN7EXAMPLE"
        #secret_access_key : "wJalrXUtnFEMI/K7MDENG/bPxRfiCYzEXAMPLEKEY"
        #bucket_name       : "mediacloud-downloads"
        #directory_name    : "downloads"

    ### Bucket for testing
    #test:
        #access_key_id     : "AKIAIOSFODNN7EXAMPLE"
        #secret_access_key : "wJalrXUtnFEMI/K7MDENG/bPxRfiCYzEXAMPLEKEY"
        #bucket_name       : "mediacloud_test"
        ### A unique random string will be appended to the directory name
        #directory_name    : "downloads_test"

    ### Bucket for storing Bit.ly raw JSON responses
    #bitly_processing_results:
        #access_key_id     : "AKIAIOSFODNN7EXAMPLE"
        #secret_access_key : "wJalrXUtnFEMI/K7MDENG/bPxRfiCYzEXAMPLEKEY"
        #bucket_name       : "mediacloud-bitly-processing-results"
        #directory_name    : "json_blobs"

    ### Cache raw Bit.ly processing results locally?
    #cache_bitly_processing_results : "no"
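# To sanity-check the (example) bucket settings above with the AWS CLI,
# assuming "aws" is installed and configured with the same credentials:
#
#     aws s3 ls s3://mediacloud-downloads/downloads/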
## Job manager (MediaCloud::JobManager) configuration
job_manager:

    ## When uncommented, will use RabbitMQ as the job broker
    rabbitmq:

        ## RabbitMQ client configuration
        ## (both workers and clients will use this key)
        client:

            ## Connection credentials
            hostname: "localhost"
            port: 5673 # not the default 5672
            username: "mediacloud"
            password: "mediacloud"
            vhost: "/mediacloud"
            timeout: 60
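            ## For reference, the client settings above are equivalent to the
            ## AMQP URI below (the vhost "/mediacloud" is percent-encoded as
            ## "%2Fmediacloud"):
            ##
            ##     amqp://mediacloud:mediacloud@localhost:5673/%2Fmediacloud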
        ## RabbitMQ server configuration
        ## (rabbitmq_wrapper.sh will use this for starting up an instance of
        ## RabbitMQ)
        server:

            ## To disable your very own RabbitMQ instance managed by Supervisord,
            ## set the below to "no". Default is "yes".
            enabled: "yes"

            ## Host to listen on. You can set this parameter to an empty string
            ## so that RabbitMQ will accept connections from anywhere; however,
            ## it is highly advisable to use secure channels (e.g. an SSH tunnel)
            ## to make RabbitMQ accessible from "outside" instead. Default is
            ## "127.0.0.1".
            listen: "127.0.0.1"

            ## Port to use for RabbitMQ. The default port for vendor-provided
            ## RabbitMQ deployments is 5672, but Media Cloud runs its own
            ## RabbitMQ instance via Supervisord. Default is 5673.
            port: 5673 # not the default 5672

            ## Node name
            node_name: "mediacloud@localhost"

            ## User credentials and vhost to create upon start (instead of "guest")
            username: "mediacloud"
            password: "mediacloud"
            vhost: "/mediacloud"
### Supervisor (supervisord) configuration
supervisor:

    ### The log directory for child process logs (absolute or relative to Media
    ### Cloud's root; must already exist)
    childlogdir: "data/supervisor_logs/"

    # If set to true, do not autostart any programs, regardless of the settings
    # in the individual programs below.
    start_no_supervisor_programs: 'true'

    # Configure Supervisor settings for Media Cloud background daemons here.
    # The defaults should work for a small dev setup, but you will want to
    # increase numprocs for some daemons depending on load (see the commented
    # sketch after the program list below). You can also set some daemons not
    # to autostart -- for instance, you might want to change crawler.autostart
    # to 'false' to prevent the crawler from starting automatically on a dev
    # machine.
    #programs:

        #crawler:
            #numprocs: 1
            #autostart: 'true'
            #autorestart: 'false'

        #extract_and_vector:
            #numprocs: 1
            #autostart: 'true'
            #autorestart: 'true'
        # other configurable supervisor programs:

        #create_missing_partitions
        #purge_object_caches
        #facebook_fetch_story_stats
        #process_bitly_schedule
        #extractor_python_readability_server
        #rabbitmq
        #rescrape_media
        #topic_mine
        #topic_snapshot
        #annotate_with_corenlp
        #bitly_fetch_story_stats
        #bitly_aggregate_story_stats

        # Standalone Solr instance
        #solr_standalone

        # Solr cluster: ZooKeeper instance
        #solr_cluster_zookeeper

        # Solr cluster: Solr shards
        # (Don't set "numprocs" here; adjust "cluster_shard_count" /
        # "local_shard_count" instead.)
        #solr_shard
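    # A minimal sketch of overriding one of the programs listed above -- e.g.
    # running four crawler processes (the numbers are illustrative, not
    # recommendations):
    #
    #programs:
    #    crawler:
    #        numprocs: 4
    #        autostart: 'true'
    #        autorestart: 'true'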
### Solr server, when running as a Supervisor service
supervisor_solr:

    ### Standalone Solr instance
    standalone:

        # JVM heap size (-Xmx)
        jvm_heap_size: "256m"

    ### Solr cluster
    cluster:

        ### ZooKeeper instance
        zookeeper:

            ### Address to bind to
            listen: "0.0.0.0"

            ### Port to listen on
            port: 9983

        ### Solr shards
        shards:

            # Total number of local shards
            local_shard_count: 2

            # Total number of shards across the cluster ("numShards")
            cluster_shard_count: 2

            # JVM heap size for a single shard (-Xmx)
            jvm_heap_size: "256m"

            # ZooKeeper host + port to connect shards to
            zookeeper_host: "localhost"
            zookeeper_port: 9983
### CoreNLP annotator
corenlp:

    ### Enable CoreNLP processing
    ### If enabled, CoreNLP processing will happen after every "content"
    ### download extraction
    enabled: "no"

    ### Annotator URL, e.g. "http://www.example.com:8080/corenlp/annotator"
    annotator_url: ""

    ### Annotator timeout (in seconds)
    ### If you annotate huge chunks of text or the CoreNLP annotator is busy
    ### in general, you might want to increase this value; otherwise the job
    ### worker will exit() often.
    annotator_timeout: 600

    ### CoreNLP annotator level; you might want to use this configuration
    ### parameter to limit the scope of annotations returned from the service.
    ### Default is an empty string; you might want to set it to "ner".
    annotator_level: ""
### Bit.ly API
bitly:

    ### Enable Bit.ly processing
    enabled: "no"

    ### (Generic) Access Token
    ### Get one at: https://bitly.com/a/oauth_apps
    access_token: ""

    ### API request timeout (in seconds)
    timeout: 60

    ### Locations to read raw JSON responses from
    ### (default is just "postgresql")
    json_read_stores:
        ### Try the "bitly_processing_results" table first...
        - postgresql
        ### ...then fall back to Amazon S3
        #- amazon_s3

    ### Locations to write raw JSON responses to
    ### (default is just "postgresql")
    json_write_stores:
        ### Write to the "bitly_processing_results" table first...
        - postgresql
        ### ...then to Amazon S3 too
        #- amazon_s3

    ### Bit.ly processing for all stories
    #story_processing:

        ### Enable Bit.ly processing for all stories (not just the ones that
        ### belong to topics enabled for Bit.ly processing)
        #enabled: "no"

        ### Delay for which to postpone story processing after its
        ### "publish_date" / "collect_date"
        ###
        ### For example, you might want to process the story against Bit.ly after:
        ### * 259200 seconds (3 days = 3 * 86400 s), and
        ### * 2592000 seconds (30 days = 30 * 86400 s)
        ### from the story's "publish_date" (or "collect_date" if "publish_date"
        ### is invalid).
        #schedule:
            ### 3 days from "stories.publish_date"
            #- 259200
            ### 30 days from "stories.publish_date"
            #- 2592000
### Facebook API
### (see doc/README.facebook_api.markdown)
facebook:

    ### Enable Facebook processing
    enabled: "no"

    ## App ID
    app_id: ""

    ## App Secret
    app_secret: ""

    ## Request timeout
    #timeout: 60

### Twitter API credentials
#twitter:
#    consumer_key: ""
#    consumer_secret: ""
#    access_token: ""
#    access_token_secret: ""

### Key for fetching tweets from Crimson Hexagon; necessary for topic tweets,
### as implemented in FetchTopicTweets.pm
#crimson_hexagon:
#    key: ""

### Univision.com feed credentials
#univision:

    ### Client ID
    #client_id: 83db02e1cba58c43d01116c50014913b47fa473b

    ### Client Secret (Secret Key)
    #client_secret: 7187037755de2dd77451f491d46b103b86fbcf79
### Email configuration
mail:

    # "From:" email address set in emails sent by Media Cloud
    from_address: "[email protected]"

    ### (optional) SMTP configuration
    smtp:

        ### SMTP host
        host: "localhost"

        ### SMTP port
        port: 25

        ### Use STARTTLS? If you enable this, you probably want to change the
        ### port to 587.
        starttls: no

        ### (optional) SMTP login credentials
        username: ""
        password: ""
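    ### A minimal sketch of an authenticated STARTTLS setup (hostname and
    ### credentials below are hypothetical placeholders):
    #smtp:
    #    host: "smtp.example.com"
    #    port: 587
    #    starttls: yes
    #    username: "mediacloud@example.com"
    #    password: "CHANGEME"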
### Everything below is optional. The system should work out of the box without
### touching any of these other than calais_key for tagging.
#session:
    #expires: 3600

    ### Directory where web app sessions are stored. Defaults to $homedir/tmp.
    #storage: "~/tmp/mediawords-session"

## Uncomment and fill in to use Google Analytics
#google_analytics:
#    account: "<ACCOUNT>"
#    domainname: "<DOMAIN>"
mediawords:

    ### Defaults to http://$hostname:$port/.
    #base_url: "http://your.mediacloud.server/and/path"

    ### Directory in which various kinds of data (logs, etc.) are stored
    #data_dir: "<bindir>/../data"

    ### HTTP user agent and the email address of the owner of the bot
    user_agent: "mediawords bot (http://cyber.law.harvard.edu)"
    owner: "[email protected]"

    ### Uncomment one or more storage methods to store downloads in.
    ### Default is "postgresql", which stores downloads directly in the
    ### PostgreSQL database.
    ###
    ### Very short downloads will be stored directly in the database, under
    ### "downloads.path".
    ###
    ### The path of the last download storage method listed below will be
    ### stored in the "downloads.path" database column.
    download_storage_locations:
        ### Store downloads in the PostgreSQL database, "raw_downloads" table
        - postgresql
        ### Store downloads in Amazon S3
        #- amazon_s3
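    ### For example, to write each download to both stores (with S3 listed
    ### last, its path ends up in "downloads.path") -- a sketch, assuming the
    ### "amazon_s3.downloads" bucket above has been configured:
    #download_storage_locations:
    #    - postgresql
    #    - amazon_s3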
    ### Read all non-inline ("content") downloads from S3
    read_all_downloads_from_s3 : "no"

    ### Set to "yes" to fall back PostgreSQL downloads to Amazon S3 (if a
    ### download doesn't exist in PostgreSQL storage, S3 will be tried instead)
    fallback_postgresql_downloads_to_s3 : "no"

    ### Enable local Amazon S3 download caching?
    cache_s3_downloads : "no"

    # Controls the maximum time SQL queries can run for -- time is in ms.
    # Uncomment to enable a 10-minute timeout.
    #db_statement_timeout: "600000"

    # Uncomment to speed up slow queries by setting the PostgreSQL work_mem
    # parameter to this value. By default, the initial PostgreSQL value of
    # work_mem is used.
    # large_work_mem: "3GB"

    # An experimental parameter to dump stack traces in error messages even if
    # not in debug mode.
    # NOTE: may leak DB passwords and is not to be used in production.
    always_show_stack_traces: "no"

    # reCAPTCHA public key (used to prevent brute force in the password reset form).
    # The default value was set up for http://127.0.0.1 and is a global key
    # (should work across all domains).
    recaptcha_public_key: "6LfEVt0SAAAAAFwQI0pOZ1bTHgDTpQcMeQY6VLd_"

    # reCAPTCHA private key (used to prevent brute force in the password reset form).
    # The default value was set up for http://127.0.0.1 and is a global key
    # (should work across all domains).
    recaptcha_private_key: "6LfEVt0SAAAAABmI-8IJmx4g93eNcSeyeCxvLMs2"

    # Default start page; comment out to make the public homepage the default.
    default_home_page: "admin/media/list"

    # Downloads ID under which to strip all non-ASCII characters
    #ascii_hack_downloads_id: 123456789

    # Settings for the mediawords_web_store.pl script that does in-process
    # parallel fetching.
    # web_store_num_parallel: 10
    # web_store_timeout: 90
    # web_store_per_domain_timeout: 1

    # Tablespace in which to create temporary tables -- defaults to the
    # PostgreSQL default.
    # temporary_table_tablespace: temporary_tablespace
    # URL for Solr word counting. If this is set, fetch word counts from a
    # remote server using this URL; otherwise, generate word counts locally.
    # solr_wc_url: http://localhost/api/v2/wc

    # Media Cloud API key to append to solr_wc_url when fetching remote word counts
    # solr_wc_key: FOO

    # URLs for Solr queries; include multiple to make Media Cloud choose a
    # random URL from the list for each Solr query
    solr_url:
        # Standalone Solr instance...
        - http://localhost:8983/solr

        # ...or SolrCloud shards
        #- http://127.0.0.1:7981/solr
        #- http://127.0.0.1:7982/solr
        #- http://127.0.0.1:7983/solr
        #- http://127.0.0.1:7984/solr
        #- http://127.0.0.1:7985/solr
        #- http://127.0.0.1:7986/solr
        #- http://127.0.0.1:7987/solr
        #- http://127.0.0.1:7988/solr

    # Solr importer configuration
    solr_import:

        # Stories to import into Solr on a single run
        max_queued_stories: 100000

    # Set to "yes" to skip the requirement to run on the correct database
    # schema version
    # ignore_schema_version: "no"

    # Increment wc_cache_version to invalidate the existing cache
    # wc_cache_version: 1

    # List of emails to which to send all topic alerts
    # topic_alert_emails:
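    # For example (hypothetical addresses), alerts would go to every address
    # in the YAML list:
    # topic_alert_emails:
    #     - alerts@example.com
    #     - admin@example.com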