---
name: MediaWords

### database settings. at least one database connection must be defined. the
### main "production" database should be the first one below.
database:

    # production
    - label : "LABEL"
      type  : "pg"
      host  : "localhost"
      port  : 5432
      db    : "mediacloud"
      user  : "mediaclouduser"
      pass  : "mediacloud"

    # unit tests
    - label : "test"
      type  : "pg"
      host  : "localhost"
      port  : 5432
      db    : "mediacloud_test"
      user  : "mediaclouduser"
      pass  : "mediacloud"
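# A quick way to verify the "production" credentials above from a shell,
# assuming the standard "psql" client is installed (it will prompt for the
# password; adjust host/port/user if you changed them):
#
#     psql -h localhost -p 5432 -U mediaclouduser mediacloud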
### Amazon S3 connection settings
#amazon_s3:

    ### Bucket for storing downloads
    #downloads:
        #access_key_id     : "AKIAIOSFODNN7EXAMPLE"
        #secret_access_key : "wJalrXUtnFEMI/K7MDENG/bPxRfiCYzEXAMPLEKEY"
        #bucket_name       : "mediacloud-downloads"
        #directory_name    : "downloads"

    ### Bucket for testing
    #test:
        #access_key_id     : "AKIAIOSFODNN7EXAMPLE"
        #secret_access_key : "wJalrXUtnFEMI/K7MDENG/bPxRfiCYzEXAMPLEKEY"
        #bucket_name       : "mediacloud_test"
        ### A unique random string will be appended to the directory name
        #directory_name    : "downloads_test"

    ### Bucket for storing Bit.ly raw JSON responses
    #bitly_processing_results:
        #access_key_id     : "AKIAIOSFODNN7EXAMPLE"
        #secret_access_key : "wJalrXUtnFEMI/K7MDENG/bPxRfiCYzEXAMPLEKEY"
        #bucket_name       : "mediacloud-bitly-processing-results"
        #directory_name    : "json_blobs"

    ### Cache raw Bit.ly processing results locally?
    #cache_bitly_processing_results : "no"
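# To sanity-check the (example) bucket settings above with the AWS CLI,
# assuming "aws" is installed and configured with the same credentials:
#
#     aws s3 ls s3://mediacloud-downloads/downloads/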
## Job manager (MediaCloud::JobManager) configuration
job_manager:

    ## When uncommented, will use RabbitMQ as the job broker
    rabbitmq:

        ## RabbitMQ client configuration
        ## (both workers and clients will use this key)
        client:

            ## Connection credentials
            hostname: "localhost"
            port: 5673 # not the default 5672
            username: "mediacloud"
            password: "mediacloud"
            vhost: "/mediacloud"
            timeout: 60
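            ## For reference, the client settings above are equivalent to the
            ## AMQP URI below (the vhost "/mediacloud" is percent-encoded as
            ## "%2Fmediacloud"):
            ##
            ##     amqp://mediacloud:mediacloud@localhost:5673/%2Fmediacloud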
        ## RabbitMQ server configuration
        ## (rabbitmq_wrapper.sh will use this for starting up an instance of
        ## RabbitMQ)
        server:

            ## To disable your very own RabbitMQ instance managed by Supervisord,
            ## set the below to "no". Default is "yes".
            enabled: "yes"

            ## Host to listen on. You can set this parameter to an empty string
            ## so that RabbitMQ will accept connections from anywhere; however,
            ## it is highly advisable to use secure channels (e.g. an SSH tunnel)
            ## to make RabbitMQ accessible from "outside" instead. Default is
            ## "127.0.0.1".
            listen: "127.0.0.1"

            ## Port to use for RabbitMQ. The default port for vendor-provided
            ## RabbitMQ deployments is 5672, but Media Cloud runs its own
            ## RabbitMQ instance via Supervisord. Default is 5673.
            port: 5673 # not the default 5672

            ## Node name
            node_name: "mediacloud@localhost"

            ## User credentials and vhost to create upon start (instead of "guest")
            username: "mediacloud"
            password: "mediacloud"
            vhost: "/mediacloud"
### Supervisor (supervisord) configuration
supervisor:

    ### The log directory for child process logs (absolute or relative to Media
    ### Cloud's root; must already exist)
    childlogdir: "data/supervisor_logs/"

    # If set to true, do not autostart any programs, regardless of the settings
    # in the individual programs below.
    start_no_supervisor_programs: 'true'

    # Configure Supervisor settings for Media Cloud background daemons here.
    # The defaults should work for a small dev setup, but you will want to
    # increase numprocs for some daemons depending on load (see the commented
    # sketch after the program list below). You can also set some daemons not
    # to autostart -- for instance, you might want to change crawler.autostart
    # to 'false' to prevent the crawler from starting automatically on a dev
    # machine.
    #programs:

        #crawler:
            #numprocs: 1
            #autostart: 'true'
            #autorestart: 'false'

        #extract_and_vector:
            #numprocs: 1
            #autostart: 'true'
            #autorestart: 'true'
        # other configurable supervisor programs:

        #create_missing_partitions
        #purge_object_caches
        #facebook_fetch_story_stats
        #process_bitly_schedule
        #extractor_python_readability_server
        #rabbitmq
        #rescrape_media
        #topic_mine
        #topic_snapshot
        #annotate_with_corenlp
        #bitly_fetch_story_stats
        #bitly_aggregate_story_stats

        # Standalone Solr instance
        #solr_standalone

        # Solr cluster: ZooKeeper instance
        #solr_cluster_zookeeper

        # Solr cluster: Solr shards
        # (Don't set "numprocs" here; adjust "cluster_shard_count" /
        # "local_shard_count" instead.)
        #solr_shard
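    # A minimal sketch of overriding one of the programs listed above -- e.g.
    # running four crawler processes (the numbers are illustrative, not
    # recommendations):
    #
    #programs:
    #    crawler:
    #        numprocs: 4
    #        autostart: 'true'
    #        autorestart: 'true'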
### Solr server, when running as a Supervisor service
supervisor_solr:

    ### Standalone Solr instance
    standalone:

        # JVM heap size (-Xmx)
        jvm_heap_size: "256m"

    ### Solr cluster
    cluster:

        ### ZooKeeper instance
        zookeeper:

            ### Address to bind to
            listen: "0.0.0.0"

            ### Port to listen on
            port: 9983

        ### Solr shards
        shards:

            # Total number of local shards
            local_shard_count: 2

            # Total number of shards across the cluster ("numShards")
            cluster_shard_count: 2

            # JVM heap size for a single shard (-Xmx)
            jvm_heap_size: "256m"

            # ZooKeeper host + port to connect shards to
            zookeeper_host: "localhost"
            zookeeper_port: 9983
### CoreNLP annotator
corenlp:

    ### Enable CoreNLP processing
    ### If enabled, CoreNLP processing will happen after every "content"
    ### download extraction
    enabled: "no"

    ### Annotator URL, e.g. "http://www.example.com:8080/corenlp/annotator"
    annotator_url: ""

    ### Annotator timeout (in seconds)
    ### If you annotate huge chunks of text or the CoreNLP annotator is busy
    ### in general, you might want to increase this value; otherwise the job
    ### worker will exit() often.
    annotator_timeout: 600

    ### CoreNLP annotator level; you might want to use this configuration
    ### parameter to limit the scope of annotations returned from the service.
    ### Default is an empty string; you might want to set it to "ner".
    annotator_level: ""
### Bit.ly API
bitly:

    ### Enable Bit.ly processing
    enabled: "no"

    ### (Generic) Access Token
    ### Get one at: https://bitly.com/a/oauth_apps
    access_token: ""

    ### API request timeout (in seconds)
    timeout: 60

    ### Locations to read raw JSON responses from
    ### (default is just "postgresql")
    json_read_stores:
        ### Try the "bitly_processing_results" table first...
        - postgresql
        ### ...then fall back to Amazon S3
        #- amazon_s3

    ### Locations to write raw JSON responses to
    ### (default is just "postgresql")
    json_write_stores:
        ### Write to the "bitly_processing_results" table first...
        - postgresql
        ### ...then to Amazon S3 too
        #- amazon_s3

    ### Bit.ly processing for all stories
    #story_processing:

        ### Enable Bit.ly processing for all stories (not just the ones that
        ### belong to topics enabled for Bit.ly processing)
        #enabled: "no"

        ### Delay for which to postpone story processing after its
        ### "publish_date" / "collect_date"
        ###
        ### For example, you might want to process the story against Bit.ly after:
        ### * 259200 seconds (3 days = 3 * 86400 s), and
        ### * 2592000 seconds (30 days = 30 * 86400 s)
        ### from the story's "publish_date" (or "collect_date" if "publish_date"
        ### is invalid).
        #schedule:
            ### 3 days from "stories.publish_date"
            #- 259200
            ### 30 days from "stories.publish_date"
            #- 2592000
### Facebook API
### (see doc/README.facebook_api.markdown)
facebook:

    ### Enable Facebook processing
    enabled: "no"

    ## App ID
    app_id: ""

    ## App Secret
    app_secret: ""

    ## Request timeout
    #timeout: 60

### Twitter API credentials
#twitter:
#    consumer_key: ""
#    consumer_secret: ""
#    access_token: ""
#    access_token_secret: ""

### Key for fetching tweets from Crimson Hexagon; necessary for topic tweets,
### as implemented in FetchTopicTweets.pm
#crimson_hexagon:
#    key: ""

### Univision.com feed credentials
#univision:

    ### Client ID
    #client_id: 83db02e1cba58c43d01116c50014913b47fa473b

    ### Client Secret (Secret Key)
    #client_secret: 7187037755de2dd77451f491d46b103b86fbcf79
### Email configuration
mail:

    # "From:" email address set in emails sent by Media Cloud
    from_address: "[email protected]"

    ### (optional) SMTP configuration
    smtp:

        ### SMTP host
        host: "localhost"

        ### SMTP port
        port: 25

        ### Use STARTTLS? If you enable this, you probably want to change the
        ### port to 587.
        starttls: no

        ### (optional) SMTP login credentials
        username: ""
        password: ""
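    ### A minimal sketch of an authenticated STARTTLS setup (hostname and
    ### credentials below are hypothetical placeholders):
    #smtp:
    #    host: "smtp.example.com"
    #    port: 587
    #    starttls: yes
    #    username: "mediacloud@example.com"
    #    password: "CHANGEME"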
### Everything below is optional. The system should work out of the box without
### touching any of these other than calais_key for tagging.
#session:
    #expires: 3600

    ### Directory where web app sessions are stored. Defaults to $homedir/tmp.
    #storage: "~/tmp/mediawords-session"

## Uncomment and fill in to use Google Analytics
#google_analytics:
#    account: "<ACCOUNT>"
#    domainname: "<DOMAIN>"
mediawords:

    ### Defaults to http://$hostname:$port/.
    #base_url: "http://your.mediacloud.server/and/path"

    ### Directory in which various kinds of data (logs, etc.) are stored
    #data_dir: "<bindir>/../data"

    ### HTTP user agent and the email address of the owner of the bot
    user_agent: "mediawords bot (http://cyber.law.harvard.edu)"
    owner: "[email protected]"

    ### Uncomment one or more storage methods to store downloads in.
    ### Default is "postgresql", which stores downloads directly in the
    ### PostgreSQL database.
    ###
    ### Very short downloads will be stored directly in the database, under
    ### "downloads.path".
    ###
    ### The path of the last download storage method listed below will be
    ### stored in the "downloads.path" database column.
    download_storage_locations:
        ### Store downloads in the PostgreSQL database, "raw_downloads" table
        - postgresql
        ### Store downloads in Amazon S3
        #- amazon_s3
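    ### For example, to write each download to both stores (with S3 listed
    ### last, its path ends up in "downloads.path") -- a sketch, assuming the
    ### "amazon_s3.downloads" bucket above has been configured:
    #download_storage_locations:
    #    - postgresql
    #    - amazon_s3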
    ### Read all non-inline ("content") downloads from S3
    read_all_downloads_from_s3 : "no"

    ### Set to "yes" to fall back PostgreSQL downloads to Amazon S3 (if a
    ### download doesn't exist in PostgreSQL storage, S3 will be tried instead)
    fallback_postgresql_downloads_to_s3 : "no"

    ### Enable local Amazon S3 download caching?
    cache_s3_downloads : "no"

    # Controls the maximum time SQL queries can run for -- time is in ms.
    # Uncomment to enable a 10-minute timeout.
    #db_statement_timeout: "600000"

    # Uncomment to speed up slow queries by setting the PostgreSQL work_mem
    # parameter to this value. By default, the initial PostgreSQL value of
    # work_mem is used.
    # large_work_mem: "3GB"

    # An experimental parameter to dump stack traces in error messages even if
    # not in debug mode.
    # NOTE: may leak DB passwords and is not to be used in production.
    always_show_stack_traces: "no"

    # reCAPTCHA public key (used to prevent brute force in the password reset form).
    # The default value was set up for http://127.0.0.1 and is a global key
    # (should work across all domains).
    recaptcha_public_key: "6LfEVt0SAAAAAFwQI0pOZ1bTHgDTpQcMeQY6VLd_"

    # reCAPTCHA private key (used to prevent brute force in the password reset form).
    # The default value was set up for http://127.0.0.1 and is a global key
    # (should work across all domains).
    recaptcha_private_key: "6LfEVt0SAAAAABmI-8IJmx4g93eNcSeyeCxvLMs2"

    # Default start page; comment out to make the public homepage the default.
    default_home_page: "admin/media/list"

    # Downloads ID under which to strip all non-ASCII characters
    #ascii_hack_downloads_id: 123456789

    # Settings for the mediawords_web_store.pl script that does in-process
    # parallel fetching.
    # web_store_num_parallel: 10
    # web_store_timeout: 90
    # web_store_per_domain_timeout: 1

    # Tablespace in which to create temporary tables -- defaults to the
    # PostgreSQL default.
    # temporary_table_tablespace: temporary_tablespace
    # URL for Solr word counting. If this is set, fetch word counts from a
    # remote server using this URL; otherwise, generate word counts locally.
    # solr_wc_url: http://localhost/api/v2/wc

    # Media Cloud API key to append to solr_wc_url when fetching remote word counts
    # solr_wc_key: FOO

    # URLs for Solr queries; include multiple to make Media Cloud choose a
    # random URL from the list for each Solr query
    solr_url:
        # Standalone Solr instance...
        - http://localhost:8983/solr

        # ...or SolrCloud shards
        #- http://127.0.0.1:7981/solr
        #- http://127.0.0.1:7982/solr
        #- http://127.0.0.1:7983/solr
        #- http://127.0.0.1:7984/solr
        #- http://127.0.0.1:7985/solr
        #- http://127.0.0.1:7986/solr
        #- http://127.0.0.1:7987/solr
        #- http://127.0.0.1:7988/solr

    # Solr importer configuration
    solr_import:

        # Stories to import into Solr on a single run
        max_queued_stories: 100000

    # Set to "yes" to skip the requirement to run on the correct database
    # schema version
    # ignore_schema_version: "no"

    # Increment wc_cache_version to invalidate the existing cache
    # wc_cache_version: 1

    # List of emails to which to send all topic alerts
    # topic_alert_emails:
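    # For example (hypothetical addresses), alerts would go to every address
    # in the YAML list:
    # topic_alert_emails:
    #     - alerts@example.com
    #     - admin@example.com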