From 20fa8247c19cae38ad95e0350b8352245f8d9f03 Mon Sep 17 00:00:00 2001
From: "Evan L. Ray"
Date: Tue, 10 Dec 2024 15:55:14 -0500
Subject: [PATCH 1/3] first draft workflow for getting nhsn data into s3 bucket

Add a workflow, run every Wednesday and on manual dispatch, that
installs R and RSocrata, downloads the NHSN dataset mpgq-jmmr from
data.cdc.gov, writes it to a dated CSV file, and copies ./data/ to the
infectious-disease-data S3 bucket with rclone.

---
 .github/workflows/snapshot-nhsn-data.yml | 43 ++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 .github/workflows/snapshot-nhsn-data.yml

diff --git a/.github/workflows/snapshot-nhsn-data.yml b/.github/workflows/snapshot-nhsn-data.yml
new file mode 100644
index 0000000..f76961a
--- /dev/null
+++ b/.github/workflows/snapshot-nhsn-data.yml
@@ -0,0 +1,43 @@
+name: Snapshot NHSN data and upload to S3
+
+on:
+  schedule:
+    - cron: "45 12 * * 3" # every Wednesday at 5:45PM UTC == 12:45PM EST
+  workflow_dispatch:
+
+env:
+  # Reich lab AWS account number
+  AWS_ACCOUNT: 312560106906
+
+jobs:
+  snapshot-nhsn-data:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Set up R 📊
+        uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: 4.4.1
+          install-r: true
+          use-public-rspm: true
+          extra-repositories: 'https://hubverse-org.r-universe.dev'
+
+      - name: install R packages
+        run: |
+          Rscript -e "install.packages('remotes')"
+          Rscript -e "remotes::install_github('Chicago/RSocrata@v1.7.11')"
+
+      - name: Snapshot NHSN data
+        run: Rscript -e "nhsn_data <- RSocrata::read.socrata('https://data.cdc.gov/resource/mpgq-jmmr.csv');
+          write.csv(nhsn_data, paste0('nhsn-', Sys.Date(), '.csv'))"
+
+      - name: Install rclone
+        run: |
+          curl https://rclone.org/install.sh | sudo bash
+          rclone version
+
+      - name: Copy to cloud storage
+        # copy the created file to S3
+        run: |
+          rclone copy ./data/ ":s3,provider=AWS,env_auth:infectious-disease-data/data-raw/influenza-nhsn" \
+            --checksum --verbose --stats-one-line --config=/dev/null
+        shell: bash

From 9557921d9ab16a929dbf8bba25c80e6408858ecf Mon Sep 17 00:00:00 2001
From: "Evan L. Ray"
Date: Tue, 10 Dec 2024 15:57:46 -0500
Subject: [PATCH 2/3] fix run time

The cron hour field uses a 24-hour clock, so 5:45PM UTC is hour 17.
Hour 12 scheduled the job for 12:45 UTC, which is 7:45AM EST rather
than the 12:45PM EST stated in the comment.

---
 .github/workflows/snapshot-nhsn-data.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/snapshot-nhsn-data.yml b/.github/workflows/snapshot-nhsn-data.yml
--- a/.github/workflows/snapshot-nhsn-data.yml
+++ b/.github/workflows/snapshot-nhsn-data.yml
@@ -2,7 +2,7 @@ name: Snapshot NHSN data and upload to S3
 
 on:
   schedule:
-    - cron: "45 12 * * 3" # every Wednesday at 5:45PM UTC == 12:45PM EST
+    - cron: "45 17 * * 3" # every Wednesday at 5:45PM UTC == 12:45PM EST
   workflow_dispatch:
 
 env:

From 16e94409658d6696291851b004007758ee0e4528 Mon Sep 17 00:00:00 2001
From: "Evan L. Ray"
Date: Tue, 10 Dec 2024 16:24:43 -0500
Subject: [PATCH 3/3] fixed the action

Compute the dated file name once in its own step and export it through
GITHUB_ENV so that the R step and the upload step agree on it. The R
step reads it with Sys.getenv(), which yields a string; splicing the
shell variable into the R source unquoted would not.

Replace rclone with aws-actions/configure-aws-credentials plus
"aws s3 cp". The job is granted "id-token: write", which the action
needs in order to request an OpenID Connect token, and the credentials
step runs unconditionally because CLOUD_ENABLED is not defined anywhere
in this workflow.

The S3 destination ends with "/" so the file is stored under the
influenza-nhsn prefix instead of overwriting an object with that name.
The upload is still a --dryrun.

---
 .github/workflows/snapshot-nhsn-data.yml | 36 +++++++++++++++---------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/snapshot-nhsn-data.yml b/.github/workflows/snapshot-nhsn-data.yml
--- a/.github/workflows/snapshot-nhsn-data.yml
+++ b/.github/workflows/snapshot-nhsn-data.yml
@@ -12,6 +12,10 @@ env:
 jobs:
   snapshot-nhsn-data:
     runs-on: ubuntu-latest
+    permissions:
+      # needed by configure-aws-credentials to request an OIDC token
+      id-token: write
+      contents: read
     steps:
       - name: Set up R 📊
         uses: r-lib/actions/setup-r@v2
@@ -26,18 +30,24 @@ jobs:
           Rscript -e "install.packages('remotes')"
           Rscript -e "remotes::install_github('Chicago/RSocrata@v1.7.11')"
 
+      - name: Get file name
+        run: echo "FILE_NAME=nhsn-$(date +'%Y-%m-%d').csv" >> "$GITHUB_ENV"
+
       - name: Snapshot NHSN data
         run: Rscript -e "nhsn_data <- RSocrata::read.socrata('https://data.cdc.gov/resource/mpgq-jmmr.csv');
-          write.csv(nhsn_data, paste0('nhsn-', Sys.Date(), '.csv'))"
-
-      - name: Install rclone
-        run: |
-          curl https://rclone.org/install.sh | sudo bash
-          rclone version
-
-      - name: Copy to cloud storage
-        # copy the created file to S3
-        run: |
-          rclone copy ./data/ ":s3,provider=AWS,env_auth:infectious-disease-data/data-raw/influenza-nhsn" \
-            --checksum --verbose --stats-one-line --config=/dev/null
-        shell: bash
+          write.csv(nhsn_data, file = Sys.getenv('FILE_NAME'), row.names = FALSE)"
+        env:
+          FILE_NAME: ${{ env.FILE_NAME }}
+
+      - name: Configure AWS credentials
+        # request credentials to assume the hub's AWS role via OpenID Connect
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::${{ env.AWS_ACCOUNT }}:role/iddata-github-action
+          aws-region: us-east-1
+
+      - name: Copy files to cloud storage
+        # the trailing "/" makes the destination a prefix, not an object key;
+        # --dryrun only prints the operation: remove it to actually upload
+        run: |
+          aws s3 cp "./$FILE_NAME" "s3://infectious-disease-data/data-raw/influenza-nhsn/" --dryrun