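### Shared variables. REGISTRY is a per-pipeline working directory on the datalake
### runner host; LAKE_* are the HDFS paths addressed through the MaacDo API.
### DEPLOYMENT_TYPE defaults to "null" and is overridden by the workflow rules below.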
variables:
REGISTRY_ROOT: "/home/cloudadm/.ci-python_packages"
REGISTRY_KEY: "${CI_PIPELINE_ID}"
REGISTRY: "${REGISTRY_ROOT}/${REGISTRY_KEY}"
PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
PACKAGE: "python_packages.tar.gz"
LAKE_HOME: "/projets/TSF"
LAKE_MODULES_DIR: "${LAKE_HOME}/applicatifs"
LAKE_PKG_TARGET: "${LAKE_MODULES_DIR}/sf-packages-${CI_COMMIT_REF_SLUG}.tar.gz"
LAKE_ENTRYPOINT: "${LAKE_MODULES_DIR}/main.py"
DEPLOYMENT_TYPE: "null"
### Workflow: defines when branch and merge request pipelines should run.
workflow:
rules:
- if: '$CI_COMMIT_BRANCH == "main" || $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "main"'
variables:
DEPLOYMENT_TYPE: "prod"
- if: '$CI_COMMIT_BRANCH == "develop" || $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "develop"'
variables:
DEPLOYMENT_TYPE: "small"
- if: '$CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS'
when: never
- if: '$CI_COMMIT_TAG'
when: never
- if: '$CI_COMMIT_BRANCH != "main" && $CI_COMMIT_BRANCH != "develop"'
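### Hidden job templates. '.lake' jobs run on the 'datalake'-tagged runner, working
### from the MaacDo checkout in ${REGISTRY}/bin; '.python' jobs run in a python:3.6.8
### image and reuse a per-branch pip cache and virtualenv.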
.lake:
tags:
- datalake
before_script:
- cd ${REGISTRY}/bin && pwd
after_script:
- pwd
.python:
tags:
- ci
image: python:3.6.8
before_script:
- source venv/bin/activate
cache:
key: $CI_COMMIT_REF_SLUG
paths:
- .cache/pip
- venv/
########################
# PIPELINE STARTS HERE #
########################
stages:
- .pre
- build
- test
- deploy
- .post
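### Preparation (.pre): prune old pipeline directories, build the virtualenv, create
### this pipeline's registry and fetch the MaacDo client and its configuration.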
remove_older_working_directories:
stage: .pre
tags:
- datalake
script:
- echo "Cleaning older CI pipeline data. Keeping only the 10 most recent directories in ${REGISTRY_ROOT}."
    - ls -1QAtd ${REGISTRY_ROOT}/* | tail -n +11 | xargs rm -rf
prepare_python_venv:
stage: .pre
extends: .python
before_script: null
script:
- python -V # Print out python version for debugging
- pip install -U pip
- pip install virtualenv==20.16.2
- virtualenv venv
prepare_new_working_directory:
stage: .pre
tags:
- datalake
script:
- echo "Preparing runners shared space & configuration in ${REGISTRY}."
- "[[ -d ${REGISTRY} ]] && rm -rf ${REGISTRY}"
- mkdir -p ${REGISTRY}
- mkdir -p ${REGISTRY}/trace
- mkdir -p ${REGISTRY}/log
- mkdir -p ${REGISTRY}/bin
- mkdir -p ${REGISTRY}/results
- echo "Preparing spark configuration inside ${REGISTRY}/spark_config.json (will be set to 'null' if no prediction should be run afterwards)."
- jq --arg REF "${DEPLOYMENT_TYPE}" '.[$REF]' ${CI_PROJECT_DIR}/.ci/datalake/spark_config.json | envsubst > ${REGISTRY}/spark_config.json
- cat ${REGISTRY}/spark_config.json
- echo "Getting MaacDo version from default or ${CI_COMMIT_REF_SLUG} from .ci/datalake/maacdo_version.json "
- MAACDO_VERSION=$(jq --raw-output --arg BRANCH "${CI_COMMIT_REF_SLUG}" '.maacdo | .default + .[$BRANCH] | .version' ${CI_PROJECT_DIR}/.ci/datalake/maacdo_version.json)
- echo "${MAACDO_VERSION} > ${REGISTRY}/.maacdo_version"
- echo "Got MaacDo version as ${MAACDO_VERSION} "
- git clone --depth 1 --branch ${MAACDO_VERSION} https://${GIT_MAACDO_USERNAME}:${GIT_MAACDO_TOKEN}@forge.dgfip.finances.rie.gouv.fr/raphaelventura/maac-do.git ${REGISTRY}/bin
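# Render the MaacDo parameter file ci_job.par from the repository template, injecting
# the Spark configuration selected for this DEPLOYMENT_TYPE.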
prepare_maacdo_configuration:
stage: .pre
extends: .lake
needs:
- job: "prepare_new_working_directory"
script:
- SPARK_CONFIG=$(jq --compact-output '.sparkTask' ${REGISTRY}/spark_config.json)
    - >
      [[ -z ${SPARK_CONFIG} ]]
      && [[ ${DEPLOYMENT_TYPE} != "null" ]]
      && { echo "Error: no config for Spark task execution was provided."; exit 1; }
- export SPARK_CONFIG=${SPARK_CONFIG}
- envsubst < ${CI_PROJECT_DIR}/.ci/datalake/datalake_config_template.par > ${REGISTRY}/ci_job.par
- cat ${REGISTRY}/ci_job.par
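### Build: install the package (with its 'pack' extras) into the virtualenv, then
### archive the whole environment with venv-pack.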
install_package:
stage: build
extends: .python
needs:
- job: "prepare_new_working_directory"
- job: "prepare_python_venv"
script:
# Installing shap through wheel causes import failure when sent to the datalake servers.
- pip install -v .[pack] --no-binary=shap
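# venv-pack writes to /packages/${REGISTRY_KEY}/; /packages is presumably the docker
# runner's mount of ${REGISTRY_ROOT}, so the archive ends up in ${REGISTRY} for the
# .lake jobs (compare 'ls -al /packages' in print-python-env below).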
venv_pack:
stage: build
extends: .python
needs:
- job: "install_package"
script:
- venv-pack -o /packages/${REGISTRY_KEY}/${PACKAGE}
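# pylint only runs when python sources change; downstream jobs therefore list it as
# an optional need.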
pylint:
stage: test
extends: .python
script:
- pip install pylint
- pylint --rcfile=.pylintrc src/sf_datalake/
rules:
- changes:
- src/sf_datalake/**/*.py
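### Deploy: replace the packed environment and the entry point on the lake HDFS
### through the MaacDo API, then start and monitor the Spark job via Livy.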
send_package_to_lake:
stage: deploy
extends: .lake
needs:
- job: "venv_pack"
- job: "prepare_maacdo_configuration"
- job: "pylint"
optional: true
script:
- echo "Deleting old packages archive (if it exists) on the lake HDFS."
- ./api_maacdo.pl
-a SuppressionFichier
-i ${LAKE_PKG_TARGET}
-c ${REGISTRY}/ci_job.par ||
echo "Failed removing ${LAKE_PKG_TARGET}. Maybe this file doesn't exist."
- echo "Sending new python packages archive to ${LAKE_PKG_TARGET}."
- ./api_maacdo.pl
-a EnvoyerFichier
-l ${REGISTRY}/${PACKAGE}
-i ${LAKE_PKG_TARGET}
-c ${REGISTRY}/ci_job.par
send_entrypoint_to_lake:
stage: deploy
extends: .lake
needs:
- job: "prepare_maacdo_configuration"
- job: "pylint"
optional: true
- job: "send_package_to_lake"
script:
- echo "Deleting previous entry point (if it exists) on the lake HDFS."
- ./api_maacdo.pl
-a SuppressionFichier
-i ${LAKE_ENTRYPOINT}
-c ${REGISTRY}/ci_job.par ||
echo "Failed removing ${LAKE_ENTRYPOINT}. Maybe this file doesn't exist."
- SCRIPT="${CI_PROJECT_DIR}/src/sf_datalake/__main__.py"
- echo "Sending new python main script ${SCRIPT} to ${LAKE_ENTRYPOINT}."
- ./api_maacdo.pl
-a EnvoyerFichier
-l ${SCRIPT}
-i ${LAKE_ENTRYPOINT}
-c ${REGISTRY}/ci_job.par
rules:
- if: $DEPLOYMENT_TYPE != "null"
### Spark jobs handling
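# start_spark_task records the Livy id under ${REGISTRY}/idLivy; wait_for_task_finishing
# polls the task state every 60s and prints new driver logs; stop_spark_task can be
# triggered manually to kill a task that is still starting or running.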
start_spark_task:
stage: deploy
extends: .lake
needs:
- job: "send_entrypoint_to_lake"
- job: "send_package_to_lake"
script:
- echo "Launching spark job, getting livy ID."
- 'idLivy=$(./api_maacdo.pl -a DemarrageTacheSpark -c ${REGISTRY}/ci_job.par | grep ID | cut -d: -f2)'
- echo "Spark task has started. Livy id is [$idLivy]."
- echo ${idLivy} | xargs > ${REGISTRY}/idLivy
- echo "To get yarns logs even in success case, execute the following from the server running the gitlab runner process:"
- echo "--> ./api_maacdo.pl -a YarnLogsTacheSpark -j ${idLivy} -c ${REGISTRY}/ci_job.par"
- echo "To stop the spark task, execute the following from the server running the gitlab runner process:"
- echo "--> ./api_maacdo.pl -a ArretTacheSpark -j ${idLivy} -c ${REGISTRY}/ci_job.par"
rules:
- if: $DEPLOYMENT_TYPE != "null"
wait_for_task_finishing:
stage: deploy
extends: .lake
needs:
- job: "start_spark_task"
script:
- export ID_LIVY=$(cat ${REGISTRY}/idLivy)
- |
LAST=""
while true; do
TACHE_SPARK=$(./api_maacdo.pl -a EtatTacheSpark -j ${ID_LIVY} -c ${REGISTRY}/ci_job.par)
STATUS=$(echo ${TACHE_SPARK} | awk '{print $NF}' | tr -dc '[:alnum:]\r\n' | tr '[:lower:]' '[:upper:]')
echo "${STATUS}" > ${REGISTRY}/status_${ID_LIVY}
echo "Status of task ${ID_LIVY} is '${STATUS}'"
[[ ${STATUS} == 'DEAD' ]] && exit 111
[[ ${STATUS} == 'STARTING' || ${STATUS} == 'RUNNING' ]] || break
LOGS=$(./api_maacdo.pl -a LogsTacheSpark -j ${ID_LIVY} -c ${REGISTRY}/ci_job.par)
CURRENT=$(printf '%s' "${LOGS}" | md5sum)
if [[ ${LAST} == ${CURRENT} ]]
then
echo "No fresh logs, waiting for another 60s."
else
echo "New available logs:"
echo ${LOGS}
LAST=${CURRENT}
echo "Waiting for another 60s."
fi
sleep 60
done
rules:
- if: $DEPLOYMENT_TYPE != "null"
stop_spark_task:
stage: deploy
extends: .lake
needs:
- job: "start_spark_task"
script:
- export ID_LIVY=$(cat ${REGISTRY}/idLivy)
- export STATUS=$(cat ${REGISTRY}/status_${ID_LIVY})
- echo "Task ${ID_LIVY} has status ${STATUS}."
- |
if [[ ${STATUS} == 'STARTING' || ${STATUS} == 'RUNNING' ]]
then
./api_maacdo.pl -a ArretTacheSpark -j ${ID_LIVY} -c ${REGISTRY}/ci_job.par
else
echo "Nothing to do."
fi
rules:
- if: $DEPLOYMENT_TYPE != "null"
when: manual
allow_failure: true
### Post-processing
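# 'pages' builds the Sphinx documentation; the .post jobs fetch YARN logs on failure
# and retrieve prediction outputs from HDFS.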
pages:
stage: deploy
extends: .python
needs:
- job: "prepare_python_venv"
script:
- pip install sphinx sphinx-rtd-theme
- cd docs/
- sphinx-apidoc -f -o source/ ../src/
- make html
- mv build/html/ ../public/
when: manual
# artifacts:
# paths:
# - public
fetch_yarn_logs:
stage: .post
extends: .lake
needs:
- job: "wait_for_task_finishing"
script:
- export ID_LIVY=$(cat ${REGISTRY}/idLivy)
- echo "Fetching YARN logs after job failure."
- ./api_maacdo.pl -a YarnLogsTacheSpark -j ${ID_LIVY} -c ${REGISTRY}/ci_job.par
- find ${REGISTRY}/log/ -type f -name 'YarnLogsTacheSpark_*' -execdir cat '{}' + | grep -v 'OnOutOfMemoryError' | grep 'Error' -B 20
rules:
- if: $DEPLOYMENT_TYPE != "null"
when: on_failure
fetch_run_outputs_from_HDFS:
stage: .post
extends: .lake
needs:
- job: "wait_for_task_finishing"
script:
- export ID_LIVY=$(cat ${REGISTRY}/idLivy)
- echo "Parsing ${REGISTRY}/spark_config.json looking for '--prediction_path' argument."
- cat ${REGISTRY}/spark_config.json
    - > # take 'sparkTask.args', pair consecutive items into {"--param": "value"} objects, merge them with 'add', then select the "--prediction_path" value
OUTPUT_DIR=$(jq --raw-output '[.sparkTask.args as $m | range(0; $m | length; 2) | {($m[.]): $m[(. + 1)]} ] | add | .["--prediction_path"]' ${REGISTRY}/spark_config.json)
- echo "Found argument '--prediction_path'=${OUTPUT_DIR}"
- bash fetchFolder.sh ${LAKE_HOME}/${OUTPUT_DIR} ${REGISTRY}/results ${REGISTRY}/ci_job.par
- echo "Gathering fetched outputs into ${REGISTRY}/results.zip"
- zip -j ${REGISTRY}/results.zip ${REGISTRY}/results/*
- rm -r ${REGISTRY}/results/
- echo "Removing remote results in ${OUTPUT_DIR}"
- bash rmRecursiveHdfs.sh ${LAKE_HOME}/${OUTPUT_DIR} ${REGISTRY}/ci_job.par
rules:
- if: $DEPLOYMENT_TYPE != "null"
when: always
######################
# PIPELINE ENDS HERE #
######################
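# Manual debugging helpers: dump the environment and registry state seen by each runner.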
print-python-env:
stage: .pre
extends: .python
script:
- env | sort
- pwd
- whoami
- ls -al /packages
when: manual
print-shell-env:
stage: .pre
extends: .lake
script:
- env | sort
- whoami
- pwd
- echo "ID_LIVY = $(cat ${REGISTRY}/idLivy)"
- echo "ID_LIVY status = $(cat ${REGISTRY}/status_${ID_LIVY})"
- echo "Spark config:"
- cat ${REGISTRY}/spark_config.json
- echo "MaacDo version = $(cat ${REGISTRY}/.maacdo_version"
- echo "MaacDo configuration:"
- cat ${REGISTRY}/ci_job.par
when: manual