run.sh.template

#!/bin/bash
#======================================================
#
# Ricostruzione ambiente per la creazione del WARC delle tesi di dottorato
# Autore:   Argentino Trombin
# Data:     12/07/2019
# NOTA:     Ricostruito da vari pezzi non documentati di Raffaele Messuti trovati
#           sul server md-front02.bncf.lan (192.168.7.151)
#           cartelle:   /home/rmessuti
#                       /mnt/volume2/in-progress
#           https://github.com/depositolegale/ojs-archive

# Per fare test della wayback machine in ambiente di collaudo/esercizio(.151)
#   sudo service supervisor stop
#   sudo service supervisor start
#   sudo service supervisor restart
#
# Per fare test della wayback machine in ambiente di collaudo/esercizio(.83)
#   /mnt/volume2/wayback/venv/bin/uwsgi --plugin python --ini /mnt/volume2/wayback/memoria.depositolegale.it/uwsgi-wayback.ini
#   /mnt/volume2/wayback/venv/bin/uwsgi --plugin python --ini /mnt/volume2/wayback/index.depositolegale.it/uwsgi-cdx-server.ini
#
# Per fare test della wayback machine in ambiente locale
#   cd /home/argentino/magazzini_digitali/wayback
#   wayback   (will run on port 8080)

#======================================================
# Prendi l'ambiente dell'utente
#. ~.bash_profile


# Configure Error Traps
# ---------------------
trap exit_trap EXIT
# EXIT handler: prints a short banner with the status the script is exiting
# with, then terminates with that same status.
exit_trap() {
    # Must be the first statement: $? still holds the status being exited with.
    local status=$?
    # Switch command tracing off before printing the banner.
    set +x

    printf '%s\n' \
        "===================" \
        "Exit from exit_trap" \
        "exit code: $status" \
        "==================="

    # Do some clean up if required

    exit "$status"
}

# Exit on any errors so that errors don't compound
trap err_trap ERR
# ERR handler: reports the failing script name and status, then exits with
# that status.
err_trap() {
    # Must be the first statement: $? still holds the failing command's status.
    local status=$?
    # Switch command tracing off before printing the banner.
    set +x
    # Disabled alternative message that also points at a log file:
    # if [[ -n "$LOGFILE" ]]; then
    #     echo "${0##*/} failed: full log in $LOGFILE"
    # else
    #     echo "${0##*/} failed"
    # fi

    # ${0##*/} is the script name with any leading directory stripped.
    printf '%s\n' \
        "===================" \
        "Exit from err_trap" \
        "${0##*/} failed" \
        "exit code: $status" \
        "==================="

    # Do some clean up if required

    exit "$status"
}

# Begin trapping error exit codes

# to make your script exit when a command fails.
# set -o errexit

# to exit when your script tries to use undeclared variables.
# set -o nounset TROPPI PROBLEMI 


# Load the harvest function library. The helpers called further down (timer,
# check_arguments, init_variables, ...) are not defined in this file and are
# presumably provided by it.
# NOTE(review): the path is relative, so the script apparently has to be
# started from the directory that contains scripts/ -- confirm.
source scripts/run_main.sh


#======================================
# Start execution
# ./run.sh
# Any missing required subdirectory will be automatically generated
#======================================
# Abort when HARVEST_DIR is unset or empty: an argument-less `cd` would
# silently switch to $HOME and the harvest would run in the wrong directory.
# The expansion is quoted so that a path containing spaces is passed to cd as
# a single argument.
cd -- "${HARVEST_DIR:?HARVEST_DIR is not set}"
# Timestamp command, stored as a string and executed later via $(${ora}).
ora="date +%Y-%m-%d-%H:%M:%S"
# Start marker, handed back to timer at the end of the run to report the
# elapsed time.
tmr=$(timer)

# Start-up banner: procedure title, script version and start timestamp.
printf '%s\n' \
    "====================================================" \
    " Procedura per l'acquisizione delle tesi di " \
    " dottorato o riviste (eJournals)"

# Version history (only the most recent entry is printed):
#   2019_11_08.1  added check_for_bad_seeds_lookup
#   2019_11_18.2  new handling to manage disk space
#   2019_11_27.1  removal of duplicate seeds (see seeds_dup.csv)
#   2019_11_28.1  receipts handled in a single Excel sheet
#                 (ok, [ko, no didl resource, missing, duplicate urls]);
#                 new report handling in Excel format
#   2019_12_03.1  handling of WARC errors filtered by seed to download
#   2019_12_08.1  get_warcked_seeds_and_not_from_logs (unified the retrieval
#                 of saved URLs and of URLs that ended in error)
#   2019_12_09.1  reworked index_warcs to handle multiple volumes through
#                 symbolic links; index cleanup via lists of seeds to download
#   2020_23_01.1  generation of the list of theses in the receipt
#   2021_01_25.1  experiments with wget recursive
#   2021_06_30.1  exit handling with trap
printf '%s\n' " Versione 2021_06_30.1"


# ${ora} is deliberately unquoted inside the substitution: it holds a command
# line that has to be word-split before it is executed.
printf ' %s Inizio processo di run.sh\n' "$(${ora})"
printf '%s\n' "==================================================="


# -------------
# PRELIMINARIES
# -------------
# Preliminary steps; the helpers are defined outside this file.
# The script's own command-line arguments are forwarded unchanged.
check_arguments "$@"
init_variables

# Run the software update only when AMBIENTE_HARVEST_VM is set.
# `-v` expects a variable NAME. The previous form passed the variable's VALUE
# ("${AMBIENTE_HARVEST_VM}"), so it tested whether a variable named after that
# value existed, which is effectively never true.
# NOTE(review): update_sw will now actually run wherever the variable is set
# (even to an empty string) -- confirm this is the intended trigger.
if [[ -v AMBIENTE_HARVEST_VM ]]; then
	update_sw
fi
print_configuration


# ----
# MAIN
# ----


# -------------
# ONCE AT THE START OF A HARVEST SESSION
# -------------
# To execute if in need to recreate the .csv files for harvesting (./csv/etd.csv and ./csv/e_journals.csv )
#
# NMB: scarica le tabelle in ./csv con estensione ".out"
#
# scripts/csvDownload.sh $materiale  $MATERIALE_TESI $MATERIALE_EJOURNAL


# harvest_metadata

# createSeeds

# create_warcs_concurrently

# if [ $materiale == $MATERIALE_EJOURNAL ]; then
#     change_pdf_viewer_url
# fi

# create_warcs_md5

# check_pdf_download

# copy_warcs_and_logs_to_destination_dir_and_remove

# check_harvest

# index_warcs

# compress_indexes

# make_receipts


# - Mettere nei file accessori unimarc anche la lista dei bid che devono essere presenti in opac
# createUnimarc


# IN STAND BY elabora_doppioni_tesi

# store_in_S3
# temp_warcs_backup


# check_for_receipts_mismatch

# if [ $materiale == $MATERIALE_TESI ]; then
#     # find_embargoed
#     # scripts/DbUpdateInsertEmbargoed.sh    # Carichiamo le tesi sotto embargo sul db. Cancellazione da fare con script a parte. scripts/DbDeleteUnembargoed.sh
# fi


# make_report

# ===================================
# Aggiornamento last harvest date

# scripts/DbDownload.sh 	# Da schema harvest prendi dati dell'anagrafe e date (in ./csv)
# generate_harvest_dates_from_metadata_logs
# scripts/DbUpdateInsertHarvestDate.sh 	# Carichiamo range di date per l'ultimo harvest

# ===================================


# ===================================
# 07/12/2020
# Archiviazione documenti in Magazzini Digitali
# ===================================
# prepare_docs_for_MD


# ---------------
# TESTING
# ---------------

# generate_nbn_identifiers_IN_STAND_BY

# ---------------
# END TESTING
# ---------------


# Closing report: elapsed time, end timestamp and the end-of-run banner.
echo
# timer is defined outside this file; it receives the start marker stored in
# tmr. $tmr and ${ora} stay unquoted so they expand exactly as before.
printf 'Elapsed time %s\n' "$(timer $tmr)"
echo $(${ora}) "Fine processo di run.sh"
echo
printf '%s\n' \
    "=================" \
    "FINE ELABORAZIONE" \
    "================="

# Start Stop wayback in esercizio
#   sudo service supervisor stop
#   sudo service supervisor start

# Per cercare in memoria senza dover digitare ma copincollando
# http://memoria.depositolegale.it/*/URL
# http://memoria.depositolegale.it/*/http://amsdottorato.unibo.it/480/    fino 2018
# http://memoria.depositolegale.it/*/http://amsdottorato.unibo.it/2206/   2019 (trovato 2015) nell'harvest del 05/11/19 questa url non e' stata scaricata


# =====================
# OBSOLETE 
# 
# prepare_wget_sites_list
# 
# 22/12/2020
# Procedura da farsi una tantum per splittare warc.gz troppo grandi da caricare unzippati su S3
# Warc coinvolti solo alcuni dell'harvest del 05/08/2020
# split_warcs

# =====================
# NO LONGER WANTED
# 
# Procedura da farsi una tantum per generare numero NBN delle tesi archiviate in memoria (storico)
# generate_archived_thesis_nbn


# =====================
# PROCEDURE UNA TANTUM 
# 
# Solo se non abbiamo generato gli md5 precedentemente!!!
# create_dest_warcs_md5
# 
# Controlliamo se tesi archiviate in warc
# find_archived_thesis
# 
# Procedura da farsi una tantum per recuperare il pregresso (storico) da DB MySql di BNCF
# create_unimarc_from_dublin_core


# =====================
# TODO
# 
# 19/01/2022
# TO DO export DB
#     - manca la 005
#     - in 100 data1 e data2  ha i trattini che devono diventare ' '