-
Notifications
You must be signed in to change notification settings - Fork 0
/
oai-harvester.sh
executable file
·89 lines (72 loc) · 1.81 KB
/
oai-harvester.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/bin/bash
#@file
# Harvests from $url, saves xml results to directory.
######## Vars ########
# Harvest endpoint. The first request uses it as-is, and every follow-up
# request url is built by appending a resumption token to it.
base_url=''

# Counter used to number the saved files (1.xml, 2.xml, ...).
file_i=1

# Upper bound on the number of harvests; 0 means no maximum.
max_harvests=0

# Directory the xml files are written to:
# <current dir>/results/<YYYY-MM-DD>H<HH>
target_dir="$(pwd)/results/$(date '+%Y-%m-%dH%H')"

# Regex whose first capture group is the text inside <resumptionToken>.
res_token_regex='<resumptionToken[^>]*>(.*)<\/resumptionToken>'

# Extra arguments handed to every curl call.
curl_args=(
  --fail
  # -m 300 # Maximum number of seconds before giving up on a request.
)
######## Initial Checks ########
# Refuse to run when the results directory for this time slot is already there.
# The path is quoted so it is always tested as a single word: unquoted, a path
# containing spaces makes `[ -d ... ]` error out (guard skipped), and an empty
# value collapses to `[ -d ]`, which is true (script aborts for no reason).
if [[ -d "${target_dir}" ]]; then
  # printf keeps the path literal; `echo -e` would expand backslashes in it.
  printf '%s already exists somehow. How often are you running this?\n' "${target_dir}"
  exit 1
fi
######## Functions ########
##
# @function
# Extracts the resumption token from a harvest response.
#
# Matches the response against the global res_token_regex and prints the
# regex's first capture group. Prints an empty line when nothing matches.
#
# @param {str} 1
# The xml contents of the oai_pmh harvest.
get_res_token() {
  local xml="${1}"
  local token=""

  # The regex variable is deliberately unquoted: quoting the right-hand side
  # of =~ would turn it into a literal string comparison.
  if [[ "${xml}" =~ ${res_token_regex} ]]; then
    token="${BASH_REMATCH[1]}"
  fi

  # printf instead of echo: echo treats a token such as "-n" or "-e" as one of
  # its own options and prints nothing for it.
  printf '%s\n' "${token}"
}
##
# @function
# Announces the url being harvested, then downloads it into a file with curl.
# The global curl_args array is passed to curl on every call.
#
# @param {str} 1
# The url to harvest.
# @param {str} 2
# The file location to save the xml to.
#
# @return
# The exit status of curl.
curl_server() {
  local harvest_url="${1}"
  local xml_path="${2}"

  echo -e "\n\nHarvesting: ${harvest_url}\n"

  # curl is the last command, so its exit status becomes the function's.
  curl "${harvest_url}" "${curl_args[@]}" -o "${xml_path}"
}
##
# @function
# Performs the complete harvest, including handling resumption tokens.
#
# Each response is saved as ${target_dir}/${file_i}.xml. While a response
# carries a resumption token, the global file_i is incremented and the next
# page is requested from "${base_url}&resumptionToken=<token>". Harvesting
# stops when no token is returned or when file_i reaches max_harvests.
# Exits the script with status 2 if a download fails.
#
# NOTE(review): the token is appended to base_url without url-encoding, and
# OAI-PMH describes resumptionToken as an exclusive argument - confirm that
# the target repository accepts the urls built here.
#
# @param {str} 1
# The url to harvest.
oai_harvest() {
  local url="${1}"
  local filename
  local xml
  local res_token

  # A loop rather than self-recursion, so a harvest with many pages does not
  # keep growing the call stack.
  while true; do
    filename="${target_dir}/${file_i}.xml"

    if ! curl_server "${url}" "${filename}"; then
      echo -e "\nHarvest failed. Exiting"
      exit 2
    fi

    # Quoted read: an unquoted path would be word-split when the results
    # directory contains whitespace, and the token would never be found.
    xml="$(<"${filename}")"
    res_token="$(get_res_token "${xml}")"

    if [ "${file_i}" -eq "${max_harvests}" ]; then
      echo -e "\nHit max harvest limit of ${max_harvests}. Quitting."
      return 0
    fi
    if [ -z "${res_token}" ]; then
      echo -e "\nNo resumption token. End of harvesting."
      return 0
    fi

    url="${base_url}&resumptionToken=${res_token}"
    ((file_i=file_i+1))
  done
}
######## Procedure ########
# Create the results directory before harvesting. -p also creates a missing
# parent directory, and the quotes keep a path containing spaces as a single
# argument. Stop here if the directory cannot be created, since every download
# would fail to write its file anyway.
if ! mkdir -p -- "${target_dir}"; then
  printf '\nCould not create %s. Exiting\n' "${target_dir}" >&2
  exit 3
fi
oai_harvest "${base_url}"