forked from MichaelChirico/r-mailing-list-archive
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate_archives.R
executable file
·63 lines (57 loc) · 1.76 KB
/
update_archives.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/local/bin/Rscript
library(xml2)
# Root URL under which each mailing list has its own pipermail directory
URL_BASE = 'https://stat.ethz.ch/pipermail'

# POSIXlt exposes the `year` (years since 1900) and `mon` (0-11) fields
#   needed to build month- and quarter-based archive names
today = as.POSIXlt(Sys.time())
current_year = today$year + 1900L
# Use month.name, not format(, '%B'): '%B' follows the session locale, while
#   these names are compared to archive file names, which presumably always
#   use English month names (e.g. '2024-January.txt')
current_month_file = sprintf(
  '%d-%s.txt', current_year, month.name[today$mon + 1L]
)
current_quarter_file = sprintf(
  '%dq%d.txt', current_year, today$mon %/% 3L + 1L
)
# Must carry the '.txt' suffix like the other periods: `current` is matched
#   against local file names, which are the remote names minus '.gz'
current_year_file = sprintf('%d.txt', current_year)

# One entry per mailing list:
#   name    - pipermail directory of the list, also used as local output dir
#   current - local file name of the still-growing archive period, which is
#             re-downloaded on every run
mailing_lists = list(
  c(name = 'r-devel', current = current_month_file),
  c(name = 'r-package-devel', current = current_quarter_file),
  c(name = 'r-sig-mac', current = current_month_file),
  c(name = 'r-help', current = current_month_file),
  c(name = 'r-announce', current = current_year_file)
)
# Download one gzipped archive and store its decompressed text in `outdir`.
#   zip_url: full URL of the '.gz' archive to fetch
#   outdir:  existing directory receiving the decompressed file, named like
#            the remote file without its trailing '.gz'
fetch_archive = function(zip_url, outdir) {
  zip_tmp <- tempfile()
  on.exit(unlink(zip_tmp), add = TRUE)
  # the payload is gzip-compressed, so request a binary transfer explicitly
  download.file(zip_url, zip_tmp, mode = 'wb')

  zip_conn <- gzfile(zip_tmp)
  # after = FALSE: close the connection before the temp file is removed
  on.exit(close(zip_conn), add = TRUE, after = FALSE)

  # strip only the trailing extension, not every '.gz' inside the name
  outfile <- sub('\\.gz$', '', basename(zip_url))
  writeLines(readLines(zip_conn), file.path(outdir, outfile))
}

# For every configured mailing list: find the archives linked from its index
#   page, skip those already stored locally (except the current period, which
#   is always refreshed), then download and decompress the rest.
for (this_list in mailing_lists) {
  outdir = this_list[['name']]
  URL = file.path(URL_BASE, outdir)
  dir.create(outdir, recursive = TRUE, showWarnings = FALSE)

  # Remote names of archives we already hold. The current period is left out
  #   of this set on purpose so that it is always re-written.
  extant_gz = outdir |>
    list.files() |>
    setdiff(this_list[['current']]) |>
    paste0('.gz')

  # Archives are linked under "Gzip'd Text NNN KB" on the index page
  hrefs = URL |>
    read_html() |>
    xml_find_all('//a[contains(text(), "Gzip")]') |>
    xml_attr('href')

  # only download new archives; sorting should be unnecessary but keeps the
  #   progress message below meaningful
  zips = sort(file.path(URL, setdiff(hrefs, extant_gz)))

  if (length(zips) > 0L) {
    message(sprintf(
      'Acquiring %d %s archives: %s - %s',
      length(zips), outdir, basename(head(zips, 1L)), basename(tail(zips, 1L))
    ))
  } else {
    message('No new ', outdir, ' archives to acquire')
  }

  for (zip in zips) {
    fetch_archive(zip, outdir)
  }
}