-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_data2.py
82 lines (56 loc) · 2.16 KB
/
get_data2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import boto3, configparser, os, botocore
from bs4 import BeautifulSoup
# Module-level handle to the boto3 S3 resource. It stays None until setup()
# assigns it; download_file() reads it to reach the S3 client.
s3resource = None
def setup():
    """Create the S3 resource used for downloads and store it globally.

    Reads ``ACCESS_KEY`` and ``SECRET_KEY`` from the ``DEFAULT`` section of
    ``config.ini`` (resolved against the current working directory) and
    assigns the resulting boto3 S3 resource to the module-level
    ``s3resource``.

    Raises
    ------
    KeyError
        If ``config.ini`` is absent or does not define both keys.
        ``ConfigParser.read`` skips missing files without complaint, so the
        key lookup is where the failure surfaces.
    """
    global s3resource

    # ConfigParser replaces SafeConfigParser, a deprecated alias that was
    # removed in Python 3.12.
    configs = configparser.ConfigParser()
    configs.read('config.ini')
    credentials = configs['DEFAULT']

    s3resource = boto3.resource(
        's3',  # the AWS service we want to use
        aws_access_key_id=credentials['ACCESS_KEY'],
        aws_secret_access_key=credentials['SECRET_KEY'],
        region_name='us-east-1',
    )
def download_file(key):
    """
    Download one object from the ``arxiv`` S3 bucket to a local file.

    The object is saved under a local path identical to its key, relative
    to the current working directory. The request is made with
    ``RequestPayer='requester'``. ``setup()`` must have been called first
    so that the module-level ``s3resource`` is populated.

    Parameters
    ----------
    key : str
        Key of the object in the bucket; also used as the local
        destination path.

    Raises
    ------
    botocore.exceptions.ClientError
        For any S3 failure other than a missing key. A 404 is reported on
        stdout and the function returns normally.
    """
    # Ensure src directory exists (exist_ok avoids a check-then-create race)
    os.makedirs('src', exist_ok=True)

    # The file is written to `key`, so its own parent directory must exist
    # too when the key lives outside 'src'.
    parent_dir = os.path.dirname(key)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Download file
    print('\nDownloading s3://arxiv/{} to {}...'.format(key, key))
    try:
        s3resource.meta.client.download_file(
            Bucket='arxiv',
            Key=key,  # name of key to download from
            Filename=key,  # path to file to download to
            ExtraArgs={'RequestPayer': 'requester'})
    except botocore.exceptions.ClientError as e:
        # Only a missing key is an expected outcome; anything else (access
        # denied, throttling, ...) must not be silently discarded.
        if e.response['Error']['Code'] != "404":
            raise
        print('ERROR: ' + key + " does not exist in arxiv bucket")
def explore_metadata():
    """Print summary information from the downloaded arxiv manifest.

    Parses ``src/arXiv_src_manifest.xml`` and prints the manifest's
    top-level ``timestamp`` value followed by the number of ``file``
    elements it contains.

    Raises
    ------
    FileNotFoundError
        If the manifest has not been downloaded to ``src/`` yet.
    """
    # Parse inside the context manager so the file handle is always closed,
    # even if parsing fails.
    with open('src/arXiv_src_manifest.xml', 'r') as manifest:
        soup = BeautifulSoup(manifest, 'xml')

    # Print last time the manifest was edited; recursive=False restricts the
    # search to direct children of the root element.
    timestamp = soup.arXivSRC.find('timestamp', recursive=False).string
    print("Manifest was last edited on " + timestamp)

    # Print number of files in bucket
    num_files = len(soup.find_all('file'))
    print("arxiv bucket contains " + str(num_files) + " .tar files")
if __name__ == '__main__':
    # Entry point: executed only when the script is run from the command
    # line, not when it is imported.

    # Step 1: build the S3 resource from the private config file.
    setup()

    # Step 2 (currently disabled): fetch the manifest into src/.
    # download_file('src/arXiv_src_manifest.xml')

    # Step 3: summarise the metadata found in the manifest file.
    explore_metadata()