diff --git a/.gitignore b/.gitignore index 84f32d26566..0f83ae23f86 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ ehthumbs.db Thumbs.db *.swp *~ +.ipynb_checkpoints # R ####################### @@ -202,3 +203,4 @@ ssl/ # pyenv .python-version + diff --git a/CHANGELOG b/CHANGELOG index da406c46bf7..f0b26e1e32d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,11 @@ We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO. +19.31.0 (2019-11-7) +=================== +- EGAP: Parse project structure, add contributors, add files, ingest the draft registration, and add a Jupyter notebook +- Modify a Chronos field for proper contributor classification + 19.30.0 (2019-10-16) =================== - Fix weirdness around deleted nodes by not deleing OSF Storage diff --git a/docker-compose.yml b/docker-compose.yml index 62d0afb345d..05a02c16c59 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -383,6 +383,7 @@ services: restart: unless-stopped environment: DJANGO_SETTINGS_MODULE: api.base.settings + LANG: en_US.UTF-8 volumes: - ./:/code:cached - osf_requirements_vol:/usr/lib/python2.7 @@ -396,6 +397,7 @@ services: restart: unless-stopped environment: DJANGO_SETTINGS_MODULE: admin.base.settings + LANG: en_US.UTF-8 volumes: - ./:/code:cached - osf_requirements_vol:/usr/lib/python2.7 diff --git a/egap_assets.zip b/egap_assets.zip new file mode 100644 index 00000000000..85b13477ac2 Binary files /dev/null and b/egap_assets.zip differ diff --git a/osf/external/chronos.py b/osf/external/chronos.py index caf48790d23..64f19f7b51e 100644 --- a/osf/external/chronos.py +++ b/osf/external/chronos.py @@ -105,7 +105,7 @@ def serialize_author(cls, contributor): if contributor._order == 0: contribution = 'firstAuthor' else: - contribution = 'submittingAuthor' + contribution = 'Author' ret.update({ 'CONTRIBUTION': contribution, 'ORGANIZATION': '', diff --git a/osf/management/commands/import_EGAP.py b/osf/management/commands/import_EGAP.py new file mode 
100644
index 00000000000..d300c7789f3
--- /dev/null
+++ b/osf/management/commands/import_EGAP.py
@@ -0,0 +1,220 @@
+# -*- coding: utf-8 -*-
+import logging
+
+import os
+import json
+import shutil
+import requests
+import tempfile
+from django.core.management.base import BaseCommand
+from osf.utils.permissions import WRITE
+from osf.models import (
+    RegistrationSchema,
+    Node,
+    DraftRegistration,
+    OSFUser
+)
+from website.project.metadata.schemas import ensure_schema_structure, from_json
+from website.settings import WATERBUTLER_INTERNAL_URL
+from osf_tests.factories import ApiOAuth2PersonalTokenFactory
+from framework.auth.core import Auth
+from zipfile import ZipFile
+
+logger = logging.getLogger(__name__)
+HERE = os.path.dirname(os.path.abspath(__file__))
+
+
+class EGAPUploadException(Exception):
+    pass
+
+
+def ensure_egap_schema():
+    """Create or update the EGAP registration schema from its JSON file and return it."""
+    schema = ensure_schema_structure(from_json('egap-registration.json'))
+    schema_obj, created = RegistrationSchema.objects.update_or_create(
+        name=schema['name'],
+        schema_version=schema.get('version', 1),
+        defaults={
+            'schema': schema,
+        }
+    )
+    if created:
+        schema_obj.save()
+    return RegistrationSchema.objects.get(name='EGAP Registration')
+
+
+def get_creator_auth_header(creator_username):
+    """Return the creator OSFUser and a bearer-token Authorization header issued for them."""
+    creator = OSFUser.objects.get(username=creator_username)
+    token = ApiOAuth2PersonalTokenFactory(owner=creator)
+    token.save()
+    return creator, {'Authorization': 'Bearer {}'.format(token.token_id)}
+
+
+def create_node_from_project_json(egap_assets_path, epag_project_dir, creator):
+    """Build a Node from the project.json of one project directory, adding its contributors with WRITE."""
+    with open(os.path.join(egap_assets_path, epag_project_dir, 'project.json'), 'r') as fp:
+        project_data = json.load(fp)
+        title = project_data['title']
+        node = Node(title=title, creator=creator)
+        node.save()  # must save before adding contribs for auth reasons
+
+        for contributor in project_data['contributors']:
+            node.add_contributor_registered_or_not(
+                Auth(creator),
+                full_name=contributor['name'],
+                email=contributor['email'],
+                permissions=WRITE,
+                send_email='false'
+            )
+
+        node.set_visible(creator, visible=False, log=False, save=True)
+
+    return node
+
+
+def recursive_upload(auth, node, dir_path, parent='', metadata=None):
+    """Upload the contents of dir_path to the node's osfstorage through WaterButler.
+
+    Sub-directories are created as folders and uploaded recursively. A 409
+    response is skipped as already uploaded; any other non-201 response is
+    logged and the whole directory is attempted again.
+
+    :param auth: request headers carrying the Authorization token
+    :param node: node whose ``_id`` identifies the WaterButler resource
+    :param dir_path: local directory to upload
+    :param parent: WaterButler path of the destination folder ('' for the root)
+    :param metadata: list collecting the JSON of every 201 response; a new
+        list is created when omitted
+    :return: the metadata list
+    """
+    # A ``list()`` default is created once and shared by every call, which
+    # leaked one upload's metadata into the next; build it per call instead.
+    if metadata is None:
+        metadata = []
+    try:
+        for item in os.listdir(dir_path):
+            item_path = os.path.join(dir_path, item)
+            base_url = '{}/v1/resources/{}/providers/osfstorage/{}'.format(WATERBUTLER_INTERNAL_URL, node._id, parent)
+            if os.path.isfile(item_path):
+                with open(item_path, 'rb') as fp:
+                    url = base_url + '?name={}&kind=file'.format(item)
+                    resp = requests.put(url, data=fp.read(), headers=auth)
+            else:
+                url = base_url + '?name={}&kind=folder'.format(item)
+                resp = requests.put(url, headers=auth)
+                # NOTE(review): the folder response is parsed before its status is
+                # checked below; presumably a non-201 body lacks 'data' - confirm.
+                metadata = recursive_upload(auth, node, item_path, parent=resp.json()['data']['attributes']['path'], metadata=metadata)
+
+            if resp.status_code == 409:  # if we retry something already uploaded just skip.
+                continue
+
+            if resp.status_code != 201:
+                raise EGAPUploadException('Error waterbutler response is {}, with {}'.format(resp.status_code, resp.content))
+
+            metadata.append(resp.json())
+    except EGAPUploadException as e:
+        logger.info(str(e))
+        metadata = recursive_upload(auth, node, dir_path, parent=parent, metadata=metadata)
+
+    return metadata
+
+
+def get_egap_assets(guid, creator_auth):
+    """Download the first file of node ``guid`` and unzip it into a new temp directory.
+
+    :param guid: guid of the node whose first file is the EGAP assets zip
+    :param creator_auth: request headers carrying the Authorization token
+    :return: path of the temp directory, which this function does not remove
+    """
+    node = Node.load(guid)
+    zip_file = node.files.first()
+    temp_path = tempfile.mkdtemp()
+
+    url = '{}/v1/resources/{}/providers/osfstorage/{}'.format(WATERBUTLER_INTERNAL_URL, guid, zip_file._id)
+    zip_file = requests.get(url, headers=creator_auth).content
+
+    egap_assets_path = os.path.join(temp_path, 'egap_assets.zip')
+
+    # the response content is bytes, so the archive must be written in binary mode
+    with open(egap_assets_path, 'wb') as fp:
+        fp.write(zip_file)
+
+    with ZipFile(egap_assets_path, 'r') as zipObj:
+        zipObj.extractall(temp_path)
+
+    return temp_path
+
+
+def main(guid, creator_username):
+    """Import each project directory of the EGAP assets zip as a node with a draft registration."""
+    egap_schema = ensure_egap_schema()
+    creator, creator_auth = get_creator_auth_header(creator_username)
+
+    egap_assets_path = get_egap_assets(guid, creator_auth)
+
+    # __MACOSX is a hidden file created by the os when zipping
+    directory_list = [directory for directory in os.listdir(egap_assets_path) if directory not in ('egap_assets.zip', '__MACOSX')]
+
+    for epag_project_dir in directory_list:
+        node = create_node_from_project_json(egap_assets_path, epag_project_dir, creator=creator)
+
+        non_anon_files = os.path.join(egap_assets_path, epag_project_dir, 'data', 'nonanonymous')
+        non_anon_metadata = recursive_upload(creator_auth, node, non_anon_files)
+
+        anon_files = os.path.join(egap_assets_path, epag_project_dir, 'data', 'anonymous')
+        if os.path.isdir(anon_files):
+            anon_metadata = recursive_upload(creator_auth, node, anon_files)
+        else:
+            anon_metadata = []  # same type recursive_upload returns
+
+        with open(os.path.join(egap_assets_path, epag_project_dir, 'registration-schema.json'), 'r') as fp:
+            registration_metadata = json.load(fp)
+
+        # add selectedFileName Just so filenames are listed in the UI
+        for data in non_anon_metadata:
+            data['selectedFileName'] = data['data']['attributes']['name']
+
+        for data in anon_metadata:
+            data['selectedFileName'] = data['data']['attributes']['name']
+
+        non_anon_titles = ', '.join([data['data']['attributes']['name'] for data in non_anon_metadata])
+        registration_metadata['q37'] = {'comments': [], 'extra': non_anon_metadata, 'value': non_anon_titles}
+
+        anon_titles = ', '.join([data['data']['attributes']['name'] for data in anon_metadata])
+        registration_metadata['q38'] = {'comments': [], 'extra': anon_metadata, 'value': anon_titles}
+
+        DraftRegistration.create_from_node(
+            node,
+            user=creator,
+            schema=egap_schema,
+            data=registration_metadata,
+        )
+
+    shutil.rmtree(egap_assets_path)
+
+
+class Command(BaseCommand):
+    """Magically morphs csv data into lovable nodes with draft registrations attached
+    """
+
+    def add_arguments(self, parser):
+        super(Command, self).add_arguments(parser)
+        parser.add_argument(
+            '-c',
+            '--creator',
+            help='This should be the username of the initial adminstrator for the imported nodes',
+            required=True
+        )
+
parser.add_argument( + '-id', + '--guid', + help='This should be the guid of the private project with the directory structure', + required=True + ) + + def handle(self, *args, **options): + creator_username = options.get('creator', False) + guid = options.get('guid', False) + main(guid, creator_username) diff --git a/osf_tests/management_commands/test_EGAP_import.py b/osf_tests/management_commands/test_EGAP_import.py new file mode 100644 index 00000000000..c685722e277 --- /dev/null +++ b/osf_tests/management_commands/test_EGAP_import.py @@ -0,0 +1,210 @@ +# encoding: utf-8 +import os +import shutil +import pytest +import responses +HERE = os.path.dirname(os.path.abspath(__file__)) + +from osf_tests.factories import ( + AuthUserFactory, + NodeFactory, + ApiOAuth2PersonalTokenFactory +) +from osf.models import ( + RegistrationSchema, + ApiOAuth2PersonalToken +) +from osf.management.commands.import_EGAP import ( + get_egap_assets, + ensure_egap_schema, + create_node_from_project_json, + recursive_upload, + get_creator_auth_header +) +from api_tests.utils import create_test_file +from website.settings import WATERBUTLER_INTERNAL_URL + + +@pytest.mark.django_db +class TestEGAPImport: + + @pytest.fixture() + def greg(self): + return AuthUserFactory(username='greg@greg.com') + + @pytest.fixture() + def node(self, greg): + return NodeFactory(creator=greg) + + @pytest.fixture() + def node_with_file(self): + node = NodeFactory() + file = create_test_file(node, node.creator) + file.save() + node.save() + return node + + @pytest.fixture() + def egap_assets_path(self): + return os.path.join(HERE, 'test_directory', 'EGAP') + + @pytest.fixture() + def zip_data(self, egap_assets_path): + test_zip_path = os.path.join(egap_assets_path, 'test-egap.zip') + with open(test_zip_path, 'rb') as fp: + return fp.read() + + @pytest.fixture() + def egap_project_name(self): + return '20120220AA' + + def test_get_creator_auth_header(self, greg): + greg, auth_header = 
get_creator_auth_header(greg.username) + + gregs_token = ApiOAuth2PersonalToken.objects.get(owner=greg).token_id + assert auth_header['Authorization'] == 'Bearer {}'.format(gregs_token) + + def test_ensure_egap_schema(self): + ensure_egap_schema() + + assert RegistrationSchema.objects.get(name='EGAP Registration') + + def test_create_node_from_project_json(self, egap_assets_path, egap_project_name, greg): + node = create_node_from_project_json(egap_assets_path, egap_project_name, greg) + + assert node.title == 'Home Security and Infidelity: a case study by Fletcher Cox' + assert node.creator == greg + + assert len(node.contributors.all()) == 5 + contrib = node.contributors.exclude(username='greg@greg.com').first() + assert contrib.fullname == 'Fletcher Cox' + assert node.get_permissions(contrib) == ['read', 'write'] + assert not node.get_visible(greg) + + @responses.activate + def test_recursive_upload(self, node, greg, egap_assets_path, egap_project_name): + responses.add( + responses.Response( + responses.PUT, + '{}/v1/resources/{}/providers/osfstorage/?name=test_folder&kind=folder'.format( + WATERBUTLER_INTERNAL_URL, + node._id, + ), + json={'data': {'attributes': {'path': 'parent'}}}, + status=201, + ) + ) + responses.add( + responses.Response( + responses.PUT, + '{}/v1/resources/{}/providers/osfstorage/parent?name=test-2.txt&kind=file'.format( + WATERBUTLER_INTERNAL_URL, + node._id, + ), + json={'metadata': 'for test-2!'}, + status=201, + ) + ) + responses.add( + responses.Response( + responses.PUT, + '{}/v1/resources/{}/providers/osfstorage/?name=test-1.txt&kind=file'.format( + WATERBUTLER_INTERNAL_URL, + node._id, + ), + json={'metadata': 'for test-1!'}, + status=201, + ) + ) + token = ApiOAuth2PersonalTokenFactory(owner=greg) + token.save() + auth = {'Authorization': 'Bearer {}'.format(token.token_id)} + + egap_project_path = os.path.join(egap_assets_path, egap_project_name, 'data', 'nonanonymous') + + metadata = recursive_upload(auth, node, 
egap_project_path) + + assert metadata[0] == {'metadata': 'for test-2!'} + assert metadata[1] == {'data': {'attributes': {'path': 'parent'}}} + assert metadata[2] == {'metadata': 'for test-1!'} + + @responses.activate + def test_recursive_upload_retry(self, node, greg, egap_assets_path, egap_project_name): + responses.add( + responses.Response( + responses.PUT, + '{}/v1/resources/{}/providers/osfstorage/?name=test_folder&kind=folder'.format( + WATERBUTLER_INTERNAL_URL, + node._id, + ), + json={'data': {'attributes': {'path': 'parent'}}}, + status=201, + ) + ) + responses.add( + responses.Response( + responses.PUT, + '{}/v1/resources/{}/providers/osfstorage/parent?name=test-2.txt&kind=file'.format( + WATERBUTLER_INTERNAL_URL, + node._id, + ), + status=500, + ) + ) + responses.add( + responses.Response( + responses.PUT, + '{}/v1/resources/{}/providers/osfstorage/parent?name=test-2.txt&kind=file'.format( + WATERBUTLER_INTERNAL_URL, + node._id, + ), + json={'metadata': 'for test-2!'}, + status=201, + ) + ) + responses.add( + responses.Response( + responses.PUT, + '{}/v1/resources/{}/providers/osfstorage/?name=test-1.txt&kind=file'.format( + WATERBUTLER_INTERNAL_URL, + node._id, + ), + json={'metadata': 'for test-1!'}, + status=201, + ) + ) + token = ApiOAuth2PersonalTokenFactory(owner=greg) + token.save() + auth = {'Authorization': 'Bearer {}'.format(token.token_id)} + + egap_project_path = os.path.join(egap_assets_path, egap_project_name, 'data', 'nonanonymous') + + metadata = recursive_upload(auth, node, egap_project_path) + + assert metadata[0] == {'metadata': 'for test-2!'} + assert metadata[1] == {'data': {'attributes': {'path': 'parent'}}} + assert metadata[2] == {'metadata': 'for test-1!'} + + @responses.activate + def test_get_egap_assets(self, node_with_file, zip_data): + file_node = node_with_file.files.first() + + responses.add( + responses.Response( + responses.GET, + '{}/v1/resources/{}/providers/osfstorage/{}'.format( + WATERBUTLER_INTERNAL_URL, + 
node_with_file._id, + file_node._id + ), + body=zip_data, + status=200, + ) + ) + + asset_path = get_egap_assets(node_with_file._id, {'fake auth': 'sadasdadsdasdsds'}) + directory_list = os.listdir(asset_path) + # __MACOSX is a hidden file created by the os when zipping + assert set(directory_list) == set(['20110307AA', '__MACOSX', '20110302AA', 'egap_assets.zip', '20120117AA']) + + shutil.rmtree(asset_path) diff --git a/osf_tests/management_commands/test_directory/EGAP/20120220AA/data/nonanonymous/test-1.txt b/osf_tests/management_commands/test_directory/EGAP/20120220AA/data/nonanonymous/test-1.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/osf_tests/management_commands/test_directory/EGAP/20120220AA/data/nonanonymous/test_folder/test-2.txt b/osf_tests/management_commands/test_directory/EGAP/20120220AA/data/nonanonymous/test_folder/test-2.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/osf_tests/management_commands/test_directory/EGAP/20120220AA/project.json b/osf_tests/management_commands/test_directory/EGAP/20120220AA/project.json new file mode 100644 index 00000000000..99056c1ed47 --- /dev/null +++ b/osf_tests/management_commands/test_directory/EGAP/20120220AA/project.json @@ -0,0 +1 @@ +{"post-date": "03/01/2011 - 17:00", "id": "20110302AA", "contributors": [{"name": "Fletcher Cox", "email": "Cox@burds.com"}, {"name": "Brandon Graham", "email": "Graham@burds.com"}, {"name": "Nigel Bradham", "email": "Bradham@birds.com"}, {"name": "Vinny Curry", "email": "curry@burds.com"}], "title": "Home Security and Infidelity: a case study by Fletcher Cox"} \ No newline at end of file diff --git a/osf_tests/management_commands/test_directory/EGAP/20120220AA/registration-schema.json b/osf_tests/management_commands/test_directory/EGAP/20120220AA/registration-schema.json new file mode 100644 index 00000000000..d9a435c7c3e --- /dev/null +++ b/osf_tests/management_commands/test_directory/EGAP/20120220AA/registration-schema.json @@ -0,0 
+1 @@ +{"q35": {"value": "Agree", "comments": [], "extra": []}, "q34": {"value": [""], "comments": [], "extra": []}, "q20": {"value": "", "comments": [], "extra": []}, "q22": {"value": "", "comments": [], "extra": []}, "q24": {"value": "", "comments": [], "extra": []}, "q25": {"value": "", "comments": [], "extra": []}, "q26": {"value": "", "comments": [], "extra": []}, "q28": {"value": "", "comments": [], "extra": []}, "q1": {"value": "The Causes of Compliance in International Relations: Evidence from a Field Experiment on Financial Transparency.", "comments": [], "extra": []}, "q3": {"value": "20110302AA", "comments": [], "extra": []}, "q2": {"value": "Mike Findley, Daniel Nielson, Jason Sharman, Shima Baradaran", "comments": [], "extra": []}, "q5": {"value": "", "comments": [], "extra": []}, "q4": {"value": "03/01/2011 - 17:00", "comments": [], "extra": []}, "q6": {"value": "Yes", "comments": [], "extra": []}, "q8": {"value": "", "comments": [], "extra": []}, "q15": {"value": "\"Observational studies of compliance with international law have produced mixed findings and been plagued by methodological challenges that might be addressed through random assignment to treatment and control groups. But potential field experiments manipulating sovereign governments would likely prove both impractical and unethical. In many IR realms, however, the actors who comply or not with international standards are ordinary firms and citizens, who can be studied ethically and practically using field experiments. The present study examines compliance with international standards that require full identity disclosure when incorporating a business. Without such disclosure, individuals are able to form anonymous shell corporations that can hide corruption, organized crime, and the financing of terrorism. 
Thus, this particular area of IR, while not focused directly on the behavior of national governments, nonetheless proves important globally\"", "comments": [], "extra": []}, "q14": {"value": "No", "comments": [], "extra": []}, "q17": {"value": "", "comments": [], "extra": []}, "q16": {"value": "\" The expectation of the first treatment is that service providers should be somewhat more likely to follow international standards when they receive a prompt about the existence of these standards (H1), than when they do not receive the prompt. The second hypothesis is that providers will be more likely to comply when they are informed that international standards are enforced by domestic agencies, which can apply meaningful penalties (H2), than when they are only prompted about international standards. This is broadly consistent with a realist view that compliance with international standards will only occur when backed up with state power. The third expectation is that service providers will be more likely to allow anonymous incorporation to a client from an explicitly non-corrupt country as opposed to a potential client from a clearly corrupt nation (H3). Finally, our fourth expectation is that providers in the United States will be far less likely to allow any client associated with global terror to incorporate without disclosing identification (H4). We expect the most significant treatment effects from H4, and suspect that the previous three treatments will garner much less substantial or statistically insignificant effects. In the international sample, the alias consultant makes an inquiry that is identical to the one in the U.S. control condition. In this case, however, the researcher purports to be from one of the basket of low-corruption countries discussed in treatment three above (Denmark, New Zealand, Finland, Sweden, Netherlands, Australia, Norway, or Austria), instead of from a Guineastan nation. Prior evidence suggests that U.S. 
incorporators are probably inclined to offer anonymous incorporation to the bulk of their clients; however, we presume that providers worldwide may exercise more discretion. So, we posit that sending the control emails from one of the worlds least-corrupt nations may raise fewer red flags, as it were, and increase our likelihood of seeing effects from the various treatments. The first international treatment is nearly identical to the first treatment in the U.S. sample and invokes the regulatory power of the FATF. In a similar manner, we hypothesize that service providers worldwide should be somewhat more likely to follow international standards when they receive a prompt about the existence of these standards (H5). The second treatment is identical to the control, but it lists the United States as its country of origin instead of the non-corrupt basket of countries. In this case, we are evaluating the difference that a request from the U.S. as the global hegemon has on services likelihood of allowing anonymous incorporation. We expect that most providers will be less likely to accept anonymous incorporation from hegemon-based clients and also predict that this treatment will garner some of our most significant results (H6). 23 The seventh and eighth treatments similarly reference the FATF, but they also offer rationales for raising the question of international law, mapping to different approaches core to the international relations literature. These treatments probe how incorporation services respond to rationalist or normative references to international standards. The treatment and matching hypothesis (H7) is designed to tap a constructivist logic of appropriateness. According to this view, actors engage in ethical reasoning to ensure their behavior conforms with generally shared conceptions of appropriate conduct. 
Thus H7 maintains that service providers will be more likely to comply, relative to the control condition but not necessarily compared to treatments one and two, if they are provided with cues about the appropriate course of action to preserve their self-esteem and reputation for propriety. The next treatment substitutes an explicitly rationalist logic of consequences whereby noncompliance runs the risk of costly punishment. Hypothesis eight thus explains compliance as a costbenefit calculation by service providers seeking to avoid sanctions (H8), and we expect that compliance should increase relative to the control, but are agnostic about whether compliance should be higher or lower than other conditions. \"", "comments": [], "extra": []}, "q11": {"value": "", "comments": [], "extra": []}, "q10": {"value": "No", "comments": [], "extra": []}, "q13": {"value": "No", "comments": [], "extra": []}, "q12": {"value": "", "comments": [], "extra": []}, "q19": {"value": "", "comments": [], "extra": []}, "q18": {"value": "", "comments": [], "extra": []}, "q33": {"value": [""], "comments": [], "extra": []}, "q32": {"value": "", "comments": [], "extra": []}, "q30": {"value": "", "comments": [], "extra": []}, "q36": {"value": "Agree", "comments": [], "extra": []}} \ No newline at end of file diff --git a/osf_tests/management_commands/test_directory/EGAP/test-egap.zip b/osf_tests/management_commands/test_directory/EGAP/test-egap.zip new file mode 100644 index 00000000000..85b13477ac2 Binary files /dev/null and b/osf_tests/management_commands/test_directory/EGAP/test-egap.zip differ diff --git a/package.json b/package.json index 8a756331088..f884cb97f9d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "OSF", - "version": "19.30.0", + "version": "19.31.0", "description": "Facilitating Open Science", "repository": "https://github.com/CenterForOpenScience/osf.io", "author": "Center for Open Science", diff --git a/scripts/EGAP/EGAP_tests.py 
b/scripts/EGAP/EGAP_tests.py
new file mode 100644
index 00000000000..b78f659ea88
--- /dev/null
+++ b/scripts/EGAP/EGAP_tests.py
@@ -0,0 +1,159 @@
+import unittest
+from create_EGAP_json import (schema_to_spreadsheet_mapping,
+    make_project_dict,
+    make_registration_dict,
+    other_mapping,
+)
+
+HEADER_ROW = ['POST DATE',
+    'ID',
+    'STATUS',
+    'TITLE',
+    'B2 AUTHORS',
+    'EMAIL',
+    'B3 ACKNOWLEDGEMENTS',
+    'B4 FACULTY MEMBER?',
+    'B5 PROSPECTIVE OR RETROSPECTIVE?',
+    'B6 EXPERIMENTAL STUDY?',
+    'B7 DATE OF START OF STUDY',
+    'B8 GATE DATE',
+    'B8 FORMERLY GATED UNTIL',
+    'B9 PRESENTED AT EGAP MEETING?',
+    'B10 PRE-ANALYSIS PLAN WITH REGISTRATION?',
+    'C1 BACKGROUND',
+    'C2 HYPOTHESES',
+    'C3 TESTING PLAN',
+    'C4 COUNTRY',
+    'C5 SAMPLE SIZE',
+    'C6 POWER ANALYSIS?',
+    'C7 IRB APPROVAL?',
+    'C8 IRB NUMBER',
+    'C9 DATE OF IRB APPROVAL',
+    'C10 INTERVENTION IMPLEMENTER',
+    'C11 REMUNERATION?',
+    'C12 PUBLICATION AGREEMENT?',
+    'C13 JEL CODES',
+    'METHODOLOGY',
+    'POLICY']
+
+TEST_ROW_WITH_OTHER = ['03/05/2017 - 17:00',
+    '20170305AA',
+    'Status is not saved, so this field doesnt matter',
+    'The members of Nsync',
+    'Justin Timberlake | Joey Fatone | Lance Bass',
+    'doesnt@matter.com',
+    'We acknolowledge Chris Kirkpatrick',
+    'Justin Timberlake is a faculty Member',
+    'This is my other response for prospective',
+    'Yes',
+    '05/01/2017',
+    '05/01/2020',
+    '',
+    'No',
+    'No',
+    'Test background',
+    'test hypothesis',
+    'This is my testing plan',
+    'Switzerland',
+    '3242',
+    'This is a power analysis other response',
+    'This is an other irb response',
+    '343434',
+    '03/06/2017',
+    'This is an other intervention response',
+    'This is an other renumeration response',
+    'This is an other publication agreement response',
+    'Jel Code',
+    'Survey Methodology',
+    'Gender']
+
+TEST_ROW_WITH_OTHER_AUTHORS = [
+    {'name': 'Justin Timberlake', 'email': 'jt@gmail.com'},
+    {'name': 'Joey Fatone'},
+    {'name': 'Lance Bass', 'email': 'lBass@gmail.com'}]
+
+TEST_ROW = ['05/05/2018 - 17:00',
+    '20180505AA',
+    'Status is not saved, so this field doesnt matter',
+    'The members of Backstreet boys',
+    'Nick Carter | Brian Littrell, Ph.D. | AJ McLean | U.S. Agency Bureau, Department of Agency affairs (DOAA)',
+    'doesnt@matter.com',
+    'We acknolowledge Chris Kirkpatrick',
+    'Yes',
+    'Registration prior to any research activities',
+    'Yes',
+    '05/01/2017',
+    '05/01/2020',
+    '',
+    'No',
+    'No',
+    'Test background',
+    'test hypothesis',
+    'This is my testing plan',
+    'Switzerland',
+    '3242',
+    'Yes',
+    'Yes',
+    '343434',
+    '03/06/2017',
+    'Researchers',
+    'Yes',
+    'Yes',
+    'Jel Code',
+    'Survey Methodology',
+    'Gender']
+
+TEST_ROW_AUTHORS = [
+    {'name': 'Nick Carter', 'email': 'nickc@gmail.com'},
+    {'name': 'Brian Littrell, Ph.D.'},
+    {'name': 'AJ McLean', 'email': 'AJML@gmail.com'},
+    {'name': 'U.S. Agency Bureau, Department of Agency affairs (DOAA)', 'email': 'DOAA@UAB.gov'}]
+
+class TestProjectDict(unittest.TestCase):
+
+    def test_row_with_other(self):
+        project_dict = make_project_dict(TEST_ROW_WITH_OTHER, TEST_ROW_WITH_OTHER_AUTHORS, HEADER_ROW)
+        self.assertEqual(project_dict['title'], TEST_ROW_WITH_OTHER[3])
+        self.assertEqual(project_dict['contributors'], TEST_ROW_WITH_OTHER_AUTHORS)
+        self.assertEqual(project_dict['post-date'], TEST_ROW_WITH_OTHER[0])
+        self.assertEqual(project_dict['id'], TEST_ROW_WITH_OTHER[1])
+
+    def test_row(self):
+        project_dict = make_project_dict(TEST_ROW, TEST_ROW_AUTHORS, HEADER_ROW)
+        self.assertEqual(project_dict['title'], TEST_ROW[3])
+        self.assertEqual(project_dict['contributors'], TEST_ROW_AUTHORS)
+        self.assertEqual(project_dict['post-date'], TEST_ROW[0])
+        self.assertEqual(project_dict['id'], TEST_ROW[1])
+
+class TestRegistrationDict(unittest.TestCase):
+
+    def run_registration_test(self, row, header_row):
+        """Check every mapped spreadsheet column of ``row`` against make_registration_dict.
+
+        When the row's value differs from the stored answer for a question in
+        ``other_mapping``, the answer must be the "Other" choice and the row's
+        value must be stored under the paired "Other" question instead.
+        """
+        project_dict = make_registration_dict(row, header_row)
+        for question_dict in schema_to_spreadsheet_mapping:
+            # each mapping entry is a single-item dict; dict views cannot be
+            # indexed on Python 3, so materialise the key list first
+            question_key = list(question_dict.keys())[0]
+            spreadsheet_column = question_dict[question_key]
+            column_index = header_row.index(spreadsheet_column)
+            if isinstance(project_dict[question_key]['value'], list):
+                field_val = project_dict[question_key]['value'][0]
+            else:
+                field_val = project_dict[question_key]['value']
+            if row[column_index] != field_val and question_key in other_mapping:
+                self.assertEqual(project_dict[question_key]['value'], 'Other (describe in text box below)')
+                field_val = project_dict[other_mapping[question_key]]['value']
+                self.assertEqual(row[column_index], field_val)
+            else:
+                self.assertEqual(row[column_index], field_val)
+
+    def test_row_with_other(self):
+        self.run_registration_test(TEST_ROW_WITH_OTHER, HEADER_ROW)
+
+    def test_row(self):
+        self.run_registration_test(TEST_ROW, HEADER_ROW)
diff --git a/scripts/EGAP/__init__.py b/scripts/EGAP/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/scripts/EGAP/create_EGAP_json.py b/scripts/EGAP/create_EGAP_json.py
new file mode 100644
index 00000000000..9edb0679b2d
--- /dev/null
+++ b/scripts/EGAP/create_EGAP_json.py
@@ -0,0 +1,533 @@
+import logging
+import csv
+import datetime
+import json
+import os
+import shutil
+import jsonschema
+import argparse
+
+from jsonschema.exceptions import ValidationError
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-a', '--authorsource', help='Specify the source file for the author csv file')
+parser.add_argument('-r', '--registrysource', help='Specify the source file for the registrty csv file')
+parser.add_argument('-t', '--target', help='Specify the target directory of the registry directories')
+parser.add_argument('-d', '--dry', action='store_true', help='Dry run: Have the script delete the target directory after completion')
+
+schema_to_spreadsheet_mapping = [
+    {'q1': 'TITLE'},
+    {'q2': 'B2 AUTHORS'},
+    {'q3': 'ID'},
+    {'q4': 'POST DATE'},
+    {'q5': 'B3 ACKNOWLEDGEMENTS'},
+    {'q6': 'B4 FACULTY MEMBER?'},
+    {'q8': 'B5 PROSPECTIVE OR
RETROSPECTIVE?'}, + {'q10': 'B6 EXPERIMENTAL STUDY?'}, + {'q11': 'B7 DATE OF START OF STUDY'}, + {'q12': 'B8 GATE DATE'}, + {'q13': 'B9 PRESENTED AT EGAP MEETING?'}, + {'q14': 'B10 PRE-ANALYSIS PLAN WITH REGISTRATION?'}, + {'q15': 'C1 BACKGROUND'}, + {'q16': 'C2 HYPOTHESES'}, + {'q17': 'C3 TESTING PLAN'}, + {'q18': 'C4 COUNTRY'}, + {'q19': 'C5 SAMPLE SIZE'}, + {'q20': 'C6 POWER ANALYSIS?'}, + {'q22': 'C7 IRB APPROVAL?'}, + {'q24': 'C8 IRB NUMBER'}, + {'q25': 'C9 DATE OF IRB APPROVAL'}, + {'q26': 'C10 INTERVENTION IMPLEMENTER'}, + {'q28': 'C11 REMUNERATION?'}, + {'q30': 'C12 PUBLICATION AGREEMENT?'}, + {'q32': 'C13 JEL CODES'}, + {'q33': 'METHODOLOGY'}, + {'q34': 'POLICY'}, +] + +# Any multiple choice questions where "Other" is a possible response, have subsequent "Other" +# question to log that response. If multiple choice question value is invalid, +# attempt to log the value in the corresponding "Other" question response. +other_mapping = { + 'q6': 'q7', + 'q8': 'q9', + 'q20': 'q21', + 'q22': 'q23', + 'q26': 'q27', + 'q28': 'q29', + 'q30': 'q31' +} + + +here = os.path.split(os.path.abspath(__file__))[0] + + +def from_json(fname): + with open(os.path.join(here, fname)) as f: + return json.load(f) + + +def ensure_schema_structure(schema): + schema['pages'] = schema.get('pages', []) + schema['title'] = schema['name'] + schema['version'] = schema.get('version', 1) + return schema + + +def create_file_tree_and_json(author_source, registry_source, target): + # Things this function needs to do: + # For each row in the registry function, create a directory. + # Create two JSON files, one project json with ID, Title, Postdate, and authors listed + # with emails. And another with all the key value pairs for the registry meta. 
+ top_dir = target + logger.info('Creating EGAP directory at {}'.format(top_dir)) + os.mkdir(top_dir) + author_list = create_author_dict(author_source) + with open(registry_source, 'rt', encoding='utf-8-sig') as csv_registry_file: + csv_reader = csv.reader(csv_registry_file, delimiter=',') + header_row = next(csv_reader) + normalized_header_row = [col_header.strip() for col_header in header_row] + logger.info('Debug data') + logger.info('Header row: {}'.format(header_row)) + logger.info('Normalized header row: {}'.format(normalized_header_row)) + + id_index = normalized_header_row.index('ID') + for line in csv_reader: + row = [cell for cell in line] + project_id = row[id_index] + logger.info('Adding project ID: {}'.format(project_id)) + root_directory = os.path.join(top_dir, project_id) + os.mkdir(root_directory) + data_directory = os.path.join(root_directory, 'data') + os.mkdir(data_directory) + os.mkdir(os.path.join(data_directory, 'nonanonymous')) + project_dict = make_project_dict(row, author_list, normalized_header_row) + make_json_file(root_directory, project_dict, 'project') + registration_dict = make_registration_dict(row, normalized_header_row) + make_json_file(root_directory, registration_dict, 'registration') + + +def create_author_dict(source): + # Reads in author CSV and returns a list of dicts with names and emails of EGAP Authors + authors = [] + with open(source, 'rt', encoding='utf-8-sig') as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + header_row = next(csv_reader) + normalized_header_row = [col_header.strip() for col_header in header_row] + logger.info('Debug data') + logger.info('Header row: {}'.format(header_row)) + logger.info('Normalized header row: {}'.format(normalized_header_row)) + name_index = normalized_header_row.index('Name') + email_index = normalized_header_row.index('Email') + for line in csv_reader: + row = [cell for cell in line] + logger.info('Adding user: ' + row[name_index]) + if row[email_index] != '': + 
def make_project_dict(row, author_list, normalized_header_row):
    """Build the project dict for one registry row.

    :param row: list of cell values for the row
    :param author_list: list of author dicts, each with a ``name`` key (and optionally
        ``email``), as returned by ``create_author_dict``
    :param normalized_header_row: stripped column titles; must contain ``TITLE``, ``ID``,
        ``POST DATE`` and ``B2 AUTHORS``
    :return: dict with ``id``, ``title``, ``post-date`` and ``contributors``.  Each
        contributor is the matching dict from ``author_list``, or ``{'name': <author>}``
        when the name is not in ``author_list``
    """
    title_index = normalized_header_row.index('TITLE')
    id_index = normalized_header_row.index('ID')
    postdate_index = normalized_header_row.index('POST DATE')
    contributors_index = normalized_header_row.index('B2 AUTHORS')

    project = {
        'id': row[id_index],
        'title': row[title_index],
        'post-date': row[postdate_index],
        'contributors': [],
    }

    # One name -> author lookup instead of scanning the author list twice per author.
    # setdefault keeps the first entry when a name appears more than once, which is the
    # entry list.index() would have found.
    authors_by_name = {}
    for known_author in author_list:
        authors_by_name.setdefault(known_author['name'], known_author)

    # The authors cell holds names separated by '|'; blank segments are ignored.
    for author in row[contributors_index].split('|'):
        author = author.strip()
        if not author:
            continue
        known_author = authors_by_name.get(author)
        if known_author is None:
            logger.warning('Author {} not in Author spreadsheet for project {}.'.format(author, row[id_index]))
            project['contributors'].append({'name': author})
        else:
            project['contributors'].append(known_author)
    return project
def make_json_file(filepath, data, json_type):
    """Serialize ``data`` as JSON into the directory ``filepath``.

    :param filepath: directory to write into
    :param data: JSON-serializable object
    :param json_type: ``'project'`` writes ``project.json``; ``'registration'`` writes
        ``registration-schema.json``.  For any other value ``filepath`` itself is opened
        for writing.
    """
    if json_type == 'project':
        filepath = os.path.join(filepath, 'project.json')
    elif json_type == 'registration':
        filepath = os.path.join(filepath, 'registration-schema.json')
    with open(filepath, 'w') as outfile:
        json.dump(data, outfile)


def build_question_response(header_row, row, question_key, column_title):
    """Format the question's response to go in the registration_metadata

    :param header_row: Header row in spreadsheet
    :param row: Row in spreadsheet
    :param question_key: string, Official question key as part of schema
    :param column_title: string, Corresponding question_key column title in EGAP spreadsheet
    :return: dict with ``comments``, ``extra`` and ``value`` keys
    """
    index = header_row.index(column_title)
    value = clean_value(row[index])
    # Spreadsheet has these as comma-separated values, but looking for array.
    # Whitespace around each entry is dropped so that "A,B" and "A , B" produce the same
    # list as "A, B".
    if question_key in ['q33', 'q34']:
        value = [entry.strip() for entry in value.split(',')]
    return build_nested_response(value)


def clean_value(value):
    """Clean spreadsheet values of issues that will affect validation

    ``'n/a'`` becomes ``'N/A'``, the placeholder
    ``'Design was registered before field was added'`` becomes an empty string, and any
    other value is returned unchanged.
    """
    if value == 'n/a':
        return 'N/A'
    elif value == 'Design was registered before field was added':
        return ''
    return value


def build_nested_response(value):
    """Wrap ``value`` in a response dict with empty ``comments`` and ``extra`` lists."""
    return {
        'comments': [],
        'extra': [],
        'value': value
    }


def base_metaschema(metaschema):
    """Return the top-level JSON schema object for ``metaschema`` with no properties yet.

    :param metaschema: dict providing ``description`` and ``title``
    :return: dict describing an object that allows no additional properties
    """
    json_schema = {
        'type': 'object',
        'description': metaschema['description'],
        'title': metaschema['title'],
        'additionalProperties': False,
        'properties': {
        }
    }
    return json_schema
# JSON schema for the ``comments`` list attached to every question response.
COMMENTS_SCHEMA = {
    'type': 'array',
    'items': {
        'type': 'object',
        'additionalProperties': False,
        'properties': {
            'seenBy': {
                'type': 'array',
            },
            'canDelete': {'type': 'boolean'},
            'created': {'type': 'string'},
            'lastModified': {'type': 'string'},
            'author': {'type': 'string'},
            'value': {'type': 'string'},
            'isOwner': {'type': 'boolean'},
            'getAuthor': {'type': 'string'},
            'user': {
                'type': 'object',
                'additionalProperties': True,
                'properties': {
                    'fullname': {'type': 'string'},
                    'id': {'type': 'integer'}
                }
            },
            'saved': {'type': 'boolean'},
            'canEdit': {'type': 'boolean'},
            'isDeleted': {'type': 'boolean'}
        }
    }
}


def get_options_jsonschema(options, required):
    """
    Returns multiple choice options for schema questions

    :param options: list of options; a dict option with a truthy ``text`` entry is
        replaced by that text, every other option is used as is
    :param required: when falsy, ``''`` is added to the allowed values if missing
    :return: ``{'enum': [...]}``.  The list is newly built; ``options`` is left untouched
        so that one call cannot leak ``''`` into the metaschema or into a later call.
    """
    enum = [
        option.get('text') if isinstance(option, dict) and option.get('text') else option
        for option in options
    ]

    if not required and '' not in enum:  # Non-required fields need to accept empty strings as a value.
        enum.append('')

    return {'enum': enum}


def get_object_jsonschema(question, required_fields, is_reviewer, is_required):
    """
    Returns jsonschema for nested objects within schema

    :param question: metaschema question; its ``properties`` list, if any, holds the
        nested questions, each with an ``id``
    :param required_fields: when truthy, every nested entry requires ``value`` and nested
        questions flagged ``required`` are collected
    :param is_reviewer: passed through to ``extract_question_values``
    :param is_required: passed through to ``extract_question_values``; together with
        ``required_fields`` it decides whether the collected ids are set as ``required``
    :return: dict describing an object that allows no additional properties
    """
    object_jsonschema = {
        'type': 'object',
        'additionalProperties': False,
        'properties': {

        }
    }
    required = []
    for prop in question.get('properties') or []:
        if prop.get('required', False) and required_fields:
            required.append(prop['id'])
        values = extract_question_values(prop, required_fields, is_reviewer, is_required)
        prop_schema = {
            'type': 'object',
            'additionalProperties': False,
            'properties': values
        }
        if required_fields:
            prop_schema['required'] = ['value']
        object_jsonschema['properties'][prop['id']] = prop_schema
    if required_fields and is_required:
        object_jsonschema['required'] = required

    return object_jsonschema
def extract_question_values(question, required_fields, is_reviewer, is_required):
    """
    Pulls structure for 'value', 'comments', and 'extra' items

    :param question: metaschema question; ``type``, ``format`` and ``options`` are read
    :param required_fields: passed through to ``get_object_jsonschema``
    :param is_reviewer: when truthy ``extra`` is removed, and ``value`` too unless the
        question type is ``object``
    :param is_required: whether the question is required; controls whether an empty
        string is an accepted choice
    :return: dict mapping ``value`` / ``comments`` / ``extra`` to their JSON schemas
    """
    response = {
        'value': {'type': 'string'},
        'comments': COMMENTS_SCHEMA,
        'extra': {'type': 'array'}
    }
    question_type = question.get('type')
    if question_type == 'object':
        response['value'] = get_object_jsonschema(question, required_fields, is_reviewer, is_required)
    elif question_type == 'choose':
        options = question.get('options')
        if options:
            enum_options = get_options_jsonschema(options, is_required)
            question_format = question.get('format')
            if question_format == 'singleselect':
                response['value'] = enum_options
            elif question_format == 'multiselect':
                response['value'] = {'type': 'array', 'items': enum_options}
    elif question_type == 'osf-upload':
        response['extra'] = OSF_UPLOAD_EXTRA_SCHEMA

    if is_reviewer:
        del response['extra']
        if question_type != 'object':
            del response['value']

    return response


def create_jsonschema_from_metaschema(metaschema, required_fields=False, is_reviewer=False):
    """
    Creates jsonschema from registration metaschema for validation.

    Reviewer schemas only allow comment fields.

    :param metaschema: dict with ``description``, ``title`` and ``pages``; every page has
        a ``questions`` list whose entries carry a ``qid``
    :param required_fields: when truthy, required questions are listed under ``required``
        and every question requires a ``value``
    :param is_reviewer: passed through to ``extract_question_values``
    :return: JSON schema dict with one property per question id
    """
    json_schema = base_metaschema(metaschema)
    required = []

    for page in metaschema['pages']:
        for question in page['questions']:
            qid = question['qid']
            is_required = get_required(question)
            if is_required and required_fields:
                required.append(qid)
            question_schema = {
                'type': 'object',
                'additionalProperties': False,
                'properties': extract_question_values(question, required_fields, is_reviewer, is_required)
            }
            if required_fields:
                question_schema['required'] = ['value']
            json_schema['properties'][qid] = question_schema

    if required and required_fields:
        json_schema['required'] = required

    return json_schema


# JSON schema built from egap-registration.json; filled in on first use by
# _get_egap_jsonschema so the file is read once instead of once per answer.
_EGAP_JSONSCHEMA = None


def _get_egap_jsonschema():
    """Return the JSON schema for the EGAP registration, building it on first call."""
    global _EGAP_JSONSCHEMA
    if _EGAP_JSONSCHEMA is None:
        egap_schema = ensure_schema_structure(from_json('egap-registration.json'))
        _EGAP_JSONSCHEMA = create_jsonschema_from_metaschema(egap_schema,
                                                             required_fields=False,
                                                             is_reviewer=False)
    return _EGAP_JSONSCHEMA


def validate_response(qid, value):
    """Validate question response

    Validating each question response individually.  If there is an error, we will
    attempt to add the value to the corresponding "Other" block.  Return that question id instead.

    For example, q6 is a multiple choice question, with "Other" as a choice.  If text is entered
    for q6 that does not match one of the multiple choice answers, assuming that this is "other"
    text, and this response should go to the corresponding q7 question.  q6 will be marked
    as "Other"

    :param qid: string, question id from schema
    :param value: question response
    :return: tuple, ``(qid, None)`` when the response is valid; ``(other qid, qid)`` when
        it is invalid and ``qid`` has a corresponding "Other" question
    :raises ValidationError: when the response is invalid and ``qid`` has no "Other"
        question.  The original validation error is re-raised as is.
    """
    try:
        jsonschema.validate({qid: value}, _get_egap_jsonschema())
    except ValidationError:
        if qid in other_mapping:
            return other_mapping[qid], qid
        raise
    return qid, None


def main(default_args=False):
    """Parse the command line arguments and build the EGAP file tree.

    Missing arguments fall back to ``EGAP_author_emails.csv``,
    ``EGAP_registry_for_OSF.csv`` and a target directory named after today's date.  With
    the dry-run flag set, the tree is created, deleted again, and ``RuntimeError`` is
    raised.

    :param default_args: when truthy, a fixed argument list is parsed instead of the
        command line
    """
    if default_args:
        # NOTE(review): only authorsource / registrysource / target / dry are read below;
        # confirm the parser (defined outside this section) accepts '--source'.
        args = parser.parse_args(['--source', 'default', '--target', 'default'])
    else:
        args = parser.parse_args()

    author_source = args.authorsource or 'EGAP_author_emails.csv'
    registry_source = args.registrysource or 'EGAP_registry_for_OSF.csv'
    target_directory = args.target or 'EGAP_data_{}'.format(datetime.datetime.now().strftime('%m-%d-%Y'))
    dry_run = args.dry

    create_file_tree_and_json(author_source, registry_source, target_directory)

    if dry_run:
        shutil.rmtree(target_directory)
        raise RuntimeError('Dry run, file tree being deleted.')


if __name__ == '__main__':

    main(default_args=False)
"Title", + "type": "string", + "format": "text", + "title": "B1 Title of Study", + "description": "Provide the working title of your study.", + "required": true + }, + { + "qid": "q2", + "nav": "Authors", + "title": "B2 Authors", + "help": "Jimmy Stewart, Ava Gardner, Bob Hope, Greta Garbo", + "format": "textarea", + "required": true + }, + { + "qid": "q3", + "nav": "EGAP Registration ID", + "title": "EGAP Registration ID", + "format": "textarea", + "required": true + }, + { + "qid": "q4", + "nav": "Timestamp", + "title": "Timestamp of original registration", + "format": "textarea", + "required": true + }, + { + "qid": "q5", + "nav": "Acknowledgements", + "title": "B3 Acknowledgements", + "type": "string", + "format": "textarea", + "required": false + }, + { + "qid": "q6", + "title": "B4 Is one of the study authors a university faculty member?", + "nav": "University Faculty Member?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "Yes", + "No", + "Other (describe in text box below)" + ], + "description": "Please choose one" + }, + { + "qid": "q7", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q8", + "title": "B5 Is this Registration Prospective or Retrospective?", + "nav": "Prospective or Retrospective?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "Registration prior to any research activities", + "Registration prior to assignment of treatment", + "Registration prior to realization of outcomes", + "Registration prior to researcher access to outcome data", + "Registration prior to researcher analysis of outcome data", + "Registration after researcher analysis of outcome data", + "Other (describe in text box below)" + ], + "description": "Please choose one" + }, + { + "qid": "q9", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q10", + "title": "B6 Is this an experimental study?", + "nav": "Experimental study?", + "type": "choose", + 
"format": "singleselect", + "options": [ + "N/A", + "Yes", + "No" + ], + "description": "(with random assignment of units to different conditions)" + }, + { + "qid": "q11", + "title": "B7 Date of start of study", + "nav": "Date of start of study", + "type": "string", + "format": "text", + "description": "Understood as first date of treatment assignment or equivalent for observational study", + "help": "E.g., 06/02/2018" + }, + { + "qid": "q12", + "title": "B8 Gate Date", + "nav": "Gate Date?", + "type": "string", + "format": "text", + "description": "Gating is discouraged, but if necessary, EGAP policy limits the gate range to 18 months maximum.", + "help": "E.g., 06/02/2018" + }, + { + "qid": "q13", + "title": "B9 Was this design presented at an EGAP meeting?", + "nav": "Presented at an EGAP meeting?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "No", + "Yes" + ], + "description": "Indicate if the design received feedback from a EGAP design workshop or other special EGAP session prior to registration" + }, + { + "qid": "q14", + "title": "B10 Is there a pre-analysis plan associated with this registration?", + "nav": "Pre-analysis plan associated with this registration?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "No", + "Yes" + ], + "description": "If so, please attach it in the Additional Documentation section on the final screen." + } + ] + }, + { + "id": "page2", + "title": "Registration Data", + "questions": [{ + "qid": "q15", + "nav": "Background and explanation of rationale.", + "title": "C1 Background and explanation of rationale.", + "format": "textarea", + "required": true, + "description": "Brief description of goals of project. If you are also attaching a pre-analysis plan, please refrain from simply copying and pasting a section from your plan here. If possible, please also avoid saying \"see attached pre-analysis plan,\" as it renders the search functionality less useful. 
Rather, please provide a short (1-2 paragraph) summary of the project background." + }, + { + "qid": "q16", + "nav": "Background and explanation of rationale.", + "title": "C2 What are the hypotheses to be tested/quantities of interest to be estimated?", + "format": "textarea", + "required": true, + "description": "Please list the hypotheses including hypotheses on heterogeneous effects. If you are also attaching a pre-analysis plan, please refrain from simply copying and pasting a section from your plan here. If possible, please also avoid saying \"see attached pre-analysis plan,\" as it renders the search functionality less useful. Rather, please provide a short (1-2 paragraph) summary of project hypotheses." + }, + { + "qid": "q17", + "nav": "How will these hypotheses be tested?", + "title": "C3 How will these hypotheses be tested?", + "format": "textarea", + "required": true, + "description": "Brief description of your methodology. If you are also attaching a pre-analysis plan, please refrain from simply copying and pasting a section from your plan here. If possible, please also avoid saying \"see attached pre-analysis plan,\" as it renders the search functionality less useful. Rather, please provide a short (1-2 paragraph) summary of project methodology." + }, + { + "qid": "q18", + "title": "C4 Country", + "nav": "Country", + "type": "string", + "format": "text", + "help": "comma separated names of countries (e.g. 
Canada, United States of America, Mexico)" + }, + { + "qid": "q19", + "title": "C5 Sample Size (# of Units)", + "nav": "Sample Size", + "type": "string", + "format": "text" + }, + { + "qid": "q20", + "title": "C6 Was a power analysis conducted prior to data collection?", + "nav": "Power analysis conducted prior to data collection?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "No", + "Yes", + "Other (describe in text box below)" + ] + }, + { + "qid": "q21", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q22", + "title": "C7 Has this research received Institutional Review Board (IRB) or ethics committee approval?", + "nav": "Review Board (IRB) or ethics committee approval?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "No", + "Yes", + "Other (describe in text box below)" + ] + }, + { + "qid": "q23", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q24", + "title": "C8 IRB Number", + "nav": "IRB Number", + "type": "string", + "format": "text" + }, + { + "qid": "q25", + "title": "C9 Date of IRB Approval", + "nav": "IRB Number", + "type": "string", + "format": "text" + }, + { + "qid": "q26", + "title": "C10 Will the intervention be implemented by the researcher or a third party? 
If a third party, please provide the name.", + "nav": "Review Board (IRB) or ethics committee approval?", + "type": "choose", + "format": "singleselect", + "options": [ + "Researchers", + "Other (describe in text box below)" + ] + }, + { + "qid": "q27", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q28", + "title": "C11 Did any of the research team receive remuneration from the implementing agency for taking part in this research?", + "nav": "Remuneration?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "Yes", + "No", + "Other (describe in text box below)" + ] + }, + { + "qid": "q29", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q30", + "title": "C12 If relevant, is there an advance agreement with the implementation group that all results can be published?", + "nav": "is there an advance agreement with the implementation group that all results can be published?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "Yes", + "No", + "Other (describe in text box below)" + ] + }, + { + "qid": "q31", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q32", + "title": "C13 JEL classification(s)", + "nav": "JEL classification(s)", + "type": "string", + "format": "text", + "description": "Please provide alphanumeric code(s). If multiple classifications, separate by commas (e.g. 
D31, C19, F22)" + } + ] + }, + { + "id": "page3", + "title": "Keywords and Data", + "questions": [{ + "qid": "q33", + "nav": "Keywords", + "type": "choose", + "format": "multiselect", + "title": "Keywords for Methodology", + "description": "Choose one or more categories that describe your study methodology.", + "options": [ + "Experimental Design", + "Field Experiments", + "Lab Experiments", + "Mixed Method", + "Statistics", + "Survey Methodology" + ] + }, { + "qid": "q34", + "nav": "Keywords", + "type": "choose", + "format": "multiselect", + "title": "Keywords for Policy", + "description": "Choose one or more policy categories.", + "options": [ + "Conflict and Violence", + "Corruption", + "Development", + "Elections", + "Ethnic Politics", + "Gender", + "Governance" + ] + }, { + "qid": "q35", + "title": "Certification", + "nav": "Certification", + "type": "choose", + "format": "singleselect", + "description": "By submitting this form and accompanying documents with EGAP, I confirm that I have rights to put this information in the public domain and I understand that this information will remain on the EGAP registry in perpetuity, regardless of whether the research is subsequently implemented or not.", + "options": [ + "Agree" + ], + "required": true + }, { + "qid": "q36", + "title": "Confirmation", + "nav": "Confirmation", + "type": "choose", + "format": "singleselect", + "description": "You should receive a confirmation of your registration within three business days. Your registration is considered complete only when confirmation is received. 
If you do not receive confirmation within three business days please contact paps@egap.org.", + "options": [ + "Agree" + ], + "required": true + }, { + "qid": "q37", + "nav": "Additional Documentation", + "title": "Additional Documentation", + "type": "osf-upload", + "format": "osf-upload-open", + "description": "Please upload your pre-analysis plan, along with any other supporting documents, such as survey instrument, research protocol, any data, etc." + }, { + "qid": "q38", + "nav": "Anonymous Documentation", + "title": "Anonymous Documentation", + "type": "osf-upload", + "format": "osf-upload-open", + "description": "Please upload your anonymized pre-analysis plan, along with any other supporting documents, such as survey instrument, research protocol, any data, etc." + }] + } + ] +} diff --git a/scripts/EGAP/egap_workflow.ipynb b/scripts/EGAP/egap_workflow.ipynb new file mode 100644 index 00000000000..96f5af42703 --- /dev/null +++ b/scripts/EGAP/egap_workflow.ipynb @@ -0,0 +1,1542 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the `scripts/EGAP` folder, with your virtualenv active, `pip install -r requirements.txt`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from create_EGAP_json import create_file_tree_and_json\n", + "from files_to_import_structure import audit_files, main as convert_files\n", + "\n", + "author_source = '/Users/bgeiger/Desktop/EGAP/20190821_author_emails.csv'\n", + "registry_source = '/Users/bgeiger/Desktop/EGAP/20191014_OSF_database.csv'\n", + "metadata_directory = '/Users/bgeiger/Desktop/EGAP/metadata/'\n", + "raw_files_directory = '/Users/bgeiger/Desktop/EGAP/raw_files/'\n", + "directories_to_process = [\n", + " '20110302AA',\n", + " '20110307AA',\n", + " '20120117AA',\n", + " '20120220AA',\n", + " '20120727AA',\n", + " '20120925AA',\n", + " '20120926AA',\n", + " '20121001AA',\n", + " '20121002AA',\n", + " '20121012AA',\n", + " 
'20121026AA',\n", + " '20121031AA',\n", + " '20121101AA',\n", + " '20121104AA',\n", + " '20121106AA',\n", + " '20121107AA',\n", + " '20121123AA',\n", + " '20121212AA',\n", + " '20130122AA',\n", + " '20130403AA',\n", + " '20130406AA',\n", + " '20130410AA',\n", + " '20130426AA',\n", + " '20130518AA',\n", + " '20130607AA',\n", + " '20130616AA',\n", + " '20130704AA',\n", + " '20130729AA',\n", + " '20130731AA',\n", + " '20130803AA',\n", + " '20130813AA',\n", + " '20130819AA',\n", + " '20130913AA',\n", + " '20130921AA',\n", + " '20131012AA',\n", + " '20131024AA',\n", + " '20131101AA',\n", + " '20131105AA',\n", + " '20131110AA',\n", + " '20131117AA',\n", + " '20131118AA',\n", + " '20131130AA',\n", + " '20131203AA',\n", + " '20131206AA',\n", + " '20131210AA',\n", + " '20131211AA',\n", + " '20131216AA',\n", + " '20131220AA',\n", + " '20140110AA',\n", + " '20140112AA',\n", + " '20140113AA',\n", + " '20140120AA',\n", + " '20140124AA',\n", + " '20140126AA',\n", + " '20140131AA',\n", + " '20140203AA',\n", + " '20140203AB',\n", + " '20140222AA',\n", + " '20140222AB',\n", + " '20140228AA',\n", + " '20140303AA',\n", + " '20140308AA',\n", + " '20140316AA',\n", + " '20140320AA',\n", + " '20140417AA',\n", + " '20140502AA',\n", + " '20140503AA',\n", + " '20140506AA',\n", + " '20140509AA',\n", + " '20140509AB',\n", + " '20140512AA',\n", + " '20140521AA',\n", + " '20140523AA',\n", + " '20140529AA',\n", + " '20140610AA',\n", + " '20140611AA',\n", + " '20140611AB',\n", + " '20140613AB',\n", + " '20140627AA',\n", + " '20140627AB',\n", + " '20140627AC',\n", + " '20140701AA',\n", + " '20140701AB',\n", + " '20140707AA',\n", + " '20140708AA',\n", + " '20140715AA',\n", + " '20140722AA',\n", + " '20140723AA',\n", + " '20140723AB',\n", + " '20140806AA',\n", + " '20140812AA',\n", + " '20140820AA',\n", + " '20140912AA',\n", + " '20140915AA',\n", + " '20140918AA',\n", + " '20140922AA',\n", + " '20141002AA',\n", + " '20141006AA',\n", + " '20141023AA',\n", + " '20141025AA',\n", + " '20141027AA',\n", + 
" '20141031AA',\n", + " '20141031AB',\n", + " '20141101AA',\n", + " '20141103AA',\n", + " '20141107AA',\n", + " '20141117AA',\n", + " '20141202AA',\n", + " '20141208AA',\n", + " '20141213AA',\n", + " '20141223AA',\n", + " '20141225AA',\n", + " '20141227AA',\n", + " '20141231AA',\n", + " '20150110AA',\n", + " '20150111AA',\n", + " '20150118AA',\n", + " '20150122AA',\n", + " '20150122AB',\n", + " '20150127AA',\n", + " '20150131AA',\n", + " '20150202AA',\n", + " '20150204AA',\n", + " '20150206AA',\n", + " '20150211AA',\n", + " '20150216AA',\n", + " '20150304AA',\n", + " '20150308AA',\n", + " '20150309AA',\n", + " '20150310AA',\n", + " '20150311AA',\n", + " '20150313AA',\n", + " '20150320AA',\n", + " '20150323AA',\n", + " '20150324AA',\n", + " '20150326AA',\n", + " '20150330AA',\n", + " '20150420AA',\n", + " '20150423AA',\n", + " '20150428AA',\n", + " '20150429AA',\n", + " '20150508AA',\n", + " '20150513AA',\n", + " '20150513AB',\n", + " '20150513AC',\n", + " '20150513AD',\n", + " '20150513AE',\n", + " '20150513AF',\n", + " '20150513AG',\n", + " '20150513AH',\n", + " '20150513AI',\n", + " '20150514AA',\n", + " '20150517AA',\n", + " '20150518AA',\n", + " '20150520AA',\n", + " '20150522AA',\n", + " '20150526AA',\n", + " '20150527AA',\n", + " '20150602AA',\n", + " '20150602AB',\n", + " '20150603AA',\n", + " '20150604AA',\n", + " '20150605AA',\n", + " '20150605AB',\n", + " '20150616AA',\n", + " '20150617AA',\n", + " '20150619AA',\n", + " '20150622AA',\n", + " '20150623AA',\n", + " '20150701AA',\n", + " '20150702AA',\n", + " '20150703AA',\n", + " '20150707AA',\n", + " '20150708AA',\n", + " '20150709AA',\n", + " '20150709AB',\n", + " '20150710AA',\n", + " '20150713AA',\n", + " '20150716AA',\n", + " '20150716AB',\n", + " '20150717AA',\n", + " '20150718AA',\n", + " '20150720AA',\n", + " '20150723AA',\n", + " '20150724AA',\n", + " '20150727AA',\n", + " '20150731AA',\n", + " '20150731AB',\n", + " '20150803AA',\n", + " '20150803AB',\n", + " '20150812AA',\n", + " '20150813AA',\n", 
+ " '20150813AB',\n", + " '20150819AA',\n", + " '20150819AB',\n", + " '20150820AA',\n", + " '20150824AA',\n", + " '20150824AB',\n", + " '20150825AA',\n", + " '20150827AA',\n", + " '20150903AA',\n", + " '20150903AB',\n", + " '20150914AA',\n", + " '20150915AA',\n", + " '20150917AA',\n", + " '20150921AA',\n", + " '20150922AA',\n", + " '20150924AA',\n", + " '20150925AA',\n", + " '20150927AA',\n", + " '20150928AA',\n", + " '20150928AB',\n", + " '20150929AA',\n", + " '20150929AB',\n", + " '20150930AA',\n", + " '20150930AB',\n", + " '20151003AA',\n", + " '20151006AA',\n", + " '20151006AB',\n", + " '20151012AA',\n", + " '20151013AA',\n", + " '20151013AB',\n", + " '20151014AA',\n", + " '20151014AB',\n", + " '20151016AA',\n", + " '20151016AB',\n", + " '20151016AC',\n", + " '20151017AA',\n", + " '20151019AA',\n", + " '20151023AA',\n", + " '20151027AA',\n", + " '20151030AA',\n", + " '20151102AA',\n", + " '20151102AB',\n", + " '20151102AC',\n", + " '20151103AA',\n", + " '20151107AA',\n", + " '20151112AA',\n", + " '20151112AB',\n", + " '20151114AA',\n", + " '20151116AA',\n", + " '20151116AB',\n", + " '20151118AA',\n", + " '20151119AA',\n", + " '20151119AB',\n", + " '20151120AA',\n", + " '20151120AB',\n", + " '20151123AA',\n", + " '20151125AA',\n", + " '20151125AB',\n", + " '20151128AA',\n", + " '20151201AA',\n", + " '20151201AB',\n", + " '20151202AA',\n", + " '20151204AA',\n", + " '20151206AA',\n", + " '20151207AA',\n", + " '20151209AA',\n", + " '20151218AA',\n", + " '20160105AA',\n", + " '20160106AA',\n", + " '20160112AA',\n", + " '20160112AB',\n", + " '20160113AA',\n", + " '20160113AB',\n", + " '20160119AA',\n", + " '20160121AA',\n", + " '20160202AA',\n", + " '20160208AA',\n", + " '20160208AB',\n", + " '20160208AC',\n", + " '20160216AA',\n", + " '20160217AA',\n", + " '20160219AA',\n", + " '20160222AA',\n", + " '20160224AA',\n", + " '20160224AB',\n", + " '20160225AA',\n", + " '20160308AA',\n", + " '20160308AB',\n", + " '20160309AA',\n", + " '20160313AA',\n", + " 
'20160315AA',\n", + " '20160318AA',\n", + " '20160321AA',\n", + " '20160323AA',\n", + " '20160324AA',\n", + " '20160327AA',\n", + " '20160330AA',\n", + " '20160401AA',\n", + " '20160404AA',\n", + " '20160404AB',\n", + " '20160405AA',\n", + " '20160405AB',\n", + " '20160406AA',\n", + " '20160408AA',\n", + " '20160409AA',\n", + " '20160409AB',\n", + " '20160410AA',\n", + " '20160411AA',\n", + " '20160411AB',\n", + " '20160411AC',\n", + " '20160411AD',\n", + " '20160413AA',\n", + " '20160414AA',\n", + " '20160415AA',\n", + " '20160416AA',\n", + " '20160416AB',\n", + " '20160421AA',\n", + " '20160426AA',\n", + " '20160427AA',\n", + " '20160427AB',\n", + " '20160429AA',\n", + " '20160429AB',\n", + " '20160507AA',\n", + " '20160514AA',\n", + " '20160515AA',\n", + " '20160516AA',\n", + " '20160517AA',\n", + " '20160517AB',\n", + " '20160517AC',\n", + " '20160517AD',\n", + " '20160517AE',\n", + " '20160519AA',\n", + " '20160520AA',\n", + " '20160524AA',\n", + " '20160531AA',\n", + " '20160601AA',\n", + " '20160601AB',\n", + " '20160601AC',\n", + " '20160601AD',\n", + " '20160602AA',\n", + " '20160603AA',\n", + " '20160605AA',\n", + " '20160607AA',\n", + " '20160607AB',\n", + " '20160609AA',\n", + " '20160609AB',\n", + " '20160609AC',\n", + " '20160611AA',\n", + " '20160612AA',\n", + " '20160613AA',\n", + " '20160613AB',\n", + " '20160615AA',\n", + " '20160615AB',\n", + " '20160616AA',\n", + " '20160617AA',\n", + " '20160617AB',\n", + " '20160618AA',\n", + " '20160621AA',\n", + " '20160621AB',\n", + " '20160621AC',\n", + " '20160621AD',\n", + " '20160622AA',\n", + " '20160622AB',\n", + " '20160624AA',\n", + " '20160624AB',\n", + " '20160625AA',\n", + " '20160628AA',\n", + " '20160629AA',\n", + " '20160630AA',\n", + " '20160702AA',\n", + " '20160708AA',\n", + " '20160708AB',\n", + " '20160712AA',\n", + " '20160717AA',\n", + " '20160719AA',\n", + " '20160719AB',\n", + " '20160721AA',\n", + " '20160726AA',\n", + " '20160726AB',\n", + " '20160727AA',\n", + " '20160729AA',\n", + 
" '20160730AA',\n", + " '20160801AA',\n", + " '20160801AB',\n", + " '20160803AA',\n", + " '20160803AB',\n", + " '20160809AA',\n", + " '20160809AB',\n", + " '20160812AA',\n", + " '20160812AB',\n", + " '20160813AA',\n", + " '20160813AB',\n", + " '20160813AC',\n", + " '20160819AA',\n", + " '20160819AB',\n", + " '20160820AA',\n", + " '20160822AA',\n", + " '20160824AA',\n", + " '20160825AA',\n", + " '20160829AA',\n", + " '20160831AA',\n", + " '20160905AA',\n", + " '20160907AA',\n", + " '20160911AA',\n", + " '20160912AB',\n", + " '20160913AA',\n", + " '20160913AB',\n", + " '20160916AA',\n", + " '20160916AB',\n", + " '20160918AA',\n", + " '20160919AA',\n", + " '20160921AA',\n", + " '20160921AB',\n", + " '20160926AA',\n", + " '20160926AB',\n", + " '20160926AC',\n", + " '20161001AA',\n", + " '20161004AA',\n", + " '20161006AA',\n", + " '20161011AA',\n", + " '20161016AA',\n", + " '20161017AA',\n", + " '20161019AA',\n", + " '20161019AB',\n", + " '20161020AA',\n", + " '20161026AA',\n", + " '20161028AA',\n", + " '20161030AA',\n", + " '20161101AA',\n", + " '20161101AB',\n", + " '20161103AA',\n", + " '20161103AB',\n", + " '20161104AA',\n", + " '20161104AB',\n", + " '20161109AA',\n", + " '20161109AB',\n", + " '20161109AC',\n", + " '20161110AA',\n", + " '20161110AB',\n", + " '20161110AC',\n", + " '20161110AD',\n", + " '20161110AE',\n", + " '20161112AA',\n", + " '20161118AA',\n", + " '20161121AA',\n", + " '20161122AA',\n", + " '20161122AB',\n", + " '20161123AA',\n", + " '20161125AA',\n", + " '20161127AA',\n", + " '20161128AA',\n", + " '20161129AA',\n", + " '20161204AA',\n", + " '20161204AB',\n", + " '20161205AA',\n", + " '20161206AA',\n", + " '20161207AA',\n", + " '20161208AA',\n", + " '20161212AA',\n", + " '20161216AA',\n", + " '20161216AB',\n", + " '20161227AA',\n", + " '20161227AB',\n", + " '20170103AA',\n", + " '20170109AA',\n", + " '20170112AA',\n", + " '20170115AA',\n", + " '20170117AA',\n", + " '20170118AA',\n", + " '20170123AA',\n", + " '20170124AA',\n", + " '20170130AA',\n", 
+ " '20170131AA',\n", + " '20170203AA',\n", + " '20170203AB',\n", + " '20170203AC',\n", + " '20170203AD',\n", + " '20170205AA',\n", + " '20170207AA',\n", + " '20170208AA',\n", + " '20170209AA',\n", + " '20170210AA',\n", + " '20170212AA',\n", + " '20170214AA',\n", + " '20170214AB',\n", + " '20170215AA',\n", + " '20170216AA',\n", + " '20170216AB',\n", + " '20170220AA',\n", + " '20170222AA',\n", + " '20170223AA',\n", + " '20170223AB',\n", + " '20170223AC',\n", + " '20170224AA',\n", + " '20170225AA',\n", + " '20170227AA',\n", + " '20170227AB',\n", + " '20170227AC',\n", + " '20170301AA',\n", + " '20170302AA',\n", + " '20170307AA',\n", + " '20170308AA',\n", + " '20170308AB',\n", + " '20170309AA',\n", + " '20170310AA',\n", + " '20170312AA',\n", + " '20170317AA',\n", + " '20170320AA',\n", + " '20170320AB',\n", + " '20170320AC',\n", + " '20170321AA',\n", + " '20170321AB',\n", + " '20170322AA',\n", + " '20170322AB',\n", + " '20170323AA',\n", + " '20170324AA',\n", + " '20170325AA',\n", + " '20170328AA',\n", + " '20170329AA',\n", + " '20170330AA',\n", + " '20170403AA',\n", + " '20170412AA',\n", + " '20170412AB',\n", + " '20170413AA',\n", + " '20170413AB',\n", + " '20170413AC',\n", + " '20170414AA',\n", + " '20170416AA',\n", + " '20170417AA',\n", + " '20170417AB',\n", + " '20170420AA',\n", + " '20170421AA',\n", + " '20170422AA',\n", + " '20170423AA',\n", + " '20170423AB',\n", + " '20170426AA',\n", + " '20170427AA',\n", + " '20170428AA',\n", + " '20170501AA',\n", + " '20170501AB',\n", + " '20170501AC',\n", + " '20170501AD',\n", + " '20170503AA',\n", + " '20170503AB',\n", + " '20170503AC',\n", + " '20170504AA',\n", + " '20170504AB',\n", + " '20170505AA',\n", + " '20170505AB',\n", + " '20170505AC',\n", + " '20170506AA',\n", + " '20170507AA',\n", + " '20170508AA',\n", + " '20170508AB',\n", + " '20170509AA',\n", + " '20170509AB',\n", + " '20170510AA',\n", + " '20170511AA',\n", + " '20170515AA',\n", + " '20170515AB',\n", + " '20170516AA',\n", + " '20170517AA',\n", + " 
'20170519AA',\n", + " '20170520AA',\n", + " '20170522AA',\n", + " '20170523AA',\n", + " '20170524AA',\n", + " '20170525AA',\n", + " '20170525AB',\n", + " '20170527AA',\n", + " '20170531AA',\n", + " '20170602AA',\n", + " '20170603AA',\n", + " '20170606AA',\n", + " '20170608AA',\n", + " '20170609AA',\n", + " '20170609AB',\n", + " '20170609AC',\n", + " '20170611AA',\n", + " '20170611AB',\n", + " '20170612AA',\n", + " '20170613AA',\n", + " '20170614AA',\n", + " '20170615AA',\n", + " '20170615AB',\n", + " '20170616AA',\n", + " '20170617AA',\n", + " '20170618AA',\n", + " '20170619AA',\n", + " '20170626AA',\n", + " '20170626AB',\n", + " '20170629AA',\n", + " '20170705AA',\n", + " '20170706AA',\n", + " '20170706AB',\n", + " '20170706AC',\n", + " '20170711AA',\n", + " '20170712AA',\n", + " '20170714AA',\n", + " '20170716AA',\n", + " '20170716AB',\n", + " '20170717AA',\n", + " '20170720AA',\n", + " '20170720AB',\n", + " '20170720AC',\n", + " '20170721AA',\n", + " '20170721AB',\n", + " '20170724AA',\n", + " '20170725AA',\n", + " '20170725AB',\n", + " '20170727AA',\n", + " '20170728AA',\n", + " '20170728AB',\n", + " '20170729AA',\n", + " '20170731AA',\n", + " '20170803AA',\n", + " '20170804AA',\n", + " '20170805AA',\n", + " '20170807AA',\n", + " '20170808AA',\n", + " '20170809AA',\n", + " '20170810AA',\n", + " '20170811AA',\n", + " '20170811AB',\n", + " '20170814AA',\n", + " '20170815AA',\n", + " '20170816AA',\n", + " '20170819AA',\n", + " '20170821AA',\n", + " '20170821AB',\n", + " '20170822AA',\n", + " '20170823AA',\n", + " '20170828AA',\n", + " '20170829AA',\n", + " '20170831AA',\n", + " '20170901AA',\n", + " '20170905AA',\n", + " '20170906AA',\n", + " '20170907AA',\n", + " '20170908AA',\n", + " '20170908AB',\n", + " '20170908AC',\n", + " '20170910AA',\n", + " '20170911AA',\n", + " '20170913AA',\n", + " '20170913AB',\n", + " '20170913AC',\n", + " '20170914AA',\n", + " '20170914AB',\n", + " '20170915AA',\n", + " '20170915AB',\n", + " '20170918AA',\n", + " '20170919AA',\n", + 
" '20170920AA',\n", + " '20170920AB',\n", + " '20170920AC',\n", + " '20170921AA',\n", + " '20170922AA',\n", + " '20170922AB',\n", + " '20170922AC',\n", + " '20170925AA',\n", + " '20170926AA',\n", + " '20170926AB',\n", + " '20170927AA',\n", + " '20170927AB',\n", + " '20170928AA',\n", + " '20170929AA',\n", + " '20170930AA',\n", + " '20171001AA',\n", + " '20171001AB',\n", + " '20171002AA',\n", + " '20171003AA',\n", + " '20171003AB',\n", + " '20171004AA',\n", + " '20171009AA',\n", + " '20171010AA',\n", + " '20171010AB',\n", + " '20171012AA',\n", + " '20171013AA',\n", + " '20171015AA',\n", + " '20171016AA',\n", + " '20171017AA',\n", + " '20171018AA',\n", + " '20171019AA',\n", + " '20171020AA',\n", + " '20171022AA',\n", + " '20171023AA',\n", + " '20171024AA',\n", + " '20171024AB',\n", + " '20171024AC',\n", + " '20171025AA',\n", + " '20171027AA',\n", + " '20171101AA',\n", + " '20171103AA',\n", + " '20171104AA',\n", + " '20171104AB',\n", + " '20171105AA',\n", + " '20171106AA',\n", + " '20171106AB',\n", + " '20171106AC',\n", + " '20171107AA',\n", + " '20171109AA',\n", + " '20171109AB',\n", + " '20171113AA',\n", + " '20171113AB',\n", + " '20171113AC',\n", + " '20171114AA',\n", + " '20171115AA',\n", + " '20171117AA',\n", + " '20171117AB',\n", + " '20171117AC',\n", + " '20171119AA',\n", + " '20171120AA',\n", + " '20171120AB',\n", + " '20171121AA',\n", + " '20171121AB',\n", + " '20171122AA',\n", + " '20171122AB',\n", + " '20171122AC',\n", + " '20171124AA',\n", + " '20171127AA',\n", + " '20171127AB',\n", + " '20171128AA',\n", + " '20171129AA',\n", + " '20171205AA',\n", + " '20171205AB',\n", + " '20171206AA',\n", + " '20171208AA',\n", + " '20171210AA',\n", + " '20171210AB',\n", + " '20171211AA',\n", + " '20171211AB',\n", + " '20171211AC',\n", + " '20171213AA',\n", + " '20171218AA',\n", + " '20171218AB',\n", + " '20171218AC',\n", + " '20171221AA',\n", + " '20171222AA',\n", + " '20171223AA',\n", + " '20171228AA',\n", + " '20171229AA',\n", + " '20171230AA',\n", + " '20180105AA',\n", 
+ " '20180105AB',\n", + " '20180105AC',\n", + " '20180105AD',\n", + " '20180108AA',\n", + " '20180108AB',\n", + " '20180109AA',\n", + " '20180109AB',\n", + " '20180110AA',\n", + " '20180110AB',\n", + " '20180113AA',\n", + " '20180119AA',\n", + " '20180120AA',\n", + " '20180121AA',\n", + " '20180123AA',\n", + " '20180124AA',\n", + " '20180125AA',\n", + " '20180126AA',\n", + " '20180126AB',\n", + " '20180127AA',\n", + " '20180128AA',\n", + " '20180130AA',\n", + " '20180201AA',\n", + " '20180201AB',\n", + " '20180201AC',\n", + " '20180202AA',\n", + " '20180202AB',\n", + " '20180202AC',\n", + " '20180204AA',\n", + " '20180204AB',\n", + " '20180205AA',\n", + " '20180205AB',\n", + " '20180205AC',\n", + " '20180206AA',\n", + " '20180208AA',\n", + " '20180208AB',\n", + " '20180209AA',\n", + " '20180211AA',\n", + " '20180213AA',\n", + " '20180213AB',\n", + " '20180214AA',\n", + " '20180215AA',\n", + " '20180215AB',\n", + " '20180215AC',\n", + " '20180219AA',\n", + " '20180219AB',\n", + " '20180220AA',\n", + " '20180221AA',\n", + " '20180221AB',\n", + " '20180222AA',\n", + " '20180222AB',\n", + " '20180227AA',\n", + " '20180228AA',\n", + " '20180228AB',\n", + " '20180302AA',\n", + " '20180303AA',\n", + " '20180304AA',\n", + " '20180304AB',\n", + " '20180304AC',\n", + " '20180304AD',\n", + " '20180305AA',\n", + " '20180306AA',\n", + " '20180308AA',\n", + " '20180310AA',\n", + " '20180313AA',\n", + " '20180315AA',\n", + " '20180315AB',\n", + " '20180315AC',\n", + " '20180316AA',\n", + " '20180316AB',\n", + " '20180318AA',\n", + " '20180319AA',\n", + " '20180319AB',\n", + " '20180319AC',\n", + " '20180320AA',\n", + " '20180321AA',\n", + " '20180323AA',\n", + " '20180323AB',\n", + " '20180324AA',\n", + " '20180325AA',\n", + " '20180327AA',\n", + " '20180328AA',\n", + " '20180329AA',\n", + " '20180331AA',\n", + " '20180401AA',\n", + " '20180402AA',\n", + " '20180402AB',\n", + " '20180403AA',\n", + " '20180404AA',\n", + " '20180409AA',\n", + " '20180409AB',\n", + " 
'20180409AC',\n", + " '20180413AA',\n", + " '20180413AB',\n", + " '20180416AA',\n", + " '20180417AA',\n", + " '20180418AA',\n", + " '20180418AB',\n", + " '20180423AA',\n", + " '20180424AA',\n", + " '20180425AA',\n", + " '20180425AB',\n", + " '20180425AC',\n", + " '20180425AD',\n", + " '20180426AA',\n", + " '20180426AB',\n", + " '20180426AC',\n", + " '20180427AA',\n", + " '20180430AA',\n", + " '20180430AB',\n", + " '20180430AC',\n", + " '20180502AA',\n", + " '20180503AA',\n", + " '20180503AB',\n", + " '20180504AA',\n", + " '20180507AA',\n", + " '20180508AA',\n", + " '20180509AA',\n", + " '20180509AB',\n", + " '20180514AA',\n", + " '20180515AA',\n", + " '20180515AB',\n", + " '20180516AA',\n", + " '20180516AB',\n", + " '20180518AA',\n", + " '20180521AA',\n", + " '20180521AB',\n", + " '20180523AA',\n", + " '20180528AA',\n", + " '20180529AA',\n", + " '20180529AB',\n", + " '20180529AC',\n", + " '20180530AA',\n", + " '20180601AA',\n", + " '20180602AA',\n", + " '20180605AA',\n", + " '20180605AB',\n", + " '20180605AC',\n", + " '20180605AD',\n", + " '20180607AA',\n", + " '20180608AA',\n", + " '20180608AB',\n", + " '20180610AA',\n", + " '20180610AB',\n", + " '20180611AA',\n", + " '20180611AB',\n", + " '20180612AA',\n", + " '20180613AA',\n", + " '20180614AA',\n", + " '20180614AB',\n", + " '20180615AA',\n", + " '20180616AA',\n", + " '20180619AA',\n", + " '20180619AB',\n", + " '20180620AA',\n", + " '20180625AA',\n", + " '20180626AA',\n", + " '20180628AA',\n", + " '20180628AB',\n", + " '20180701AA',\n", + " '20180703AA',\n", + " '20180703AB',\n", + " '20180703AC',\n", + " '20180707AA',\n", + " '20180709AA',\n", + " '20180709AB',\n", + " '20180710AA',\n", + " '20180710AB',\n", + " '20180710AC',\n", + " '20180711AA',\n", + " '20180711AB',\n", + " '20180712AA',\n", + " '20180713AA',\n", + " '20180716AA',\n", + " '20180719AA',\n", + " '20180720AA',\n", + " '20180722AA',\n", + " '20180723AA',\n", + " '20180723AB',\n", + " '20180724AA',\n", + " '20180724AB',\n", + " '20180725AA',\n", + 
" '20180725AB',\n", + " '20180725AC',\n", + " '20180730AA',\n", + " '20180731AA',\n", + " '20180801AA',\n", + " '20180801AB',\n", + " '20180802AA',\n", + " '20180802AB',\n", + " '20180803AA',\n", + " '20180804AA',\n", + " '20180807AA',\n", + " '20180807AB',\n", + " '20180808AA',\n", + " '20180809AA',\n", + " '20180809AB',\n", + " '20180809AC',\n", + " '20180810AA',\n", + " '20180811AA',\n", + " '20180812AA',\n", + " '20180814AA',\n", + " '20180814AB',\n", + " '20180815AA',\n", + " '20180816AA',\n", + " '20180816AB',\n", + " '20180817AA',\n", + " '20180819AA',\n", + " '20180819AB',\n", + " '20180821AA',\n", + " '20180821AB',\n", + " '20180822AA',\n", + " '20180826AA',\n", + " '20180827AA',\n", + " '20180827AB',\n", + " '20180829AA',\n", + " '20180831AA',\n", + " '20180903AA',\n", + " '20180904AA',\n", + " '20180904AB',\n", + " '20180905AA',\n", + " '20180906AA',\n", + " '20180910AA',\n", + " '20180910AB',\n", + " '20180912AA',\n", + " '20180914AA',\n", + " '20180918AA',\n", + " '20180918AB',\n", + " '20180919AA',\n", + " '20180920AA',\n", + " '20180920AB',\n", + " '20180925AA',\n", + " '20180925AB',\n", + " '20180927AA',\n", + " '20180927AB',\n", + " '20181001AA',\n", + " '20181003AA',\n", + " '20181005AA',\n", + " '20181006AA',\n", + " '20181010AA',\n", + " '20181010AB',\n", + " '20181012AA',\n", + " '20181013AA',\n", + " '20181015AA',\n", + " '20181016AA',\n", + " '20181017AA',\n", + " '20181017AB',\n", + " '20181018AA',\n", + " '20181019AA',\n", + " '20181022AA',\n", + " '20181023AA',\n", + " '20181023AB',\n", + " '20181024AA',\n", + " '20181024AB',\n", + " '20181024AC',\n", + " '20181025AA',\n", + " '20181026AA',\n", + " '20181029AA',\n", + " '20181030AA',\n", + " '20181030AB',\n", + " '20181031AA',\n", + " '20181101AA',\n", + " '20181101AB',\n", + " '20181101AC',\n", + " '20181101AD',\n", + " '20181102AA',\n", + " '20181102AB',\n", + " '20181102AC',\n", + " '20181102AD',\n", + " '20181105AA',\n", + " '20181105AB',\n", + " '20181105AC',\n", + " '20181105AD',\n", 
+ " '20181106AA',\n", + " '20181106AB',\n", + " '20181106AC',\n", + " '20181106AD',\n", + " '20181106AE',\n", + " '20181106AF',\n", + " '20181107AA',\n", + " '20181108AA',\n", + " '20181108AB',\n", + " '20181108AC',\n", + " '20181110AA',\n", + " '20181111AA',\n", + " '20181112AA',\n", + " '20181112AB',\n", + " '20181112AC',\n", + " '20181112AD',\n", + " '20181113AA',\n", + " '20181114AA',\n", + " '20181115AA',\n", + " '20181115AB',\n", + " '20181115AC',\n", + " '20181115AD',\n", + " '20181120AA',\n", + " '20181120AB',\n", + " '20181120AC',\n", + " '20181123AA',\n", + " '20181125AA',\n", + " '20181126AA',\n", + " '20181126AB',\n", + " '20181127AA',\n", + " '20181127AB',\n", + " '20181127AC',\n", + " '20181128AA',\n", + " '20181129AA',\n", + " '20181129AB',\n", + " '20181130AA',\n", + " '20181130AB',\n", + " '20181201AA',\n", + " '20181201AB',\n", + " '20181204AA',\n", + " '20181204AB',\n", + " '20181204AC',\n", + " '20181205AA',\n", + " '20181205AB',\n", + " '20181206AA',\n", + " '20181206AB',\n", + " '20181206AC',\n", + " '20181206AD',\n", + " '20181206AE',\n", + " '20181206AF',\n", + " '20181206AG',\n", + " '20181207AA',\n", + " '20181208AA',\n", + " '20181210AA',\n", + " '20181210AB',\n", + " '20181211AA',\n", + " '20181211AB',\n", + " '20181211AC',\n", + " '20181212AA',\n", + " '20181214AA',\n", + " '20181216AA',\n", + " '20181216AB',\n", + " '20181219AA',\n", + " '20181219AB',\n", + " '20181221AA',\n", + " '20181221AB',\n", + " '20181221AC',\n", + " '20181222AA',\n", + " '20181228AA',\n", + " '20181230AA',\n", + " '20190103AA',\n", + " '20190108AA',\n", + " '20190109AA',\n", + " '20190110AA',\n", + " '20190110AB',\n", + " '20190110AC',\n", + " '20190111AA',\n", + " '20190113AA',\n", + " '20190113AB',\n", + " '20190114AA',\n", + " '20190115AA',\n", + " '20190116AA',\n", + " '20190116AB',\n", + " '20190116AC',\n", + " '20190119AA',\n", + " '20190121AA',\n", + " '20190122AA',\n", + " '20190125AA',\n", + " '20190128AA',\n", + " '20190128AB',\n", + " 
'20190128AC',\n", + " '20190129AA',\n", + " '20190129AB',\n", + " '20190130AA',\n", + " '20190131AA',\n", + " '20190131AB',\n", + " '20190131AC',\n", + " '20190201AA',\n", + " '20190201AB',\n", + " '20190201AC',\n", + " '20190202AA',\n", + " '20190204AA',\n", + " '20190205AA',\n", + " '20190205AB',\n", + " '20190205AC',\n", + " '20190206AA',\n", + " '20190206AB',\n", + " '20190206AC',\n", + " '20190208AA',\n", + " '20190212AA',\n", + " '20190212AB',\n", + " '20190213AA',\n", + " '20190213AB',\n", + " '20190213AC',\n", + " '20190213AD',\n", + " '20190213AE',\n", + " '20190215AA',\n", + " '20190215AB',\n", + " '20190215AC',\n", + " '20190215AD',\n", + " '20190215AE',\n", + " '20190215AF',\n", + " '20190219AA',\n", + " '20190220AA',\n", + " '20190220AB',\n", + " '20190220AC',\n", + " '20190221AA',\n", + " '20190223AA',\n", + " '20190223AB',\n", + " '20190223AC',\n", + " '20190225AA',\n", + " '20190227AA',\n", + " '20190227AB',\n", + " '20190301AA',\n", + " '20190301AB',\n", + " '20190301AC',\n", + " '20190301AD',\n", + " '20190304AA',\n", + " '20190304AB',\n", + " '20190304AC',\n", + " '20190304AD',\n", + " '20190305AA',\n", + " '20190306AA',\n", + " '20190306AB',\n", + " '20190307AA',\n", + " '20190309AA',\n", + " '20190310AA',\n", + " '20190311AA',\n", + " '20190311AB',\n", + " '20190311AC',\n", + " '20190312AA',\n", + " '20190313AA',\n", + " '20190313AB',\n", + " '20190313AC',\n", + " '20190313AD',\n", + " '20190313AE',\n", + " '20190314AA',\n", + " '20190314AB',\n", + " '20190314AC',\n", + " '20190314AD',\n", + " '20190314AE',\n", + " '20190314AF',\n", + " '20190314AG',\n", + " '20190314AH',\n", + " '20190314AI',\n", + " '20190314AJ',\n", + " '20190314AK',\n", + " '20190314AL',\n", + " '20190315AA',\n", + " '20190315AB',\n", + " '20190315AC',\n", + " '20190315AD',\n", + " '20190320AA',\n", + " '20190320AB',\n", + " '20190320AC',\n", + " '20190320AD',\n", + " '20190325AA',\n", + " '20190326AA',\n", + " '20190326AB',\n", + " '20190327AA',\n", + " '20190327AB',\n", + 
" '20190327AC',\n", + " '20190328AA',\n", + " '20190329AA',\n", + " '20190329AB',\n", + " '20190401AA',\n", + " '20190401AB',\n", + " '20190402AA',\n", + " '20190404AA',\n", + " '20190405AA',\n", + " '20190406AA',\n", + " '20190410AA',\n", + " '20190410AB',\n", + " '20190410AC',\n", + " '20190411AA',\n", + " '20190411AB',\n", + " '20190411AC',\n", + " '20190412AA',\n", + " '20190416AA',\n", + " '20190417AA',\n", + " '20190417AB',\n", + " '20190418AA',\n", + " '20190420AA',\n", + " '20190422AA',\n", + " '20190423AA',\n", + " '20190424AA',\n", + " '20190424AB',\n", + " '20190426AA',\n", + " '20190427AA',\n", + " '20190427AB',\n", + " '20190429AA',\n", + " '20190430AA',\n", + " '20190430AB',\n", + " '20190430AC',\n", + " '20190503AA',\n", + " '20190503AB',\n", + " '20190506AA',\n", + " '20190507AA',\n", + " '20190507AB',\n", + " '20190512AA',\n", + " '20190513AA',\n", + " '20190513AB',\n", + " '20190513AC',\n", + " '20190514AA',\n", + " '20190515AA',\n", + " '20190515AB',\n", + " '20190515AC',\n", + " '20190515AD',\n", + " '20190515AE',\n", + " '20190516AA',\n", + " '20190517AA',\n", + " '20190520AA',\n", + " '20190522AA',\n", + " '20190522AB',\n", + " '20190522AC',\n", + " '20190522AD',\n", + " '20190522AE',\n", + " '20190522AF',\n", + " '20190524AA',\n", + " '20190524AB',\n", + " '20190526AA',\n", + " '20190526AB',\n", + " '20190527AA',\n", + " '20190527AB',\n", + " '20190528AA',\n", + " '20190528AB',\n", + " '20190528AC',\n", + " '20190529AA',\n", + " '20190529AB',\n", + " '20190529AC',\n", + " '20190530AA',\n", + " '20190530AB',\n", + " '20190530AC',\n", + " '20190531AA',\n", + " '20190603AA',\n", + " '20190604AA',\n", + " '20190604AB',\n", + " '20190604AC',\n", + " '20190604AD',\n", + " '20190604AE',\n", + " '20190605AA',\n", + " '20190605AB',\n", + " '20190605AC',\n", + " '20190606AA',\n", + " '20190606AB',\n", + " '20190606AC',\n", + " '20190606AD',\n", + " '20190607AA',\n", + " '20190608AA',\n", + " '20190609AA',\n", + " '20190611AA',\n", + " '20190612AA',\n", 
+ " '20190613AA',\n", + " '20190613AB',\n", + " '20190614AA',\n", + " '20190615AA',\n", + " '20190616AA',\n", + " '20190616AB',\n", + " '20190616AC',\n", + " '20190617AA',\n", + " '20190618AA',\n", + " '20190620AA',\n", + " '20190620AB',\n", + " '20190620AC',\n", + " '20190620AD',\n", + " '20190621AA',\n", + " '20190621AB',\n", + " '20190624AA',\n", + " '20190625AA',\n", + " '20190625AB',\n", + " '20190625AC',\n", + " '20190625AD',\n", + " '20190625AE',\n", + " '20190626AA',\n", + " '20190701AA',\n", + " '20190703AA',\n", + " '20190707AA',\n", + " '20190707AB',\n", + " '20190708AA',\n", + " '20190708AB',\n", + " '20190709AA',\n", + " '20190709AB',\n", + " '20190710AA',\n", + " '20190711AA',\n", + " '20190711AB',\n", + " '20190711AC',\n", + " '20190712AA',\n", + " '20190713AA',\n", + " '20190714AA',\n", + " '20190716AA',\n", + " '20190716AB',\n", + " '20190717AA',\n", + " '20190717AB',\n", + " '20190717AC',\n", + " '20190718AA',\n", + " '20190718AB',\n", + " '20190718AC',\n", + " '20190719AA',\n", + " '20190719AB',\n", + " '20190722AA',\n", + " '20190722AB',\n", + " '20190722AC',\n", + " '20190723AA',\n", + " '20190724AA',\n", + " '20190724AB',\n", + " '20190724AC',\n", + " '20190724AD',\n", + " '20190725AA',\n", + " '20190726AA',\n", + " '20190729AA',\n", + " '20190729AB',\n", + " '20190730AA',\n", + " '20190731AA',\n", + " '20190731AB',\n", + " '20190731AC',\n", + " '20190731AD',\n", + " '20190731AE',\n", + " '20190802AA',\n", + " '20190804AA',\n", + " '20190806AA',\n", + " '20190807AA',\n", + " '20190807AB',\n", + " '20190808AA',\n", + " '20190808AB',\n", + " '20190810AA',\n", + " '20190810AB',\n", + " '20190812AA',\n", + " '20190812AB',\n", + " '20190813AA',\n", + " '20190813AB',\n", + " '20190814AA',\n", + " '20190814AB',\n", + " '20190814AC',\n", + " '20190815AA',\n", + " '20190816AA',\n", + " '20190819AA',\n", + " '20190819AB',\n", + " '20190821AA',\n", + " '20190822AA',\n", + " '20190822AB',\n", + " '20190822AC',\n", + " '20190823AA',\n", + " 
'20190825AA',\n", + " '20190826AA',\n", + " '20190826AB',\n", + " '20190827AA',\n", + " '20190828AA',\n", + " '20190828AB',\n", + " '20190828AC',\n", + " '20190829AA',\n", + " '20190901AA',\n", + " '20190901AB',\n", + " '20190901AC',\n", + " '20190903AA',\n", + " '20190903AB',\n", + " '20190904AA',\n", + " '20190905AA',\n", + " '20190905AB',\n", + " '20190906AA',\n", + " '20190906AB',\n", + " '20190906AC',\n", + " '20190909AA',\n", + " '20190909AB',\n", + " '20190909AC',\n", + " '20190911AA',\n", + " '20190912AA',\n", + " '20190912AB',\n", + " '20190912AC',\n", + " '20190912AD',\n", + " '20190912AE',\n", + " '20190912AF',\n", + " '20190913AA',\n", + " '20190914AA',\n", + " '20190914AB',\n", + " '20190915AA',\n", + " '20190916AA',\n", + " '20190916AB',\n", + " '20190916AC',\n", + " '20190917AA',\n", + " '20190917AB',\n", + " '20190917AC',\n", + " '20190917AD',\n", + " '20190919AA',\n", + " '20190920AA',\n", + " '20190920AB',\n", + " '20190922AA',\n", + " '20190922AB',\n", + " '20190923AA',\n", + " '20190924AA',\n", + " '20190924AB',\n", + " '20190924AC',\n", + " '20190924AD',\n", + " '20190925AA',\n", + " '20190925AB',\n", + " '20190925AC',\n", + " '20190926AA',\n", + " '20190926AB',\n", + " '20190926AC',\n", + " '20190927AA',\n", + " '20190930AA',\n", + " '20190930AB',\n", + " '20190930AC',\n", + " '20191001AA',\n", + " '20191003AA',\n", + " '20191003AB',\n", + " '20191003AC',\n", + " '20191004AA',\n", + " '20191007AA',\n", + " '20191008AA',\n", + " '20191008AB',\n", + " '20191009AA',\n", + " '20191009AB',\n", + " '20191010AA',\n", + " '20191011AA',\n", + " '20191012AA',\n", + " '20191013AA',\n", + " '20191014AA',\n", + " '20191017AA',\n", + " '20191017AB',\n", + " '20191017AC',\n", + " '20191020AA',\n", + " '20191021AA',\n", + " '20191021AB',\n", + " '20191022AA',\n", + " '20191023AA',\n", + " '20191023AB',\n", + " '20191023AC',\n", + " '20191023AD',\n", + " '20191024AA',\n", + " '20191024AB',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "create_file_tree_and_json(author_source, registry_source, metadata_directory)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "audit_files(raw_files_directory)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "convert_files(raw_files_directory, metadata_directory, directories_to_process)\n", + "print('Done converting files.')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/scripts/EGAP/files_to_import_structure.py b/scripts/EGAP/files_to_import_structure.py new file mode 100644 index 00000000000..43b9dbfa3f1 --- /dev/null +++ b/scripts/EGAP/files_to_import_structure.py @@ -0,0 +1,159 @@ +import os +import re +import shutil +import argparse +from distutils.dir_util import copy_tree +import logging + +from nose.tools import assert_equal + +logger = logging.getLogger(__name__) + + +# This takes the item id from the path of the project directory for example '20121001AA Findley' -> '20121001AA' +get_item_id = lambda _path: _path.split(os.sep)[-1].split(' ')[0] + + +def get_project_id(root, source_dir): + project_id_base = root.split(source_dir)[-1] + if ' ' in project_id_base: + project_id = project_id_base.split(' ')[0].split('/')[-1] + else: + project_id = project_id_base.split('/')[0] + return project_id + + +# Check if file name starts with EGAP id for example '20121001AA_PAP.pdf' +def 
check_id(project_id, item): + return item.startswith(project_id) + + +# Check if file follows anonymous naming convention +check_anon = lambda item: 'pap_anon' in item.lower() or 'anonymous' in item.lower() + + +def action_files_by_name(root, source_item, item_name): + """ + Pick out anonymous and create new folder to move them into it, remove ones that don't follow id naming convention. + :param root: + :param source_item: + :param item_name: + :return: + """ + project_id = get_project_id(root, source_item) + path = os.path.join(root, item_name) + if not check_id(project_id, item_name): + os.remove(path) + return + + if check_anon(item_name): + destination_parent = os.path.join('/'.join(root.split('/')[:-1]), 'anonymous') + + if not os.path.exists(destination_parent): + os.mkdir(destination_parent) + destination_item = os.path.join(destination_parent, item_name) + shutil.move(path, destination_item) + + +def audit_files(source_directory): + logger.info('Running audit. Source: {}'.format(source_directory)) + + including = open('including.txt', 'w+') + ignoring = open('ignoring.txt', 'w+') + for root, directory, files in os.walk(source_directory): + for item in files: + project_id = get_project_id(root, source_directory) + name = '{}/{}'.format(root.split(source_directory)[-1], item) # get file/folder name from just under source + if not check_id(project_id, item): + ignoring.writelines(name + '\r') + else: + including.writelines(name + '\r') + + ignoring.close() + including.close() + + projects = set(os.listdir(source_directory)) + project_ids = set([get_item_id(folders) for folders in list(projects)]) + + # check for duplicate ids + assert_equal(len(projects), len(project_ids)) + + +def main(files_dir, metadata_dir, id_list=None): + """ + This is a script for our EGAP partnership that converts the EGAP provided dump of files into a directory structure + we can easily import into the OSF. 
Some files in the dump are anonymous and need to be sorted into a special folder + some don't follow an id naming convention and should be ignored and not imported. + + This script copies whole file tree for a project to preserve file hierarchy then picks out anonymous files and moves + them to the anonymous folder and delete those that don't follow the naming convention. + + This script can be safely removed once all EGAP registrations have been imported. + + :param files_dir: the source path we're picking files out of + :param metadata_dir: a pre-made directory structure for importing projects that we are packing files into. + :param id_list: an optional list of project ids to limit what gets processed + :return: + """ + logger.info('Processing files. Source: {} Destination: {}'.format(files_dir, metadata_dir)) + + project_dirs = os.listdir(files_dir) + if id_list: + project_dirs = [project for project in project_dirs if get_item_id(project) in id_list] + + logger.info('Processing directories: {}'.format(project_dirs)) + + # Copy whole tree to preserve file hierarchy then + for item in project_dirs: + item_id = get_item_id(item) + source_item = os.path.join(files_dir, item) + destination_item = os.path.join(metadata_dir, item_id, 'data', 'nonanonymous') + if os.path.isdir(source_item): + copy_tree(source_item, destination_item) + + for root, directory, files in os.walk(metadata_dir): + for item in files: + if item not in ('project.json', 'registration-schema.json'): + action_files_by_name(root, metadata_dir, item) + + # Check All anon files in /anonymous/ directory + for root, directory, files in os.walk(metadata_dir): + for item in files: + if item not in ('project.json', 'registration-schema.json', '.DS_Store'): + if check_anon(item): + assert '/anonymous' in root + else: + assert '/nonanonymous' in root + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '-source', + '--source', + help='The directory for the EGAP data 
files, traditionally called "EGAP_REGISTRY_staging/3 Registrations/"' + ) + parser.add_argument( + '-destination', + '--destination', + help='The directory of the import file structure containing the bags of data.' + ) + parser.add_argument( + '-list', + '--list', + help='An optional list of ids to import into a the new metadata directory.' + ) + parser.add_argument( + '-audit', + '--audit', + help='Boolean to generate two lists of all files that should and should not be included. Needs "source".' + ) + + args = parser.parse_args() + source = args.source + destination = args.destination + audit = args.audit + if audit: + audit_files(source) + else: + main(source, destination) diff --git a/scripts/EGAP/requirements.txt b/scripts/EGAP/requirements.txt new file mode 100644 index 00000000000..7e65d67ac4f --- /dev/null +++ b/scripts/EGAP/requirements.txt @@ -0,0 +1,68 @@ +appnope==0.1.0 +attrs==19.3.0 +backcall==0.1.0 +bcrypt==3.1.7 +bleach==3.1.0 +blinker==1.4 +bson==0.5.8 +cffi==1.13.1 +Click==7.0 +decorator==4.4.0 +defusedxml==0.6.0 +Django==2.2.6 +django-rest-framework==0.1.0 +djangorestframework==3.10.3 +entrypoints==0.3 +Flask==1.1.1 +furl==2.1.0 +future==0.18.1 +importlib-metadata==0.23 +ipykernel==5.1.3 +ipython==7.8.0 +ipython-genutils==0.2.0 +ipywidgets==7.5.1 +itsdangerous==1.1.0 +jedi==0.15.1 +Jinja2==2.10.3 +json5==0.8.5 +jsonschema==3.1.1 +jupyter==1.0.0 +jupyter-client==5.3.4 +jupyter-console==6.0.0 +jupyter-core==4.6.1 +jupyterlab==1.1.4 +jupyterlab-server==1.0.6 +MarkupSafe==1.1.1 +mistune==0.8.4 +more-itertools==7.2.0 +nbconvert==5.6.1 +nbformat==4.4.0 +nose==1.3.7 +notebook==6.0.1 +orderedmultidict==1.0.1 +pandocfilters==1.4.2 +parso==0.5.1 +pexpect==4.7.0 +pickleshare==0.7.5 +prometheus-client==0.7.1 +prompt-toolkit==2.0.10 +ptyprocess==0.6.0 +pycparser==2.19 +Pygments==2.4.2 +pyrsistent==0.15.4 +python-dateutil==2.8.0 +pytz==2019.3 +pyzmq==18.1.0 +qtconsole==4.5.5 +Send2Trash==1.5.0 +six==1.12.0 +sqlparse==0.3.0 +terminado==0.8.2 +testpath==0.4.2 
+tornado==6.0.3 +traitlets==4.3.3 +wcwidth==0.1.7 +webencodings==0.5.1 +Werkzeug==0.16.0 +widgetsnbextension==3.5.1 +zipp==0.6.0 diff --git a/scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_FORM.pdf b/scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_FORM.pdf new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_PAP.pdf b/scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_PAP.pdf new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/tests/test_files/20151016AA/data/test_nonanonymous/justafile.pdf b/scripts/tests/test_files/20151016AA/data/test_nonanonymous/justafile.pdf new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/tests/test_files_to_import_structure.py b/scripts/tests/test_files_to_import_structure.py new file mode 100644 index 00000000000..0778df7f82f --- /dev/null +++ b/scripts/tests/test_files_to_import_structure.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +import mock +from tests.base import OsfTestCase +from scripts.EGAP.files_to_import_structure import action_files_by_name + + +class TestEGAPFilesToImportStructure(OsfTestCase): + + @mock.patch('scripts.EGAP.files_to_import_structure.os.mkdir') + @mock.patch('scripts.EGAP.files_to_import_structure.shutil.move') + def test_doesnt_move_nonanon_files(self, mock_move, mock_mkdir): + action_files_by_name( + 'scripts/tests/test_files/20151016AA/data/datatest_nonanonymous', + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_PAP.pdf', + '20151016AA_PAP.pdf' + ) + assert not mock_mkdir.called + assert not mock_move.called + + @mock.patch('scripts.EGAP.files_to_import_structure.os.mkdir') + @mock.patch('scripts.EGAP.files_to_import_structure.shutil.move') + def test_moves_anon_files(self, mock_move, mock_mkdir): + action_files_by_name( + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous', + 
'scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_anonymous.pdf', + '20151016AA_anonymous.pdf' + ) + + mock_mkdir.assert_called_with('scripts/tests/test_files/20151016AA/data/anonymous') + + mock_move.assert_called_with( + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_anonymous.pdf', + 'scripts/tests/test_files/20151016AA/data/anonymous/20151016AA_anonymous.pdf' + ) + + @mock.patch('scripts.EGAP.files_to_import_structure.os.remove') + def test_removes_no_id(self, mock_remove): + action_files_by_name( + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous', + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous/justafile.pdf', + 'justafile.pdf' + ) + + mock_remove.assert_called_with('scripts/tests/test_files/20151016AA/data/test_nonanonymous/justafile.pdf') + + @mock.patch('scripts.EGAP.files_to_import_structure.os.remove') + def test_removes_form(self, mock_remove): + + action_files_by_name( + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous', + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_FORM.pdf', + '20151016AA_FORM.pdf' + ) + + mock_remove.assert_called_with('scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_FORM.pdf')