From 7c94331c86efc9527fb99837019a711c5a74a9f7 Mon Sep 17 00:00:00 2001
From: Abram Booth
Date: Wed, 23 Oct 2019 10:49:02 -0400
Subject: [PATCH 1/9] fix: avoid encoding error in assets containers (#9169)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

`invoke assets -w` fails with a unicode error. Unsure of the root cause (possibly webpack speaking the non-ascii rune `…` aloud?), but @felliott found a workaround.

## Changes

Set `LANG=en_US.UTF-8` for the `assets` and `admin_assets` containers in our docker-compose.yml. This avoids the problem in the common case, for now.

---
 docker-compose.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docker-compose.yml b/docker-compose.yml
index 62d0afb345d..05a02c16c59 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -383,6 +383,7 @@ services:
     restart: unless-stopped
     environment:
       DJANGO_SETTINGS_MODULE: api.base.settings
+      LANG: en_US.UTF-8
     volumes:
       - ./:/code:cached
       - osf_requirements_vol:/usr/lib/python2.7
@@ -396,6 +397,7 @@ services:
     restart: unless-stopped
     environment:
       DJANGO_SETTINGS_MODULE: admin.base.settings
+      LANG: en_US.UTF-8
     volumes:
       - ./:/code:cached
       - osf_requirements_vol:/usr/lib/python2.7

From affd450aea206f029b3d01c6c2337ce4cfce9f5e Mon Sep 17 00:00:00 2001
From: corbinSanders <50155660+corbinSanders@users.noreply.github.com>
Date: Mon, 28 Oct 2019 12:25:38 -0400
Subject: [PATCH 2/9] [ENG-893] parse user create structure (#9177)

## Purpose

Rather than dumping all the data directly from the EGAP spreadsheets, a better migration plan is to put validated data into JSON, then migrate the data from the JSON into the OSF. This PR creates a management command that reads in data from the EGAP registry spreadsheet and the EGAP author spreadsheet to create an OSFBag directory. (See https://openscience.atlassian.net/browse/ENG-1109 for details.)

This creates the base directory for each registry and generates two JSON files. Project.json contains the ID, post date, title of the project, and a list of contributors' names and emails (if available). Registration-schema.json is also created, which contains the metadata of the registry. This metadata is already validated against the EGAP registration schema and placed in the format accepted by the draft registration model.

## Changes

osf/management/commands/create_EGAP_json.py - The management command to create the directories and json

## QA Notes

Spot check that each registry in the EGAP schema spreadsheet is in the generated directory, and all the data is there and correct.
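For illustration while spot-checking, the two generated files for one registry entry look roughly like this (the keys follow `make_project_dict`/`make_registration_dict` in the diff below; the values here are invented):

```python
# project.json -- one per registry row (illustrative values only)
{
    'id': '20180505AA',
    'title': 'Example study title',
    'post-date': '05/05/2018 - 17:00',
    'contributors': [
        {'name': 'A. Author', 'email': 'a.author@example.com'},
        {'name': 'B. Author'},  # email key omitted when not in the author sheet
    ],
}

# registration-schema.json -- draft-registration-style responses keyed by question id
{
    'q1': {'comments': [], 'extra': [], 'value': 'Example study title'},
    'q35': {'comments': [], 'extra': [], 'value': 'Agree'},
}
```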
## Documentation N/A ## Side Effects N/A - Works outside of OSF ## Ticket https://openscience.atlassian.net/browse/ENG-893 --- scripts/EGAP/EGAP_tests.py | 151 ++++++++++++++++++ scripts/EGAP/__init__.py | 0 scripts/EGAP/create_EGAP_json.py | 258 +++++++++++++++++++++++++++++++ 3 files changed, 409 insertions(+) create mode 100644 scripts/EGAP/EGAP_tests.py create mode 100644 scripts/EGAP/__init__.py create mode 100644 scripts/EGAP/create_EGAP_json.py diff --git a/scripts/EGAP/EGAP_tests.py b/scripts/EGAP/EGAP_tests.py new file mode 100644 index 00000000000..b78f659ea88 --- /dev/null +++ b/scripts/EGAP/EGAP_tests.py @@ -0,0 +1,151 @@ +import unittest +from create_EGAP_json import (schema_to_spreadsheet_mapping, + make_project_dict, + make_registration_dict, + other_mapping, +) + +HEADER_ROW = ['POST DATE', + 'ID', + 'STATUS', + 'TITLE', + 'B2 AUTHORS', + 'EMAIL', + 'B3 ACKNOWLEDGEMENTS', + 'B4 FACULTY MEMBER?', + 'B5 PROSPECTIVE OR RETROSPECTIVE?', + 'B6 EXPERIMENTAL STUDY?', + 'B7 DATE OF START OF STUDY', + 'B8 GATE DATE', + 'B8 FORMERLY GATED UNTIL', + 'B9 PRESENTED AT EGAP MEETING?', + 'B10 PRE-ANALYSIS PLAN WITH REGISTRATION?', + 'C1 BACKGROUND', + 'C2 HYPOTHESES', + 'C3 TESTING PLAN', + 'C4 COUNTRY', + 'C5 SAMPLE SIZE', + 'C6 POWER ANALYSIS?', + 'C7 IRB APPROVAL?', + 'C8 IRB NUMBER', + 'C9 DATE OF IRB APPROVAL', + 'C10 INTERVENTION IMPLEMENTER', + 'C11 REMUNERATION?', + 'C12 PUBLICATION AGREEMENT?', + 'C13 JEL CODES', + 'METHODOLOGY', + 'POLICY'] + +TEST_ROW_WITH_OTHER = ['03/05/2017 - 17:00', + '20170305AA', + 'Status is not saved, so this field doesnt matter', + 'The members of Nsync', + 'Justin Timberlake | Joey Fatone | Lance Bass', + 'doesnt@matter.com', + 'We acknolowledge Chris Kirkpatrick', + 'Justin Timberlake is a faculty Member', + 'This is my other response for prospective', + 'Yes', + '05/01/2017', + '05/01/2020', + '', + 'No', + 'No', + 'Test background', + 'test hypothesis', + 'This is my testing plan', + 'Switzerland', + '3242', + 'This is a power analysis other response', + 'This is an other irb response', + '343434', + '03/06/2017', + 'This is an other intervention response', + 'This is an other renumeration response', + 'This is an other publication agreement response', + 'Jel Code', + 'Survey Methodology', + 'Gender'] + +TEST_ROW_WITH_OTHER_AUTHORS = [ + {'name': 'Justin Timberlake', 'email': 'jt@gmail.com'}, + {'name': 'Joey Fatone'}, + {'name': 'Lance Bass', 'email': 'lBass@gmail.com'}] + +TEST_ROW = ['05/05/2018 - 17:00', + '20180505AA', + 'Status is not saved, so this field doesnt matter', + 'The members of Backstreet boys', + 'Nick Carter | Brian Littrell, Ph.D. | AJ McLean | U.S. Agency Bureau, Department of Agency affairs (DOAA)', + 'doesnt@matter.com', + 'We acknolowledge Chris Kirkpatrick', + 'Yes', + 'Registration prior to any research activities', + 'Yes', + '05/01/2017', + '05/01/2020', + '', + 'No', + 'No', + 'Test background', + 'test hypothesis', + 'This is my testing plan', + 'Switzerland', + '3242', + 'Yes', + 'Yes', + '343434', + '03/06/2017', + 'Researchers', + 'Yes', + 'Yes', + 'Jel Code', + 'Survey Methodology', + 'Gender'] + +TEST_ROW_AUTHORS = [ + {'name': 'Nick Carter', 'email': 'nickc@gmail.com'}, + {'name': 'Brian Littrell, Ph.D.'}, + {'name': 'AJ McLean', 'email': 'AJML@gmail.com'}, + {'name': 'U.S. 
Agency Bureau, Department of Agency affairs (DOAA)', 'email': 'DOAA@UAB.gov'}] + +class TestProjectDict(unittest.TestCase): + + def test_row_with_other(self): + project_dict = make_project_dict(TEST_ROW_WITH_OTHER, TEST_ROW_WITH_OTHER_AUTHORS, HEADER_ROW) + self.assertEqual(project_dict['title'], TEST_ROW_WITH_OTHER[3]) + self.assertEqual(project_dict['contributors'], TEST_ROW_WITH_OTHER_AUTHORS) + self.assertEqual(project_dict['post-date'], TEST_ROW_WITH_OTHER[0]) + self.assertEqual(project_dict['id'], TEST_ROW_WITH_OTHER[1]) + + def test_row(self): + project_dict = make_project_dict(TEST_ROW, TEST_ROW_AUTHORS, HEADER_ROW) + self.assertEqual(project_dict['title'], TEST_ROW[3]) + self.assertEqual(project_dict['contributors'], TEST_ROW_AUTHORS) + self.assertEqual(project_dict['post-date'], TEST_ROW[0]) + self.assertEqual(project_dict['id'], TEST_ROW[1]) + +class TestRegistrationDict(unittest.TestCase): + + def run_registration_test(self, row, header_row): + project_dict = make_registration_dict(row, header_row) + for question_dict in schema_to_spreadsheet_mapping: + question_key = question_dict.keys()[0] + spreadsheet_column = question_dict[question_key] + column_index = header_row.index(spreadsheet_column) + if type(project_dict[question_key]['value']) == list: + field_val = project_dict[question_key]['value'][0] + else: + field_val = project_dict[question_key]['value'] + if row[column_index] != field_val and question_key in other_mapping: + self.assertEqual(project_dict[question_key]['value'], 'Other (describe in text box below)') + field_val = project_dict[other_mapping[question_key]]['value'] + self.assertEqual(row[column_index], field_val) + else: + self.assertEqual(row[column_index], field_val) + + def test_row_with_other(self): + self.run_registration_test(TEST_ROW_WITH_OTHER, HEADER_ROW) + + def test_row(self): + self.run_registration_test(TEST_ROW, HEADER_ROW) diff --git a/scripts/EGAP/__init__.py b/scripts/EGAP/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/scripts/EGAP/create_EGAP_json.py b/scripts/EGAP/create_EGAP_json.py new file mode 100644 index 00000000000..f04197ee79d --- /dev/null +++ b/scripts/EGAP/create_EGAP_json.py @@ -0,0 +1,258 @@ +import logging +import csv +import datetime +import json +import os +import shutil +import re +import jsonschema +import argparse + +from django.core.management.base import BaseCommand +from jsonschema.exceptions import ValidationError + +from website.project.metadata.utils import create_jsonschema_from_metaschema +from website.project.metadata.schemas import ensure_schema_structure, from_json + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +parser = argparse.ArgumentParser() +parser.add_argument('-a', '--authorsource', help='Specify the source file for the author csv file') +parser.add_argument('-r', '--registrysource', help='Specify the source file for the registrty csv file') +parser.add_argument('-t', '--target', help='Specify the target directory of the registry directories') +parser.add_argument('-d', '--dry', action='store_true', help='Dry run: Have the script delete the target directory after completion') + +schema_to_spreadsheet_mapping = [ + {'q1': 'TITLE'}, + {'q2': 'B2 AUTHORS'}, + {'q3': 'ID'}, + {'q4': 'POST DATE'}, + {'q5': 'B3 ACKNOWLEDGEMENTS'}, + {'q6': 'B4 FACULTY MEMBER?'}, + {'q8': 'B5 PROSPECTIVE OR RETROSPECTIVE?'}, + {'q10': 'B6 EXPERIMENTAL STUDY?'}, + {'q11': 'B7 DATE OF START OF STUDY'}, + {'q12': 'B8 GATE DATE'}, + {'q13': 'B9 PRESENTED AT EGAP 
MEETING?'}, + {'q14': 'B10 PRE-ANALYSIS PLAN WITH REGISTRATION?'}, + {'q15': 'C1 BACKGROUND'}, + {'q16': 'C2 HYPOTHESES'}, + {'q17': 'C3 TESTING PLAN'}, + {'q18': 'C4 COUNTRY'}, + {'q19': 'C5 SAMPLE SIZE'}, + {'q20': 'C6 POWER ANALYSIS?'}, + {'q22': 'C7 IRB APPROVAL?'}, + {'q24': 'C8 IRB NUMBER'}, + {'q25': 'C9 DATE OF IRB APPROVAL'}, + {'q26': 'C10 INTERVENTION IMPLEMENTER'}, + {'q28': 'C11 REMUNERATION?'}, + {'q30': 'C12 PUBLICATION AGREEMENT?'}, + {'q32': 'C13 JEL CODES'}, + {'q33': 'METHODOLOGY'}, + {'q34': 'POLICY'}, +] + +# Any multiple choice questions where "Other" is a possible response, have subsequent "Other" +# question to log that response. If multiple choice question value is invalid, +# attempt to log the value in the corresponding "Other" question response. +other_mapping = { + 'q6': 'q7', + 'q8': 'q9', + 'q20': 'q21', + 'q22': 'q23', + 'q26': 'q27', + 'q28': 'q29', + 'q30': 'q31' +} + +def create_file_tree_and_json(author_source, registry_source, target): + # Things this function needs to do: + # For each row in the registry function, create a directory. + # Create two JSON files, one project json with ID, Title, Postdate, and authors listed + # with emails. And another with all the key value pairs for the registry meta. + top_dir = target + logger.info('Creating EGAP directory at {}'.format(top_dir)) + os.mkdir(top_dir) + author_list = create_author_dict(author_source) + with open(registry_source) as csv_registry_file: + csv_reader = csv.reader(csv_registry_file, delimiter=',') + header_row = next(csv_reader) + normalized_header_row = [col_header.decode('ascii', 'ignore') for col_header in header_row] + + id_index = normalized_header_row.index('ID') + for line in csv_reader: + row = [cell.decode('ascii', 'ignore') for cell in line] + project_id = row[id_index] + logger.info('Adding project ID: {}'.format(project_id)) + root_directory = os.path.join(top_dir, project_id) + os.mkdir(root_directory) + data_directory = os.path.join(root_directory, 'data') + os.mkdir(data_directory) + os.mkdir(os.path.join(data_directory, 'nonanonymous')) + project_dict = make_project_dict(row, author_list, normalized_header_row) + make_json_file(root_directory, project_dict, 'project') + registration_dict = make_registration_dict(row, normalized_header_row) + make_json_file(root_directory, registration_dict, 'registration') + +def create_author_dict(source): + # Reads in author CSV and returns a list of dicts with names and emails of EGAP Authors + authors = [] + with open(source) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + header_row = next(csv_reader) + normalized_header_row = [col_header.decode('ascii', 'ignore').strip() for col_header in header_row] + + name_index = normalized_header_row.index('Name') + email_index = normalized_header_row.index('Email') + for line in csv_reader: + row = [cell.decode('ascii', 'ignore') for cell in line] + logger.info('Adding user: ' + row[name_index]) + if row[email_index] != '': + author_dict = {'name': row[name_index].strip(), 'email': row[email_index]} + else: + author_dict = {'name': row[name_index].strip()} + authors.append(author_dict) + return authors + +def make_project_dict(row, author_list, normalized_header_row): + project = {} + title_index = normalized_header_row.index('TITLE') + id_index = normalized_header_row.index('ID') + postdate_index = normalized_header_row.index('POST DATE') + contributors_index = normalized_header_row.index('B2 AUTHORS') + project['id'] = row[id_index] + project['title'] = row[title_index] + 
project['post-date'] = row[postdate_index] + + authors = row[contributors_index] + + authors = authors.split('|') + project['contributors'] = [] + author_name_list = [author['name'] for author in author_list] + for author in authors: + author = author.strip() + if author: + if author not in author_name_list: + logger.warning('Author {} not in Author spreadsheet for project {}.'.format(author,row[id_index])) + project['contributors'].append({'name': author}) + else: + author_list_index = author_name_list.index(author) + project['contributors'].append(author_list[author_list_index]) + return project + +def make_registration_dict(row, normalized_header_row): + registration = {} + + for question in schema_to_spreadsheet_mapping: + qid = question.keys()[0] + column_name = question.values()[0] + value = build_question_response(normalized_header_row, row, qid, column_name) + validated_qid, other_response = validate_response(qid, value) + registration[validated_qid] = value + if other_response: + registration[other_response] = build_nested_response('Other (describe in text box below)') + # q35 and q36 are required questions at the end of the schema, certification and + # confirmation questions. Just marking as agree - + registration['q35'] = build_nested_response('Agree') + registration['q36'] = build_nested_response('Agree') + return registration + +def make_json_file(filepath, data, json_type): + if json_type == 'project': + filepath = filepath + '/project.json' + if json_type == 'registration': + filepath = filepath + '/registration-schema.json' + with open(filepath, 'w') as outfile: + json.dump(data, outfile) + +def build_question_response(header_row, row, question_key, column_title): + """Format the question's response to go in the registration_metadata + :param header_row: Header row in spreadsheet + :param row: Row in spreadsheet + :param question_key: string, Official question key as part of schema + :param column_title: string, Corresponding question_key column title in EGAP spreadsheet + """ + index = header_row.index(column_title) + value = clean_value(row[index]) + # Spreadsheet has these as comma-separated values, but looking for array + if question_key in ['q33', 'q34']: + value = value.split(', ') + return build_nested_response(value) + +def clean_value(value): + """Clean spreadsheet values of issues that will affect validation """ + if value == 'n/a': + return 'N/A' + elif value == 'Design was registered before field was added': + return '' + return value + +def build_nested_response(value): + return { + 'comments': [], + 'extra': [], + 'value': value + } + +def validate_response(qid, value): + """Validate question response + + Validating each question response individually. If there is an error, we will + attempt to add the value to the corresponding "Other" block. Return that question id instead. + + For example, q6 is a multiple choice question, with "Other" as a choice. If text is entered + for q6 that does not match one of the multiple choice answers, assuming that this is "other" + text, and this response should go to the corresponding q7 question. 
q6 will be marked + as "Other" + + :param qid: string, question id from schema + :param value: question response + :param draft: DraftRegistration + :return qid: tuple, (qid corresponding to value, optional "Other" qid) + """ + temporary_check = {} + temporary_check[qid] = value + egap_schema = ensure_schema_structure(from_json('egap-registration.json')) + schema = create_jsonschema_from_metaschema(egap_schema, + required_fields=False, + is_reviewer=False) + + try: + json_schema = jsonschema.validate(temporary_check, schema) + except ValidationError as exc: + if qid in other_mapping: + return other_mapping[qid], qid + else: + raise Exception(exc) + return qid, None + +def main(default_args=False): + if default_args: + args = parser.parse_args(['--source', 'default', '--target', 'default']) + else: + args = parser.parse_args() + + author_source = args.authorsource + registry_source = args.registrysource + target_directory = args.target + dry_run = args.dry + + if not author_source: + author_source = 'EGAP_author_emails.csv' + + if not registry_source: + registry_source = 'EGAP_registry_for_OSF.csv' + + if not target_directory: + target_directory = 'EGAP_data_{}'.format(datetime.datetime.now().strftime('%m-%d-%Y')) + + create_file_tree_and_json(author_source, registry_source, target_directory) + + if dry_run: + shutil.rmtree(target_directory) + raise RuntimeError('Dry run, file tree being deleted.') + +if __name__ == '__main__': + + main(default_args=False) From 499e98a688ac598b01af98b697977b3ee46fbae9 Mon Sep 17 00:00:00 2001 From: "Brian J. Geiger" Date: Tue, 29 Oct 2019 11:19:52 -0400 Subject: [PATCH 3/9] Update script to work with Jupyter Notebook and Python 3 (#9182) ## Purpose Create an EGAP Jupyter Notebook capable of running the scripts for the EGAP migration ## Changes 1. Un-DRY the first script to remove the need to have the entire OSF running just to run this migration. 2. Make the script work with Python 3 (which current versions of Jupyter require) 3. Add notebook 4. Copy EGAP Schema to EGAP scripts directory 5. Ignore iPython checkpoint files in git ## QA Notes - Does this change require a data migration? If so, what data will we migrate? _No, this is mostly an aid for testing_ - What is the level of risk? _Low_ - Any permissions code touched? _No_ - Is this an additive or subtractive change, other? _Additive_ - How can QA verify? (Through UI, API, AdminApp or AdminAdminApp?) _By running the notebook_ - If verifying through API, what's the new version? Please include the endpoints in PR notes or Dev docs. _N/A_ - What features or workflows might this change impact? _EGAP Migration_ - How will this impact performance? _It will not_ ## Side Effects No side effects. The reduction of DRYness was to prevent the possibility of side effects. 
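To make the Python 3 changes concrete, here is a minimal standalone sketch of the two recurring fixes (illustrative, not lifted verbatim from the diff; the filename is the script's default):

```python
import csv

question = {'q1': 'TITLE'}

# Python 2 allowed question.keys()[0]; Python 3 dict views are not
# indexable, so the script wraps them in list() first.
qid = list(question.keys())[0]

# Python 2 decoded each CSV cell by hand; Python 3 opens the file in text
# mode instead. 'utf-8-sig' also strips the byte-order mark that
# Excel-exported CSVs often start with, which would otherwise pollute the
# first header column.
with open('EGAP_registry_for_OSF.csv', 'rt', encoding='utf-8-sig') as f:
    header_row = [col.strip() for col in next(csv.reader(f))]
```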
## Ticket https://openscience.atlassian.net/browse/ENG-1177 --- .gitignore | 2 + scripts/EGAP/create_EGAP_json.py | 303 +++++++++++++++++++++- scripts/EGAP/egap-registration.json | 382 ++++++++++++++++++++++++++++ scripts/EGAP/egap_workflow.ipynb | 47 ++++ 4 files changed, 720 insertions(+), 14 deletions(-) create mode 100644 scripts/EGAP/egap-registration.json create mode 100644 scripts/EGAP/egap_workflow.ipynb diff --git a/.gitignore b/.gitignore index 84f32d26566..0f83ae23f86 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ ehthumbs.db Thumbs.db *.swp *~ +.ipynb_checkpoints # R ####################### @@ -202,3 +203,4 @@ ssl/ # pyenv .python-version + diff --git a/scripts/EGAP/create_EGAP_json.py b/scripts/EGAP/create_EGAP_json.py index f04197ee79d..9edb0679b2d 100644 --- a/scripts/EGAP/create_EGAP_json.py +++ b/scripts/EGAP/create_EGAP_json.py @@ -4,16 +4,11 @@ import json import os import shutil -import re import jsonschema import argparse -from django.core.management.base import BaseCommand from jsonschema.exceptions import ValidationError -from website.project.metadata.utils import create_jsonschema_from_metaschema -from website.project.metadata.schemas import ensure_schema_structure, from_json - logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -66,6 +61,22 @@ 'q30': 'q31' } + +here = os.path.split(os.path.abspath(__file__))[0] + + +def from_json(fname): + with open(os.path.join(here, fname)) as f: + return json.load(f) + + +def ensure_schema_structure(schema): + schema['pages'] = schema.get('pages', []) + schema['title'] = schema['name'] + schema['version'] = schema.get('version', 1) + return schema + + def create_file_tree_and_json(author_source, registry_source, target): # Things this function needs to do: # For each row in the registry function, create a directory. 
@@ -75,14 +86,17 @@ def create_file_tree_and_json(author_source, registry_source, target): logger.info('Creating EGAP directory at {}'.format(top_dir)) os.mkdir(top_dir) author_list = create_author_dict(author_source) - with open(registry_source) as csv_registry_file: + with open(registry_source, 'rt', encoding='utf-8-sig') as csv_registry_file: csv_reader = csv.reader(csv_registry_file, delimiter=',') header_row = next(csv_reader) - normalized_header_row = [col_header.decode('ascii', 'ignore') for col_header in header_row] + normalized_header_row = [col_header.strip() for col_header in header_row] + logger.info('Debug data') + logger.info('Header row: {}'.format(header_row)) + logger.info('Normalized header row: {}'.format(normalized_header_row)) id_index = normalized_header_row.index('ID') for line in csv_reader: - row = [cell.decode('ascii', 'ignore') for cell in line] + row = [cell for cell in line] project_id = row[id_index] logger.info('Adding project ID: {}'.format(project_id)) root_directory = os.path.join(top_dir, project_id) @@ -95,18 +109,21 @@ def create_file_tree_and_json(author_source, registry_source, target): registration_dict = make_registration_dict(row, normalized_header_row) make_json_file(root_directory, registration_dict, 'registration') + def create_author_dict(source): # Reads in author CSV and returns a list of dicts with names and emails of EGAP Authors authors = [] - with open(source) as csv_file: + with open(source, 'rt', encoding='utf-8-sig') as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') header_row = next(csv_reader) - normalized_header_row = [col_header.decode('ascii', 'ignore').strip() for col_header in header_row] - + normalized_header_row = [col_header.strip() for col_header in header_row] + logger.info('Debug data') + logger.info('Header row: {}'.format(header_row)) + logger.info('Normalized header row: {}'.format(normalized_header_row)) name_index = normalized_header_row.index('Name') email_index = normalized_header_row.index('Email') for line in csv_reader: - row = [cell.decode('ascii', 'ignore') for cell in line] + row = [cell for cell in line] logger.info('Adding user: ' + row[name_index]) if row[email_index] != '': author_dict = {'name': row[name_index].strip(), 'email': row[email_index]} @@ -115,6 +132,7 @@ def create_author_dict(source): authors.append(author_dict) return authors + def make_project_dict(row, author_list, normalized_header_row): project = {} title_index = normalized_header_row.index('TITLE') @@ -141,12 +159,13 @@ def make_project_dict(row, author_list, normalized_header_row): project['contributors'].append(author_list[author_list_index]) return project + def make_registration_dict(row, normalized_header_row): registration = {} for question in schema_to_spreadsheet_mapping: - qid = question.keys()[0] - column_name = question.values()[0] + qid = list(question.keys())[0] + column_name = list(question.values())[0] value = build_question_response(normalized_header_row, row, qid, column_name) validated_qid, other_response = validate_response(qid, value) registration[validated_qid] = value @@ -158,6 +177,7 @@ def make_registration_dict(row, normalized_header_row): registration['q36'] = build_nested_response('Agree') return registration + def make_json_file(filepath, data, json_type): if json_type == 'project': filepath = filepath + '/project.json' @@ -166,6 +186,7 @@ def make_json_file(filepath, data, json_type): with open(filepath, 'w') as outfile: json.dump(data, outfile) + def build_question_response(header_row, row, 
question_key, column_title): """Format the question's response to go in the registration_metadata :param header_row: Header row in spreadsheet @@ -180,6 +201,7 @@ def build_question_response(header_row, row, question_key, column_title): value = value.split(', ') return build_nested_response(value) + def clean_value(value): """Clean spreadsheet values of issues that will affect validation """ if value == 'n/a': @@ -195,6 +217,257 @@ def build_nested_response(value): 'value': value } + +def base_metaschema(metaschema): + json_schema = { + 'type': 'object', + 'description': metaschema['description'], + 'title': metaschema['title'], + 'additionalProperties': False, + 'properties': { + } + } + return json_schema + + +def get_required(question): + """ + Returns True if metaschema question is required. + """ + required = question.get('required', False) + if not required: + properties = question.get('properties', False) + if properties and isinstance(properties, list): + for item, property in enumerate(properties): + if isinstance(property, dict) and property.get('required', False): + required = True + break + return required + + +COMMENTS_SCHEMA = { + 'type': 'array', + 'items': { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + 'seenBy': { + 'type': 'array', + }, + 'canDelete': {'type': 'boolean'}, + 'created': {'type': 'string'}, + 'lastModified': {'type': 'string'}, + 'author': {'type': 'string'}, + 'value': {'type': 'string'}, + 'isOwner': {'type': 'boolean'}, + 'getAuthor': {'type': 'string'}, + 'user': { + 'type': 'object', + 'additionalProperties': True, + 'properties': { + 'fullname': {'type': 'string'}, + 'id': {'type': 'integer'} + } + }, + 'saved': {'type': 'boolean'}, + 'canEdit': {'type': 'boolean'}, + 'isDeleted': {'type': 'boolean'} + } + } +} + + +def get_options_jsonschema(options, required): + """ + Returns multiple choice options for schema questions + """ + for item, option in enumerate(options): + if isinstance(option, dict) and option.get('text'): + options[item] = option.get('text') + value = {'enum': options} + + if not required and '' not in value['enum']: # Non-required fields need to accept empty strings as a value. 
+ value['enum'].append('') + + return value + + +def get_object_jsonschema(question, required_fields, is_reviewer, is_required): + """ + Returns jsonschema for nested objects within schema + """ + object_jsonschema = { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + + } + } + required = [] + properties = question.get('properties') + if properties: + for property in properties: + if property.get('required', False) and required_fields: + required.append(property['id']) + values = extract_question_values(property, required_fields, is_reviewer, is_required) + object_jsonschema['properties'][property['id']] = { + 'type': 'object', + 'additionalProperties': False, + 'properties': values + } + if required_fields: + object_jsonschema['properties'][property['id']]['required'] = ['value'] + if required_fields and is_required: + object_jsonschema['required'] = required + + return object_jsonschema + + +OSF_UPLOAD_EXTRA_SCHEMA = { + 'type': 'array', + 'items': { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + 'data': { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + 'kind': {'type': 'string'}, + 'contentType': {'type': 'string'}, + 'name': {'type': 'string'}, + 'extra': { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + 'downloads': {'type': 'integer'}, + 'version': {'type': 'integer'}, + 'latestVersionSeen': {'type': 'string'}, + 'guid': {'type': 'string'}, + 'checkout': {'type': 'string'}, + 'hashes': { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + 'sha256': {'type': 'string'}, + 'md5': {'type': 'string'} + } + } + } + }, + 'materialized': {'type': 'string'}, + 'modified': {'type': 'string'}, + 'nodeId': {'type': 'string'}, + 'etag': {'type': 'string'}, + 'provider': {'type': 'string'}, + 'path': {'type': 'string'}, + 'nodeUrl': {'type': 'string'}, + 'waterbutlerURL': {'type': 'string'}, + 'resource': {'type': 'string'}, + 'nodeApiUrl': {'type': 'string'}, + 'type': {'type': 'string'}, + 'accept': { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + 'acceptedFiles': {'type': 'boolean'}, + 'maxSize': {'type': 'integer'}, + } + }, + 'links': { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + 'download': {'type': 'string'}, + 'move': {'type': 'string'}, + 'upload': {'type': 'string'}, + 'delete': {'type': 'string'} + } + }, + 'permissions': { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + 'edit': {'type': 'boolean'}, + 'view': {'type': 'boolean'} + } + }, + 'created_utc': {'type': 'string'}, + 'id': {'type': 'string'}, + 'modified_utc': {'type': 'string'}, + 'size': {'type': 'integer'}, + 'sizeInt': {'type': 'integer'}, + } + }, + 'fileId': {'type': ['string', 'object']}, + 'descriptionValue': {'type': 'string'}, + 'sha256': {'type': 'string'}, + 'selectedFileName': {'type': 'string'}, + 'nodeId': {'type': 'string'}, + 'viewUrl': {'type': 'string'} + } + } +} + + +def extract_question_values(question, required_fields, is_reviewer, is_required): + """ + Pulls structure for 'value', 'comments', and 'extra' items + """ + response = { + 'value': {'type': 'string'}, + 'comments': COMMENTS_SCHEMA, + 'extra': {'type': 'array'} + } + if question.get('type') == 'object': + response['value'] = get_object_jsonschema(question, required_fields, is_reviewer, is_required) + elif question.get('type') == 'choose': + options = question.get('options') + if options: + enum_options = get_options_jsonschema(options, is_required) 
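+            # Build the value schema from the enum: 'singleselect' uses it directly; 'multiselect' wraps it in an array whose items come from the same enum.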
+ if question.get('format') == 'singleselect': + response['value'] = enum_options + elif question.get('format') == 'multiselect': + response['value'] = {'type': 'array', 'items': enum_options} + elif question.get('type') == 'osf-upload': + response['extra'] = OSF_UPLOAD_EXTRA_SCHEMA + + if is_reviewer: + del response['extra'] + if not question.get('type') == 'object': + del response['value'] + + return response + + +def create_jsonschema_from_metaschema(metaschema, required_fields=False, is_reviewer=False): + """ + Creates jsonschema from registration metaschema for validation. + + Reviewer schemas only allow comment fields. + """ + json_schema = base_metaschema(metaschema) + required = [] + + for page in metaschema['pages']: + for question in page['questions']: + is_required = get_required(question) + if is_required and required_fields: + required.append(question['qid']) + json_schema['properties'][question['qid']] = { + 'type': 'object', + 'additionalProperties': False, + 'properties': extract_question_values(question, required_fields, is_reviewer, is_required) + } + if required_fields: + json_schema['properties'][question['qid']]['required'] = ['value'] + + if required and required_fields: + json_schema['required'] = required + + return json_schema + + def validate_response(qid, value): """Validate question response @@ -227,6 +500,7 @@ def validate_response(qid, value): raise Exception(exc) return qid, None + def main(default_args=False): if default_args: args = parser.parse_args(['--source', 'default', '--target', 'default']) @@ -253,6 +527,7 @@ def main(default_args=False): shutil.rmtree(target_directory) raise RuntimeError('Dry run, file tree being deleted.') + if __name__ == '__main__': main(default_args=False) diff --git a/scripts/EGAP/egap-registration.json b/scripts/EGAP/egap-registration.json new file mode 100644 index 00000000000..28d3721e8bf --- /dev/null +++ b/scripts/EGAP/egap-registration.json @@ -0,0 +1,382 @@ +{ + "name": "EGAP Registration", + "version": 2, + "description": "The EGAP registry focuses on designs for experiments and observational studies in governance and politics.", + "pages": [{ + "id": "page1", + "title": "General Information About the Project", + "questions": [{ + "qid": "q1", + "nav": "Title", + "type": "string", + "format": "text", + "title": "B1 Title of Study", + "description": "Provide the working title of your study.", + "required": true + }, + { + "qid": "q2", + "nav": "Authors", + "title": "B2 Authors", + "help": "Jimmy Stewart, Ava Gardner, Bob Hope, Greta Garbo", + "format": "textarea", + "required": true + }, + { + "qid": "q3", + "nav": "EGAP Registration ID", + "title": "EGAP Registration ID", + "format": "textarea", + "required": true + }, + { + "qid": "q4", + "nav": "Timestamp", + "title": "Timestamp of original registration", + "format": "textarea", + "required": true + }, + { + "qid": "q5", + "nav": "Acknowledgements", + "title": "B3 Acknowledgements", + "type": "string", + "format": "textarea", + "required": false + }, + { + "qid": "q6", + "title": "B4 Is one of the study authors a university faculty member?", + "nav": "University Faculty Member?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "Yes", + "No", + "Other (describe in text box below)" + ], + "description": "Please choose one" + }, + { + "qid": "q7", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q8", + "title": "B5 Is this Registration Prospective or Retrospective?", + "nav": "Prospective or Retrospective?", + 
"type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "Registration prior to any research activities", + "Registration prior to assignment of treatment", + "Registration prior to realization of outcomes", + "Registration prior to researcher access to outcome data", + "Registration prior to researcher analysis of outcome data", + "Registration after researcher analysis of outcome data", + "Other (describe in text box below)" + ], + "description": "Please choose one" + }, + { + "qid": "q9", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q10", + "title": "B6 Is this an experimental study?", + "nav": "Experimental study?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "Yes", + "No" + ], + "description": "(with random assignment of units to different conditions)" + }, + { + "qid": "q11", + "title": "B7 Date of start of study", + "nav": "Date of start of study", + "type": "string", + "format": "text", + "description": "Understood as first date of treatment assignment or equivalent for observational study", + "help": "E.g., 06/02/2018" + }, + { + "qid": "q12", + "title": "B8 Gate Date", + "nav": "Gate Date?", + "type": "string", + "format": "text", + "description": "Gating is discouraged, but if necessary, EGAP policy limits the gate range to 18 months maximum.", + "help": "E.g., 06/02/2018" + }, + { + "qid": "q13", + "title": "B9 Was this design presented at an EGAP meeting?", + "nav": "Presented at an EGAP meeting?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "No", + "Yes" + ], + "description": "Indicate if the design received feedback from a EGAP design workshop or other special EGAP session prior to registration" + }, + { + "qid": "q14", + "title": "B10 Is there a pre-analysis plan associated with this registration?", + "nav": "Pre-analysis plan associated with this registration?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "No", + "Yes" + ], + "description": "If so, please attach it in the Additional Documentation section on the final screen." + } + ] + }, + { + "id": "page2", + "title": "Registration Data", + "questions": [{ + "qid": "q15", + "nav": "Background and explanation of rationale.", + "title": "C1 Background and explanation of rationale.", + "format": "textarea", + "required": true, + "description": "Brief description of goals of project. If you are also attaching a pre-analysis plan, please refrain from simply copying and pasting a section from your plan here. If possible, please also avoid saying \"see attached pre-analysis plan,\" as it renders the search functionality less useful. Rather, please provide a short (1-2 paragraph) summary of the project background." + }, + { + "qid": "q16", + "nav": "Background and explanation of rationale.", + "title": "C2 What are the hypotheses to be tested/quantities of interest to be estimated?", + "format": "textarea", + "required": true, + "description": "Please list the hypotheses including hypotheses on heterogeneous effects. If you are also attaching a pre-analysis plan, please refrain from simply copying and pasting a section from your plan here. If possible, please also avoid saying \"see attached pre-analysis plan,\" as it renders the search functionality less useful. Rather, please provide a short (1-2 paragraph) summary of project hypotheses." 
+ }, + { + "qid": "q17", + "nav": "How will these hypotheses be tested?", + "title": "C3 How will these hypotheses be tested?", + "format": "textarea", + "required": true, + "description": "Brief description of your methodology. If you are also attaching a pre-analysis plan, please refrain from simply copying and pasting a section from your plan here. If possible, please also avoid saying \"see attached pre-analysis plan,\" as it renders the search functionality less useful. Rather, please provide a short (1-2 paragraph) summary of project methodology." + }, + { + "qid": "q18", + "title": "C4 Country", + "nav": "Country", + "type": "string", + "format": "text", + "help": "comma separated names of countries (e.g. Canada, United States of America, Mexico)" + }, + { + "qid": "q19", + "title": "C5 Sample Size (# of Units)", + "nav": "Sample Size", + "type": "string", + "format": "text" + }, + { + "qid": "q20", + "title": "C6 Was a power analysis conducted prior to data collection?", + "nav": "Power analysis conducted prior to data collection?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "No", + "Yes", + "Other (describe in text box below)" + ] + }, + { + "qid": "q21", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q22", + "title": "C7 Has this research received Institutional Review Board (IRB) or ethics committee approval?", + "nav": "Review Board (IRB) or ethics committee approval?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "No", + "Yes", + "Other (describe in text box below)" + ] + }, + { + "qid": "q23", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q24", + "title": "C8 IRB Number", + "nav": "IRB Number", + "type": "string", + "format": "text" + }, + { + "qid": "q25", + "title": "C9 Date of IRB Approval", + "nav": "IRB Number", + "type": "string", + "format": "text" + }, + { + "qid": "q26", + "title": "C10 Will the intervention be implemented by the researcher or a third party? If a third party, please provide the name.", + "nav": "Review Board (IRB) or ethics committee approval?", + "type": "choose", + "format": "singleselect", + "options": [ + "Researchers", + "Other (describe in text box below)" + ] + }, + { + "qid": "q27", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q28", + "title": "C11 Did any of the research team receive remuneration from the implementing agency for taking part in this research?", + "nav": "Remuneration?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "Yes", + "No", + "Other (describe in text box below)" + ] + }, + { + "qid": "q29", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q30", + "title": "C12 If relevant, is there an advance agreement with the implementation group that all results can be published?", + "nav": "is there an advance agreement with the implementation group that all results can be published?", + "type": "choose", + "format": "singleselect", + "options": [ + "N/A", + "Yes", + "No", + "Other (describe in text box below)" + ] + }, + { + "qid": "q31", + "title": "Other", + "format": "textarea", + "required": false + }, + { + "qid": "q32", + "title": "C13 JEL classification(s)", + "nav": "JEL classification(s)", + "type": "string", + "format": "text", + "description": "Please provide alphanumeric code(s). If multiple classifications, separate by commas (e.g. 
D31, C19, F22)" + } + ] + }, + { + "id": "page3", + "title": "Keywords and Data", + "questions": [{ + "qid": "q33", + "nav": "Keywords", + "type": "choose", + "format": "multiselect", + "title": "Keywords for Methodology", + "description": "Choose one or more categories that describe your study methodology.", + "options": [ + "Experimental Design", + "Field Experiments", + "Lab Experiments", + "Mixed Method", + "Statistics", + "Survey Methodology" + ] + }, { + "qid": "q34", + "nav": "Keywords", + "type": "choose", + "format": "multiselect", + "title": "Keywords for Policy", + "description": "Choose one or more policy categories.", + "options": [ + "Conflict and Violence", + "Corruption", + "Development", + "Elections", + "Ethnic Politics", + "Gender", + "Governance" + ] + }, { + "qid": "q35", + "title": "Certification", + "nav": "Certification", + "type": "choose", + "format": "singleselect", + "description": "By submitting this form and accompanying documents with EGAP, I confirm that I have rights to put this information in the public domain and I understand that this information will remain on the EGAP registry in perpetuity, regardless of whether the research is subsequently implemented or not.", + "options": [ + "Agree" + ], + "required": true + }, { + "qid": "q36", + "title": "Confirmation", + "nav": "Confirmation", + "type": "choose", + "format": "singleselect", + "description": "You should receive a confirmation of your registration within three business days. Your registration is considered complete only when confirmation is received. If you do not receive confirmation within three business days please contact paps@egap.org.", + "options": [ + "Agree" + ], + "required": true + }, { + "qid": "q37", + "nav": "Additional Documentation", + "title": "Additional Documentation", + "type": "osf-upload", + "format": "osf-upload-open", + "description": "Please upload your pre-analysis plan, along with any other supporting documents, such as survey instrument, research protocol, any data, etc." + }, { + "qid": "q38", + "nav": "Anonymous Documentation", + "title": "Anonymous Documentation", + "type": "osf-upload", + "format": "osf-upload-open", + "description": "Please upload your anonymized pre-analysis plan, along with any other supporting documents, such as survey instrument, research protocol, any data, etc." 
+    }]
+  }
+  ]
+}
diff --git a/scripts/EGAP/egap_workflow.ipynb b/scripts/EGAP/egap_workflow.ipynb
new file mode 100644
index 00000000000..750bb02b152
--- /dev/null
+++ b/scripts/EGAP/egap_workflow.ipynb
@@ -0,0 +1,47 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from create_EGAP_json import create_file_tree_and_json\n",
+    "\n",
+    "author_source = '/Users/bgeiger/Desktop/EGAP/20190821_author_emails.csv'\n",
+    "registry_source = '/Users/bgeiger/Desktop/EGAP/20191014_OSF_database.csv'\n",
+    "target_directory = '/Users/bgeiger/Desktop/EGAP/output/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "create_file_tree_and_json(author_source, registry_source, target_directory)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From 8a454a3b4766da03aed12b6a9f0f0ff7b7619b02 Mon Sep 17 00:00:00 2001
From: John Tordoff
Date: Wed, 30 Oct 2019 13:26:11 -0400
Subject: [PATCH 4/9] [ENG-1108] Add script for importing files into EGAP file structure (#9178)

## Purpose

Take the 52 pick-up that is the EGAP data dump and make it into the freshly minted deck of our file import structure.

## Changes

1. Given a directory of files (a.k.a. files_dir), a directory of metadata generated from the egap spreadsheets (a.k.a. metadata_dir), and an optional list of project ids, copy the appropriate files from the files directory into the appropriate locations in the directory of metadata generated from the egap spreadsheets (see the sketch after the QA notes below).
   * The location to copy to (a.k.a. destination_dir) is "//data/"
   * Do not copy the files if they do not start with the project id
   * If the files contain the word "anonymous" (case insensitive), copy to a "/anonymous" directory
   * If the files contain the phrase "PAP_anon" (case insensitive), copy to the "/anonymous" directory
   * If the files are being copied but aren't to be copied to the "/anonymous" directory, copy them to a "/nonanonymous" directory
   * Ensure that the actions listed above can also be activated by calling a top-level function with the appropriate args or kwargs
   * If a list of project_ids is not specified when the script is run, you can default to the list of folder names contained inside the metadata_dir.
   * A list of ids will always be provided when called by an external script such as a Jupyter Notebook.
2. Have an audit function that will list which files above will not be copied. Save that information to a file called "ignoring.txt". Use the same code in the audit that you use in the script to determine which files will not be copied.
3. Have tests to verify that regular, anonymous, and ignored files will be handled appropriately.

## QA Notes

Determine criteria for the types of files you'd like tested, so we can find directories that match those criteria. Test using the Jupyter Notebook.
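For reference, the sorting rules in the Changes list boil down to a couple of small predicates; this is a condensed sketch of them (the real implementations, `get_item_id`, `check_anon`, and `check_id`, are in the diff below):

```python
import os

def get_item_id(path):
    # '20121001AA Findley' -> '20121001AA'
    return path.split(os.sep)[-1].split(' ')[0]

def is_anonymous(filename):
    # 'PAP_anon' or 'anonymous' anywhere in the name, case-insensitive
    lowered = filename.lower()
    return 'pap_anon' in lowered or 'anonymous' in lowered

def follows_id_convention(project_id, filename):
    # kept files must start with the EGAP id, e.g. '20151016AA_PAP.pdf'
    return filename.startswith(project_id)
```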
## Documentation

Code comments only

## Side Effects

None that I know of

## Ticket

https://openscience.atlassian.net/browse/ENG-1108
---
 scripts/EGAP/files_to_import_structure.py | 140 ++++++++++++++
 .../test_nonanonymous/20151016AA_FORM.pdf | Bin
 .../data/test_nonanonymous/20151016AA_PAP.pdf | Bin
 .../data/test_nonanonymous/justafile.pdf | Bin
 .../tests/test_files_to_import_structure.py | 55 +++++++
 5 files changed, 195 insertions(+)
 create mode 100644 scripts/EGAP/files_to_import_structure.py
 create mode 100644 scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_FORM.pdf
 create mode 100644 scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_PAP.pdf
 create mode 100644 scripts/tests/test_files/20151016AA/data/test_nonanonymous/justafile.pdf
 create mode 100644 scripts/tests/test_files_to_import_structure.py

diff --git a/scripts/EGAP/files_to_import_structure.py b/scripts/EGAP/files_to_import_structure.py
new file mode 100644
index 00000000000..89cbe3934ce
--- /dev/null
+++ b/scripts/EGAP/files_to_import_structure.py
@@ -0,0 +1,140 @@
+import os
+import re
+import shutil
+import argparse
+from distutils.dir_util import copy_tree
+
+from nose.tools import assert_equal
+
+# This takes the item id from the path of the project directory, for example '20121001AA Findley' -> '20121001AA'
+get_item_id = lambda _path: _path.split(os.sep)[-1].split(' ')[0]
+
+
+# Check if a file name starts with the EGAP id, for example '20121001AA_PAP.pdf'
+def check_id(root, item):
+    project_id = get_item_id(root.split('/')[-3])
+    return item.startswith(project_id)
+
+
+# Check if a file follows the anonymous naming convention
+check_anon = lambda item: 'pap_anon' in item.lower() or 'anonymous' in item.lower()
+
+
+def action_files_by_name(root, source, item_name):
+    """
+    Pick out anonymous files and create a new folder to move them into; remove ones that don't follow the id naming convention.
+    :param root:
+    :param source:
+    :param item_name:
+    :return:
+    """
+    if not check_id(root, item_name):
+        path = os.path.join(root, item_name)
+        os.remove(path)
+        return
+
+    if check_anon(item_name):
+        destination_parent = os.path.join('/'.join(root.split('/')[:-1]), 'anonymous')
+
+        if not os.path.exists(destination_parent):
+            os.mkdir(destination_parent)
+
+        destination = os.path.join(destination_parent, item_name)
+        shutil.move(source, destination)
+
+
+def audit_files(source):
+    including = open('including.txt', 'w+')
+    ignoring = open('ignoring.txt', 'w+')
+    for root, dir, files in os.walk(source):
+        for item in files:
+            name = os.path.join(root.split('/')[-1], item)  # get file/folder name after slash
+            if not check_id(root, name):
+                ignoring.writelines(name + '\r')
+            else:
+                including.writelines(name + '\r')
+
+    ignoring.close()
+    including.close()
+
+    projects = set(os.listdir(source))
+    project_ids = set([get_item_id(folders) for folders in list(projects)])
+
+    # check for duplicate ids
+    assert_equal(len(projects), len(project_ids))
+
+
+def main(files_dir, metadata_dir, id_list=None):
+    """
+    This is a script for our EGAP partnership that converts the EGAP-provided dump of files into a directory structure
+    we can easily import into the OSF. Some files in the dump are anonymous and need to be sorted into a special folder;
+    some don't follow an id naming convention and should be ignored and not imported.
+
+    This script copies the whole file tree for a project to preserve file hierarchy, then picks out anonymous files,
+    moves them to the anonymous folder, and deletes those that don't follow the naming convention.
+
+    This script can be safely removed once all EGAP registrations have been imported.
+
+    :param files_dir: the source path we're picking files out of
+    :param metadata_dir: a pre-made directory structure for importing projects that we are packing files into.
+    :return:
+    """
+    project_dirs = os.listdir(files_dir)
+    if id_list:
+        project_dirs = [project for project in project_dirs if get_item_id(project) in id_list]
+
+    # Copy the whole tree to preserve file hierarchy, then sort/remove files below
+    for item in project_dirs:
+        item_id = get_item_id(item)
+        source = os.path.join(files_dir, item)
+        destination = os.path.join(metadata_dir, item_id, 'data', 'nonanonymous')
+        if os.path.isdir(source):
+            copy_tree(source, destination)
+
+    for root, dir, files in os.walk(metadata_dir):
+        for item in files:
+            if item not in ('project.json', 'registration-schema.json'):
+                source = os.path.join(root, item)
+                action_files_by_name(root, source, item)
+
+    # Check all anon files ended up in an /anonymous/ directory
+    for root, dir, files in os.walk(metadata_dir):
+        for item in files:
+            if item not in ('project.json', 'registration-schema.json'):
+                if check_anon(item):
+                    assert '/anonymous' in root
+                else:
+                    assert '/nonanonymous' in root
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-source',
+        '--source',
+        help='This should be the directory for the EGAP data dump, traditionally called "EGAP_REGISTRY_staging/3 Registrations/"'
+    )
+    parser.add_argument(
+        '-destination',
+        '--destination',
+        help='This should be the directory of the import file structure containing the bags of data.'
+    )
+    parser.add_argument(
+        '-list',
+        '--list',
+        help='This is a list of ids to import into the new metadata directory.'
+    )
+    parser.add_argument(
+        '-audit',
+        '--audit',
+        help='This includes all files that don\'t follow the "_PAP" naming convention.'
+ ) + + args = parser.parse_args() + source = args.source + destination = args.destination + audit = args.audit + if audit: + audit_files(source) + else: + main(source, destination) diff --git a/scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_FORM.pdf b/scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_FORM.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_PAP.pdf b/scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_PAP.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/tests/test_files/20151016AA/data/test_nonanonymous/justafile.pdf b/scripts/tests/test_files/20151016AA/data/test_nonanonymous/justafile.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/tests/test_files_to_import_structure.py b/scripts/tests/test_files_to_import_structure.py new file mode 100644 index 00000000000..0778df7f82f --- /dev/null +++ b/scripts/tests/test_files_to_import_structure.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +import mock +from tests.base import OsfTestCase +from scripts.EGAP.files_to_import_structure import action_files_by_name + + +class TestEGAPFilesToImportStructure(OsfTestCase): + + @mock.patch('scripts.EGAP.files_to_import_structure.os.mkdir') + @mock.patch('scripts.EGAP.files_to_import_structure.shutil.move') + def test_doesnt_move_nonanon_files(self, mock_move, mock_mkdir): + action_files_by_name( + 'scripts/tests/test_files/20151016AA/data/datatest_nonanonymous', + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_PAP.pdf', + '20151016AA_PAP.pdf' + ) + assert not mock_mkdir.called + assert not mock_move.called + + @mock.patch('scripts.EGAP.files_to_import_structure.os.mkdir') + @mock.patch('scripts.EGAP.files_to_import_structure.shutil.move') + def test_moves_anon_files(self, mock_move, mock_mkdir): + action_files_by_name( + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous', + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_anonymous.pdf', + '20151016AA_anonymous.pdf' + ) + + mock_mkdir.assert_called_with('scripts/tests/test_files/20151016AA/data/anonymous') + + mock_move.assert_called_with( + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_anonymous.pdf', + 'scripts/tests/test_files/20151016AA/data/anonymous/20151016AA_anonymous.pdf' + ) + + @mock.patch('scripts.EGAP.files_to_import_structure.os.remove') + def test_removes_no_id(self, mock_remove): + action_files_by_name( + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous', + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous/justafile.pdf', + 'justafile.pdf' + ) + + mock_remove.assert_called_with('scripts/tests/test_files/20151016AA/data/test_nonanonymous/justafile.pdf') + + @mock.patch('scripts.EGAP.files_to_import_structure.os.remove') + def test_removes_form(self, mock_remove): + + action_files_by_name( + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous', + 'scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_FORM.pdf', + '20151016AA_FORM.pdf' + ) + + mock_remove.assert_called_with('scripts/tests/test_files/20151016AA/data/test_nonanonymous/20151016AA_FORM.pdf') From 256b97c35ebfc32073375fd3bbd800599786ee20 Mon Sep 17 00:00:00 2001 From: 
Yuhuai Liu
Date: Mon, 4 Nov 2019 10:44:57 -0500
Subject: [PATCH 5/9] Change submittingAuthor to Author (#9190)

## Purpose

All the remaining authors should be `Author` instead of `submittingAuthor`.

---
 osf/external/chronos.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/osf/external/chronos.py b/osf/external/chronos.py
index caf48790d23..64f19f7b51e 100644
--- a/osf/external/chronos.py
+++ b/osf/external/chronos.py
@@ -105,7 +105,7 @@ def serialize_author(cls, contributor):
         if contributor._order == 0:
             contribution = 'firstAuthor'
         else:
-            contribution = 'submittingAuthor'
+            contribution = 'Author'
         ret.update({
             'CONTRIBUTION': contribution,
             'ORGANIZATION': '',

From dc0e0852f801120081ff784b646ff5b19f3b741f Mon Sep 17 00:00:00 2001
From: John Tordoff
Date: Mon, 4 Nov 2019 15:35:55 -0500
Subject: [PATCH 6/9] [ENG-897] EGAP Ingester (#9183)

## Purpose

Turn those wonderful local file structures into EGAP projects!

## Changes

Adds a management command that creates a node with the proper contributors and uploads files, maintaining directory hierarchy, with tests.

## QA Notes

To set this up you must:
1) log in and create a private project
2) upload a zip of files using the EGAP import structure to the project
3) run `python manage.py import_EGAP -c= -id=`

- This is a low risk change that doesn't involve a migration.
- QA will be able to verify this with an IPython notebook. Once that's set up, they should be able to browse the registrations and see they were imported properly with the correct permissions and bibliographic status.
- This is unlikely to affect any other portion of the site in a substantive way.

This includes some unit tests, but it can easily be snagged on irregular data; if you find any, please report it to me.

## Documentation

Code comments, JIRA

## Side Effects

None that I know of.
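The directory layout the command consumes is the one produced by the earlier EGAP scripts; below is a small sanity check QA could run on each project directory before importing (a hypothetical helper for illustration, not part of import_EGAP itself):

```python
import os

def check_egap_bag(project_dir):
    # Each project directory should carry the two JSON files produced by
    # create_EGAP_json.py (patch 2)...
    for name in ('project.json', 'registration-schema.json'):
        assert os.path.isfile(os.path.join(project_dir, name)), name
    # ...plus the data/nonanonymous tree populated by
    # files_to_import_structure.py (patch 4); data/anonymous exists only
    # when the dump contained anonymous files.
    assert os.path.isdir(os.path.join(project_dir, 'data', 'nonanonymous'))
```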
## Ticket
https://openscience.atlassian.net/browse/ENG-897

---
 egap_assets.zip                               | Bin 0 -> 15691 bytes
 osf/management/commands/import_EGAP.py        | 189 ++++++++++++++++
 .../management_commands/test_EGAP_import.py   | 210 ++++++++++++++++++
 .../20120220AA/data/nonanonymous/test-1.txt   |   0
 .../data/nonanonymous/test_folder/test-2.txt  |   0
 .../EGAP/20120220AA/project.json              |   1 +
 .../EGAP/20120220AA/registration-schema.json  |   1 +
 .../test_directory/EGAP/test-egap.zip         | Bin 0 -> 15691 bytes
 8 files changed, 401 insertions(+)
 create mode 100644 egap_assets.zip
 create mode 100644 osf/management/commands/import_EGAP.py
 create mode 100644 osf_tests/management_commands/test_EGAP_import.py
 create mode 100644 osf_tests/management_commands/test_directory/EGAP/20120220AA/data/nonanonymous/test-1.txt
 create mode 100644 osf_tests/management_commands/test_directory/EGAP/20120220AA/data/nonanonymous/test_folder/test-2.txt
 create mode 100644 osf_tests/management_commands/test_directory/EGAP/20120220AA/project.json
 create mode 100644 osf_tests/management_commands/test_directory/EGAP/20120220AA/registration-schema.json
 create mode 100644 osf_tests/management_commands/test_directory/EGAP/test-egap.zip

diff --git a/egap_assets.zip b/egap_assets.zip
new file mode 100644
index 0000000000000000000000000000000000000000..85b13477ac26a1490e5c535d41920d5ca95cb46b
GIT binary patch
[literal 15691 -- base85-encoded binary data omitted]
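For reference, the test fixtures listed above imply the on-disk layout the ingester consumes: one "bag" directory per registration, holding `project.json`, `registration-schema.json`, and a `data/nonanonymous` tree. A minimal sketch of walking that layout (illustrative only; this is not the `import_EGAP` command itself, and the root path is a placeholder):

```python
import json
import os

# Illustrative root; the test fixtures above live under
# osf_tests/management_commands/test_directory/EGAP/.
egap_root = 'osf_tests/management_commands/test_directory/EGAP'

for bag_name in sorted(os.listdir(egap_root)):
    bag_path = os.path.join(egap_root, bag_name)
    if not os.path.isdir(bag_path):
        continue  # skip stray archives such as test-egap.zip
    # Each bag carries node metadata plus validated registration metadata.
    with open(os.path.join(bag_path, 'project.json')) as fp:
        project = json.load(fp)
    with open(os.path.join(bag_path, 'registration-schema.json')) as fp:
        registration = json.load(fp)
    # Files to upload live under data/nonanonymous (and data/anonymous, if
    # present), preserving the original directory hierarchy.
    data_dir = os.path.join(bag_path, 'data', 'nonanonymous')
    for root, _dirs, files in os.walk(data_dir):
        for file_name in files:
            rel_path = os.path.relpath(os.path.join(root, file_name), bag_path)
            print('{}: {}'.format(bag_name, rel_path))
```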
Date: Tue, 5 Nov 2019 15:49:25 -0500
Subject: [PATCH 7/9] Refactor audit, add requirements.txt, and update Jupyter notebook (#9192)

## Purpose
Integrate EGAP file processing with the Jupyter notebook.

## Changes
1. Add notebook integration of EGAP file processing
2. Add requirements.txt for Jupyter users
3. Refactor audit and file checking for more consistency between uses
4. Update variable names in the file script to avoid shadowing the outer scope

## QA Notes
- Does this change require a data migration? If so, what data will we migrate? _No data migration_
- What is the level of risk? _Low_
- Any permissions code touched? _No_
- Is this an additive or subtractive change, other? _Mostly additive, but with some changes to a recently added, not-yet-QA'd PR_
- How can QA verify? (Through UI, API, AdminApp or AdminAdminApp?) _Using the Jupyter notebook_
- If verifying through API, what's the new version? Please include the endpoints in PR notes or Dev docs.
- What features or workflows might this change impact? _EGAP migration_
- How will this impact performance? _It shouldn't_

### How to use:
- Create a virtual env for this and activate it
- Run `pip3 install -r requirements.txt` from within the `scripts/EGAP/` directory
- Run `jupyter lab` from the OSF base directory
- Open the ipynb file from within jupyter lab
- The first code block does setup. Set the variables in that block to point to the inputs and outputs described here:
  - `author_source` is the csv of authors from EGAP
  - `registry_source` is the csv of projects from EGAP
  - `raw_files_directory` is the path to the raw files from EGAP
  - `metadata_directory` is the path of the directory that will be created to hold all of the project data
  - `directories_to_process` is the list of project ids that should be processed in the file-moving step
- Run the first code block to import and set up everything
- Run the `create_file_tree_and_json` code block to create the metadata directory
- Run the `audit_files` code block to generate the ignoring and including text files inside the `scripts/EGAP` directory
- Run the `convert_files` block to move the files for the projects specified in `directories_to_process` from the raw data directory into the metadata directory (a condensed sketch of these steps follows below)

## Documentation
No external documentation

## Side Effects
There shouldn't be any.
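Condensed, the notebook workflow above boils down to a handful of calls, visible in the notebook diff that follows. A minimal sketch (the paths are placeholders for your local copies, and `directories_to_process` is truncated to two ids here; the full notebook lists every id to migrate):

```python
from create_EGAP_json import create_file_tree_and_json
from files_to_import_structure import audit_files, main as convert_files

# Placeholder paths -- point these at your local EGAP inputs and outputs.
author_source = '/path/to/20190821_author_emails.csv'    # csv of authors from EGAP
registry_source = '/path/to/20191014_OSF_database.csv'   # csv of projects from EGAP
raw_files_directory = '/path/to/raw_files/'              # raw files from EGAP
metadata_directory = '/path/to/metadata/'                # created to hold the project data
directories_to_process = ['20110302AA', '20110307AA']    # truncated for this sketch

# 1. Create the metadata directory: one bag per registry, containing
#    project.json and registration-schema.json.
create_file_tree_and_json(author_source, registry_source, metadata_directory)

# 2. Write the including.txt / ignoring.txt audit lists inside scripts/EGAP.
audit_files(raw_files_directory)

# 3. Move files for the listed projects from the raw data directory into the
#    metadata directory, splitting anonymous files into their own folder.
convert_files(raw_files_directory, metadata_directory, directories_to_process)
print('Done converting files.')
```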
## Ticket https://openscience.atlassian.net/browse/ENG-1194 --- scripts/EGAP/egap_workflow.ipynb | 1499 ++++++++++++++++++++- scripts/EGAP/files_to_import_structure.py | 71 +- scripts/EGAP/requirements.txt | 68 + 3 files changed, 1610 insertions(+), 28 deletions(-) create mode 100644 scripts/EGAP/requirements.txt diff --git a/scripts/EGAP/egap_workflow.ipynb b/scripts/EGAP/egap_workflow.ipynb index 750bb02b152..96f5af42703 100644 --- a/scripts/EGAP/egap_workflow.ipynb +++ b/scripts/EGAP/egap_workflow.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the `scripts/EGAP` folder, with your virtualenv active, `pip install -r requirements.txt`" + ] + }, { "cell_type": "code", "execution_count": null, @@ -7,10 +14,1481 @@ "outputs": [], "source": [ "from create_EGAP_json import create_file_tree_and_json\n", + "from files_to_import_structure import audit_files, main as convert_files\n", "\n", "author_source = '/Users/bgeiger/Desktop/EGAP/20190821_author_emails.csv'\n", "registry_source = '/Users/bgeiger/Desktop/EGAP/20191014_OSF_database.csv'\n", - "target_directory= '/Users/bgeiger/Desktop/EGAP/output/'" + "metadata_directory = '/Users/bgeiger/Desktop/EGAP/metadata/'\n", + "raw_files_directory = '/Users/bgeiger/Desktop/EGAP/raw_files/'\n", + "directories_to_process = [\n", + " '20110302AA',\n", + " '20110307AA',\n", + " '20120117AA',\n", + " '20120220AA',\n", + " '20120727AA',\n", + " '20120925AA',\n", + " '20120926AA',\n", + " '20121001AA',\n", + " '20121002AA',\n", + " '20121012AA',\n", + " '20121026AA',\n", + " '20121031AA',\n", + " '20121101AA',\n", + " '20121104AA',\n", + " '20121106AA',\n", + " '20121107AA',\n", + " '20121123AA',\n", + " '20121212AA',\n", + " '20130122AA',\n", + " '20130403AA',\n", + " '20130406AA',\n", + " '20130410AA',\n", + " '20130426AA',\n", + " '20130518AA',\n", + " '20130607AA',\n", + " '20130616AA',\n", + " '20130704AA',\n", + " '20130729AA',\n", + " '20130731AA',\n", + " '20130803AA',\n", + " '20130813AA',\n", + " '20130819AA',\n", + " '20130913AA',\n", + " '20130921AA',\n", + " '20131012AA',\n", + " '20131024AA',\n", + " '20131101AA',\n", + " '20131105AA',\n", + " '20131110AA',\n", + " '20131117AA',\n", + " '20131118AA',\n", + " '20131130AA',\n", + " '20131203AA',\n", + " '20131206AA',\n", + " '20131210AA',\n", + " '20131211AA',\n", + " '20131216AA',\n", + " '20131220AA',\n", + " '20140110AA',\n", + " '20140112AA',\n", + " '20140113AA',\n", + " '20140120AA',\n", + " '20140124AA',\n", + " '20140126AA',\n", + " '20140131AA',\n", + " '20140203AA',\n", + " '20140203AB',\n", + " '20140222AA',\n", + " '20140222AB',\n", + " '20140228AA',\n", + " '20140303AA',\n", + " '20140308AA',\n", + " '20140316AA',\n", + " '20140320AA',\n", + " '20140417AA',\n", + " '20140502AA',\n", + " '20140503AA',\n", + " '20140506AA',\n", + " '20140509AA',\n", + " '20140509AB',\n", + " '20140512AA',\n", + " '20140521AA',\n", + " '20140523AA',\n", + " '20140529AA',\n", + " '20140610AA',\n", + " '20140611AA',\n", + " '20140611AB',\n", + " '20140613AB',\n", + " '20140627AA',\n", + " '20140627AB',\n", + " '20140627AC',\n", + " '20140701AA',\n", + " '20140701AB',\n", + " '20140707AA',\n", + " '20140708AA',\n", + " '20140715AA',\n", + " '20140722AA',\n", + " '20140723AA',\n", + " '20140723AB',\n", + " '20140806AA',\n", + " '20140812AA',\n", + " '20140820AA',\n", + " '20140912AA',\n", + " '20140915AA',\n", + " '20140918AA',\n", + " '20140922AA',\n", + " '20141002AA',\n", + " '20141006AA',\n", + " '20141023AA',\n", + " '20141025AA',\n", + " 
'20141027AA',\n", + " '20141031AA',\n", + " '20141031AB',\n", + " '20141101AA',\n", + " '20141103AA',\n", + " '20141107AA',\n", + " '20141117AA',\n", + " '20141202AA',\n", + " '20141208AA',\n", + " '20141213AA',\n", + " '20141223AA',\n", + " '20141225AA',\n", + " '20141227AA',\n", + " '20141231AA',\n", + " '20150110AA',\n", + " '20150111AA',\n", + " '20150118AA',\n", + " '20150122AA',\n", + " '20150122AB',\n", + " '20150127AA',\n", + " '20150131AA',\n", + " '20150202AA',\n", + " '20150204AA',\n", + " '20150206AA',\n", + " '20150211AA',\n", + " '20150216AA',\n", + " '20150304AA',\n", + " '20150308AA',\n", + " '20150309AA',\n", + " '20150310AA',\n", + " '20150311AA',\n", + " '20150313AA',\n", + " '20150320AA',\n", + " '20150323AA',\n", + " '20150324AA',\n", + " '20150326AA',\n", + " '20150330AA',\n", + " '20150420AA',\n", + " '20150423AA',\n", + " '20150428AA',\n", + " '20150429AA',\n", + " '20150508AA',\n", + " '20150513AA',\n", + " '20150513AB',\n", + " '20150513AC',\n", + " '20150513AD',\n", + " '20150513AE',\n", + " '20150513AF',\n", + " '20150513AG',\n", + " '20150513AH',\n", + " '20150513AI',\n", + " '20150514AA',\n", + " '20150517AA',\n", + " '20150518AA',\n", + " '20150520AA',\n", + " '20150522AA',\n", + " '20150526AA',\n", + " '20150527AA',\n", + " '20150602AA',\n", + " '20150602AB',\n", + " '20150603AA',\n", + " '20150604AA',\n", + " '20150605AA',\n", + " '20150605AB',\n", + " '20150616AA',\n", + " '20150617AA',\n", + " '20150619AA',\n", + " '20150622AA',\n", + " '20150623AA',\n", + " '20150701AA',\n", + " '20150702AA',\n", + " '20150703AA',\n", + " '20150707AA',\n", + " '20150708AA',\n", + " '20150709AA',\n", + " '20150709AB',\n", + " '20150710AA',\n", + " '20150713AA',\n", + " '20150716AA',\n", + " '20150716AB',\n", + " '20150717AA',\n", + " '20150718AA',\n", + " '20150720AA',\n", + " '20150723AA',\n", + " '20150724AA',\n", + " '20150727AA',\n", + " '20150731AA',\n", + " '20150731AB',\n", + " '20150803AA',\n", + " '20150803AB',\n", + " '20150812AA',\n", + " '20150813AA',\n", + " '20150813AB',\n", + " '20150819AA',\n", + " '20150819AB',\n", + " '20150820AA',\n", + " '20150824AA',\n", + " '20150824AB',\n", + " '20150825AA',\n", + " '20150827AA',\n", + " '20150903AA',\n", + " '20150903AB',\n", + " '20150914AA',\n", + " '20150915AA',\n", + " '20150917AA',\n", + " '20150921AA',\n", + " '20150922AA',\n", + " '20150924AA',\n", + " '20150925AA',\n", + " '20150927AA',\n", + " '20150928AA',\n", + " '20150928AB',\n", + " '20150929AA',\n", + " '20150929AB',\n", + " '20150930AA',\n", + " '20150930AB',\n", + " '20151003AA',\n", + " '20151006AA',\n", + " '20151006AB',\n", + " '20151012AA',\n", + " '20151013AA',\n", + " '20151013AB',\n", + " '20151014AA',\n", + " '20151014AB',\n", + " '20151016AA',\n", + " '20151016AB',\n", + " '20151016AC',\n", + " '20151017AA',\n", + " '20151019AA',\n", + " '20151023AA',\n", + " '20151027AA',\n", + " '20151030AA',\n", + " '20151102AA',\n", + " '20151102AB',\n", + " '20151102AC',\n", + " '20151103AA',\n", + " '20151107AA',\n", + " '20151112AA',\n", + " '20151112AB',\n", + " '20151114AA',\n", + " '20151116AA',\n", + " '20151116AB',\n", + " '20151118AA',\n", + " '20151119AA',\n", + " '20151119AB',\n", + " '20151120AA',\n", + " '20151120AB',\n", + " '20151123AA',\n", + " '20151125AA',\n", + " '20151125AB',\n", + " '20151128AA',\n", + " '20151201AA',\n", + " '20151201AB',\n", + " '20151202AA',\n", + " '20151204AA',\n", + " '20151206AA',\n", + " '20151207AA',\n", + " '20151209AA',\n", + " '20151218AA',\n", + " '20160105AA',\n", + " '20160106AA',\n", + " 
'20160112AA',\n", + " '20160112AB',\n", + " '20160113AA',\n", + " '20160113AB',\n", + " '20160119AA',\n", + " '20160121AA',\n", + " '20160202AA',\n", + " '20160208AA',\n", + " '20160208AB',\n", + " '20160208AC',\n", + " '20160216AA',\n", + " '20160217AA',\n", + " '20160219AA',\n", + " '20160222AA',\n", + " '20160224AA',\n", + " '20160224AB',\n", + " '20160225AA',\n", + " '20160308AA',\n", + " '20160308AB',\n", + " '20160309AA',\n", + " '20160313AA',\n", + " '20160315AA',\n", + " '20160318AA',\n", + " '20160321AA',\n", + " '20160323AA',\n", + " '20160324AA',\n", + " '20160327AA',\n", + " '20160330AA',\n", + " '20160401AA',\n", + " '20160404AA',\n", + " '20160404AB',\n", + " '20160405AA',\n", + " '20160405AB',\n", + " '20160406AA',\n", + " '20160408AA',\n", + " '20160409AA',\n", + " '20160409AB',\n", + " '20160410AA',\n", + " '20160411AA',\n", + " '20160411AB',\n", + " '20160411AC',\n", + " '20160411AD',\n", + " '20160413AA',\n", + " '20160414AA',\n", + " '20160415AA',\n", + " '20160416AA',\n", + " '20160416AB',\n", + " '20160421AA',\n", + " '20160426AA',\n", + " '20160427AA',\n", + " '20160427AB',\n", + " '20160429AA',\n", + " '20160429AB',\n", + " '20160507AA',\n", + " '20160514AA',\n", + " '20160515AA',\n", + " '20160516AA',\n", + " '20160517AA',\n", + " '20160517AB',\n", + " '20160517AC',\n", + " '20160517AD',\n", + " '20160517AE',\n", + " '20160519AA',\n", + " '20160520AA',\n", + " '20160524AA',\n", + " '20160531AA',\n", + " '20160601AA',\n", + " '20160601AB',\n", + " '20160601AC',\n", + " '20160601AD',\n", + " '20160602AA',\n", + " '20160603AA',\n", + " '20160605AA',\n", + " '20160607AA',\n", + " '20160607AB',\n", + " '20160609AA',\n", + " '20160609AB',\n", + " '20160609AC',\n", + " '20160611AA',\n", + " '20160612AA',\n", + " '20160613AA',\n", + " '20160613AB',\n", + " '20160615AA',\n", + " '20160615AB',\n", + " '20160616AA',\n", + " '20160617AA',\n", + " '20160617AB',\n", + " '20160618AA',\n", + " '20160621AA',\n", + " '20160621AB',\n", + " '20160621AC',\n", + " '20160621AD',\n", + " '20160622AA',\n", + " '20160622AB',\n", + " '20160624AA',\n", + " '20160624AB',\n", + " '20160625AA',\n", + " '20160628AA',\n", + " '20160629AA',\n", + " '20160630AA',\n", + " '20160702AA',\n", + " '20160708AA',\n", + " '20160708AB',\n", + " '20160712AA',\n", + " '20160717AA',\n", + " '20160719AA',\n", + " '20160719AB',\n", + " '20160721AA',\n", + " '20160726AA',\n", + " '20160726AB',\n", + " '20160727AA',\n", + " '20160729AA',\n", + " '20160730AA',\n", + " '20160801AA',\n", + " '20160801AB',\n", + " '20160803AA',\n", + " '20160803AB',\n", + " '20160809AA',\n", + " '20160809AB',\n", + " '20160812AA',\n", + " '20160812AB',\n", + " '20160813AA',\n", + " '20160813AB',\n", + " '20160813AC',\n", + " '20160819AA',\n", + " '20160819AB',\n", + " '20160820AA',\n", + " '20160822AA',\n", + " '20160824AA',\n", + " '20160825AA',\n", + " '20160829AA',\n", + " '20160831AA',\n", + " '20160905AA',\n", + " '20160907AA',\n", + " '20160911AA',\n", + " '20160912AB',\n", + " '20160913AA',\n", + " '20160913AB',\n", + " '20160916AA',\n", + " '20160916AB',\n", + " '20160918AA',\n", + " '20160919AA',\n", + " '20160921AA',\n", + " '20160921AB',\n", + " '20160926AA',\n", + " '20160926AB',\n", + " '20160926AC',\n", + " '20161001AA',\n", + " '20161004AA',\n", + " '20161006AA',\n", + " '20161011AA',\n", + " '20161016AA',\n", + " '20161017AA',\n", + " '20161019AA',\n", + " '20161019AB',\n", + " '20161020AA',\n", + " '20161026AA',\n", + " '20161028AA',\n", + " '20161030AA',\n", + " '20161101AA',\n", + " '20161101AB',\n", + " 
'20161103AA',\n", + " '20161103AB',\n", + " '20161104AA',\n", + " '20161104AB',\n", + " '20161109AA',\n", + " '20161109AB',\n", + " '20161109AC',\n", + " '20161110AA',\n", + " '20161110AB',\n", + " '20161110AC',\n", + " '20161110AD',\n", + " '20161110AE',\n", + " '20161112AA',\n", + " '20161118AA',\n", + " '20161121AA',\n", + " '20161122AA',\n", + " '20161122AB',\n", + " '20161123AA',\n", + " '20161125AA',\n", + " '20161127AA',\n", + " '20161128AA',\n", + " '20161129AA',\n", + " '20161204AA',\n", + " '20161204AB',\n", + " '20161205AA',\n", + " '20161206AA',\n", + " '20161207AA',\n", + " '20161208AA',\n", + " '20161212AA',\n", + " '20161216AA',\n", + " '20161216AB',\n", + " '20161227AA',\n", + " '20161227AB',\n", + " '20170103AA',\n", + " '20170109AA',\n", + " '20170112AA',\n", + " '20170115AA',\n", + " '20170117AA',\n", + " '20170118AA',\n", + " '20170123AA',\n", + " '20170124AA',\n", + " '20170130AA',\n", + " '20170131AA',\n", + " '20170203AA',\n", + " '20170203AB',\n", + " '20170203AC',\n", + " '20170203AD',\n", + " '20170205AA',\n", + " '20170207AA',\n", + " '20170208AA',\n", + " '20170209AA',\n", + " '20170210AA',\n", + " '20170212AA',\n", + " '20170214AA',\n", + " '20170214AB',\n", + " '20170215AA',\n", + " '20170216AA',\n", + " '20170216AB',\n", + " '20170220AA',\n", + " '20170222AA',\n", + " '20170223AA',\n", + " '20170223AB',\n", + " '20170223AC',\n", + " '20170224AA',\n", + " '20170225AA',\n", + " '20170227AA',\n", + " '20170227AB',\n", + " '20170227AC',\n", + " '20170301AA',\n", + " '20170302AA',\n", + " '20170307AA',\n", + " '20170308AA',\n", + " '20170308AB',\n", + " '20170309AA',\n", + " '20170310AA',\n", + " '20170312AA',\n", + " '20170317AA',\n", + " '20170320AA',\n", + " '20170320AB',\n", + " '20170320AC',\n", + " '20170321AA',\n", + " '20170321AB',\n", + " '20170322AA',\n", + " '20170322AB',\n", + " '20170323AA',\n", + " '20170324AA',\n", + " '20170325AA',\n", + " '20170328AA',\n", + " '20170329AA',\n", + " '20170330AA',\n", + " '20170403AA',\n", + " '20170412AA',\n", + " '20170412AB',\n", + " '20170413AA',\n", + " '20170413AB',\n", + " '20170413AC',\n", + " '20170414AA',\n", + " '20170416AA',\n", + " '20170417AA',\n", + " '20170417AB',\n", + " '20170420AA',\n", + " '20170421AA',\n", + " '20170422AA',\n", + " '20170423AA',\n", + " '20170423AB',\n", + " '20170426AA',\n", + " '20170427AA',\n", + " '20170428AA',\n", + " '20170501AA',\n", + " '20170501AB',\n", + " '20170501AC',\n", + " '20170501AD',\n", + " '20170503AA',\n", + " '20170503AB',\n", + " '20170503AC',\n", + " '20170504AA',\n", + " '20170504AB',\n", + " '20170505AA',\n", + " '20170505AB',\n", + " '20170505AC',\n", + " '20170506AA',\n", + " '20170507AA',\n", + " '20170508AA',\n", + " '20170508AB',\n", + " '20170509AA',\n", + " '20170509AB',\n", + " '20170510AA',\n", + " '20170511AA',\n", + " '20170515AA',\n", + " '20170515AB',\n", + " '20170516AA',\n", + " '20170517AA',\n", + " '20170519AA',\n", + " '20170520AA',\n", + " '20170522AA',\n", + " '20170523AA',\n", + " '20170524AA',\n", + " '20170525AA',\n", + " '20170525AB',\n", + " '20170527AA',\n", + " '20170531AA',\n", + " '20170602AA',\n", + " '20170603AA',\n", + " '20170606AA',\n", + " '20170608AA',\n", + " '20170609AA',\n", + " '20170609AB',\n", + " '20170609AC',\n", + " '20170611AA',\n", + " '20170611AB',\n", + " '20170612AA',\n", + " '20170613AA',\n", + " '20170614AA',\n", + " '20170615AA',\n", + " '20170615AB',\n", + " '20170616AA',\n", + " '20170617AA',\n", + " '20170618AA',\n", + " '20170619AA',\n", + " '20170626AA',\n", + " '20170626AB',\n", + " 
'20170629AA',\n", + " '20170705AA',\n", + " '20170706AA',\n", + " '20170706AB',\n", + " '20170706AC',\n", + " '20170711AA',\n", + " '20170712AA',\n", + " '20170714AA',\n", + " '20170716AA',\n", + " '20170716AB',\n", + " '20170717AA',\n", + " '20170720AA',\n", + " '20170720AB',\n", + " '20170720AC',\n", + " '20170721AA',\n", + " '20170721AB',\n", + " '20170724AA',\n", + " '20170725AA',\n", + " '20170725AB',\n", + " '20170727AA',\n", + " '20170728AA',\n", + " '20170728AB',\n", + " '20170729AA',\n", + " '20170731AA',\n", + " '20170803AA',\n", + " '20170804AA',\n", + " '20170805AA',\n", + " '20170807AA',\n", + " '20170808AA',\n", + " '20170809AA',\n", + " '20170810AA',\n", + " '20170811AA',\n", + " '20170811AB',\n", + " '20170814AA',\n", + " '20170815AA',\n", + " '20170816AA',\n", + " '20170819AA',\n", + " '20170821AA',\n", + " '20170821AB',\n", + " '20170822AA',\n", + " '20170823AA',\n", + " '20170828AA',\n", + " '20170829AA',\n", + " '20170831AA',\n", + " '20170901AA',\n", + " '20170905AA',\n", + " '20170906AA',\n", + " '20170907AA',\n", + " '20170908AA',\n", + " '20170908AB',\n", + " '20170908AC',\n", + " '20170910AA',\n", + " '20170911AA',\n", + " '20170913AA',\n", + " '20170913AB',\n", + " '20170913AC',\n", + " '20170914AA',\n", + " '20170914AB',\n", + " '20170915AA',\n", + " '20170915AB',\n", + " '20170918AA',\n", + " '20170919AA',\n", + " '20170920AA',\n", + " '20170920AB',\n", + " '20170920AC',\n", + " '20170921AA',\n", + " '20170922AA',\n", + " '20170922AB',\n", + " '20170922AC',\n", + " '20170925AA',\n", + " '20170926AA',\n", + " '20170926AB',\n", + " '20170927AA',\n", + " '20170927AB',\n", + " '20170928AA',\n", + " '20170929AA',\n", + " '20170930AA',\n", + " '20171001AA',\n", + " '20171001AB',\n", + " '20171002AA',\n", + " '20171003AA',\n", + " '20171003AB',\n", + " '20171004AA',\n", + " '20171009AA',\n", + " '20171010AA',\n", + " '20171010AB',\n", + " '20171012AA',\n", + " '20171013AA',\n", + " '20171015AA',\n", + " '20171016AA',\n", + " '20171017AA',\n", + " '20171018AA',\n", + " '20171019AA',\n", + " '20171020AA',\n", + " '20171022AA',\n", + " '20171023AA',\n", + " '20171024AA',\n", + " '20171024AB',\n", + " '20171024AC',\n", + " '20171025AA',\n", + " '20171027AA',\n", + " '20171101AA',\n", + " '20171103AA',\n", + " '20171104AA',\n", + " '20171104AB',\n", + " '20171105AA',\n", + " '20171106AA',\n", + " '20171106AB',\n", + " '20171106AC',\n", + " '20171107AA',\n", + " '20171109AA',\n", + " '20171109AB',\n", + " '20171113AA',\n", + " '20171113AB',\n", + " '20171113AC',\n", + " '20171114AA',\n", + " '20171115AA',\n", + " '20171117AA',\n", + " '20171117AB',\n", + " '20171117AC',\n", + " '20171119AA',\n", + " '20171120AA',\n", + " '20171120AB',\n", + " '20171121AA',\n", + " '20171121AB',\n", + " '20171122AA',\n", + " '20171122AB',\n", + " '20171122AC',\n", + " '20171124AA',\n", + " '20171127AA',\n", + " '20171127AB',\n", + " '20171128AA',\n", + " '20171129AA',\n", + " '20171205AA',\n", + " '20171205AB',\n", + " '20171206AA',\n", + " '20171208AA',\n", + " '20171210AA',\n", + " '20171210AB',\n", + " '20171211AA',\n", + " '20171211AB',\n", + " '20171211AC',\n", + " '20171213AA',\n", + " '20171218AA',\n", + " '20171218AB',\n", + " '20171218AC',\n", + " '20171221AA',\n", + " '20171222AA',\n", + " '20171223AA',\n", + " '20171228AA',\n", + " '20171229AA',\n", + " '20171230AA',\n", + " '20180105AA',\n", + " '20180105AB',\n", + " '20180105AC',\n", + " '20180105AD',\n", + " '20180108AA',\n", + " '20180108AB',\n", + " '20180109AA',\n", + " '20180109AB',\n", + " '20180110AA',\n", + " 
'20180110AB',\n", + " '20180113AA',\n", + " '20180119AA',\n", + " '20180120AA',\n", + " '20180121AA',\n", + " '20180123AA',\n", + " '20180124AA',\n", + " '20180125AA',\n", + " '20180126AA',\n", + " '20180126AB',\n", + " '20180127AA',\n", + " '20180128AA',\n", + " '20180130AA',\n", + " '20180201AA',\n", + " '20180201AB',\n", + " '20180201AC',\n", + " '20180202AA',\n", + " '20180202AB',\n", + " '20180202AC',\n", + " '20180204AA',\n", + " '20180204AB',\n", + " '20180205AA',\n", + " '20180205AB',\n", + " '20180205AC',\n", + " '20180206AA',\n", + " '20180208AA',\n", + " '20180208AB',\n", + " '20180209AA',\n", + " '20180211AA',\n", + " '20180213AA',\n", + " '20180213AB',\n", + " '20180214AA',\n", + " '20180215AA',\n", + " '20180215AB',\n", + " '20180215AC',\n", + " '20180219AA',\n", + " '20180219AB',\n", + " '20180220AA',\n", + " '20180221AA',\n", + " '20180221AB',\n", + " '20180222AA',\n", + " '20180222AB',\n", + " '20180227AA',\n", + " '20180228AA',\n", + " '20180228AB',\n", + " '20180302AA',\n", + " '20180303AA',\n", + " '20180304AA',\n", + " '20180304AB',\n", + " '20180304AC',\n", + " '20180304AD',\n", + " '20180305AA',\n", + " '20180306AA',\n", + " '20180308AA',\n", + " '20180310AA',\n", + " '20180313AA',\n", + " '20180315AA',\n", + " '20180315AB',\n", + " '20180315AC',\n", + " '20180316AA',\n", + " '20180316AB',\n", + " '20180318AA',\n", + " '20180319AA',\n", + " '20180319AB',\n", + " '20180319AC',\n", + " '20180320AA',\n", + " '20180321AA',\n", + " '20180323AA',\n", + " '20180323AB',\n", + " '20180324AA',\n", + " '20180325AA',\n", + " '20180327AA',\n", + " '20180328AA',\n", + " '20180329AA',\n", + " '20180331AA',\n", + " '20180401AA',\n", + " '20180402AA',\n", + " '20180402AB',\n", + " '20180403AA',\n", + " '20180404AA',\n", + " '20180409AA',\n", + " '20180409AB',\n", + " '20180409AC',\n", + " '20180413AA',\n", + " '20180413AB',\n", + " '20180416AA',\n", + " '20180417AA',\n", + " '20180418AA',\n", + " '20180418AB',\n", + " '20180423AA',\n", + " '20180424AA',\n", + " '20180425AA',\n", + " '20180425AB',\n", + " '20180425AC',\n", + " '20180425AD',\n", + " '20180426AA',\n", + " '20180426AB',\n", + " '20180426AC',\n", + " '20180427AA',\n", + " '20180430AA',\n", + " '20180430AB',\n", + " '20180430AC',\n", + " '20180502AA',\n", + " '20180503AA',\n", + " '20180503AB',\n", + " '20180504AA',\n", + " '20180507AA',\n", + " '20180508AA',\n", + " '20180509AA',\n", + " '20180509AB',\n", + " '20180514AA',\n", + " '20180515AA',\n", + " '20180515AB',\n", + " '20180516AA',\n", + " '20180516AB',\n", + " '20180518AA',\n", + " '20180521AA',\n", + " '20180521AB',\n", + " '20180523AA',\n", + " '20180528AA',\n", + " '20180529AA',\n", + " '20180529AB',\n", + " '20180529AC',\n", + " '20180530AA',\n", + " '20180601AA',\n", + " '20180602AA',\n", + " '20180605AA',\n", + " '20180605AB',\n", + " '20180605AC',\n", + " '20180605AD',\n", + " '20180607AA',\n", + " '20180608AA',\n", + " '20180608AB',\n", + " '20180610AA',\n", + " '20180610AB',\n", + " '20180611AA',\n", + " '20180611AB',\n", + " '20180612AA',\n", + " '20180613AA',\n", + " '20180614AA',\n", + " '20180614AB',\n", + " '20180615AA',\n", + " '20180616AA',\n", + " '20180619AA',\n", + " '20180619AB',\n", + " '20180620AA',\n", + " '20180625AA',\n", + " '20180626AA',\n", + " '20180628AA',\n", + " '20180628AB',\n", + " '20180701AA',\n", + " '20180703AA',\n", + " '20180703AB',\n", + " '20180703AC',\n", + " '20180707AA',\n", + " '20180709AA',\n", + " '20180709AB',\n", + " '20180710AA',\n", + " '20180710AB',\n", + " '20180710AC',\n", + " '20180711AA',\n", + " 
'20180711AB',\n", + " '20180712AA',\n", + " '20180713AA',\n", + " '20180716AA',\n", + " '20180719AA',\n", + " '20180720AA',\n", + " '20180722AA',\n", + " '20180723AA',\n", + " '20180723AB',\n", + " '20180724AA',\n", + " '20180724AB',\n", + " '20180725AA',\n", + " '20180725AB',\n", + " '20180725AC',\n", + " '20180730AA',\n", + " '20180731AA',\n", + " '20180801AA',\n", + " '20180801AB',\n", + " '20180802AA',\n", + " '20180802AB',\n", + " '20180803AA',\n", + " '20180804AA',\n", + " '20180807AA',\n", + " '20180807AB',\n", + " '20180808AA',\n", + " '20180809AA',\n", + " '20180809AB',\n", + " '20180809AC',\n", + " '20180810AA',\n", + " '20180811AA',\n", + " '20180812AA',\n", + " '20180814AA',\n", + " '20180814AB',\n", + " '20180815AA',\n", + " '20180816AA',\n", + " '20180816AB',\n", + " '20180817AA',\n", + " '20180819AA',\n", + " '20180819AB',\n", + " '20180821AA',\n", + " '20180821AB',\n", + " '20180822AA',\n", + " '20180826AA',\n", + " '20180827AA',\n", + " '20180827AB',\n", + " '20180829AA',\n", + " '20180831AA',\n", + " '20180903AA',\n", + " '20180904AA',\n", + " '20180904AB',\n", + " '20180905AA',\n", + " '20180906AA',\n", + " '20180910AA',\n", + " '20180910AB',\n", + " '20180912AA',\n", + " '20180914AA',\n", + " '20180918AA',\n", + " '20180918AB',\n", + " '20180919AA',\n", + " '20180920AA',\n", + " '20180920AB',\n", + " '20180925AA',\n", + " '20180925AB',\n", + " '20180927AA',\n", + " '20180927AB',\n", + " '20181001AA',\n", + " '20181003AA',\n", + " '20181005AA',\n", + " '20181006AA',\n", + " '20181010AA',\n", + " '20181010AB',\n", + " '20181012AA',\n", + " '20181013AA',\n", + " '20181015AA',\n", + " '20181016AA',\n", + " '20181017AA',\n", + " '20181017AB',\n", + " '20181018AA',\n", + " '20181019AA',\n", + " '20181022AA',\n", + " '20181023AA',\n", + " '20181023AB',\n", + " '20181024AA',\n", + " '20181024AB',\n", + " '20181024AC',\n", + " '20181025AA',\n", + " '20181026AA',\n", + " '20181029AA',\n", + " '20181030AA',\n", + " '20181030AB',\n", + " '20181031AA',\n", + " '20181101AA',\n", + " '20181101AB',\n", + " '20181101AC',\n", + " '20181101AD',\n", + " '20181102AA',\n", + " '20181102AB',\n", + " '20181102AC',\n", + " '20181102AD',\n", + " '20181105AA',\n", + " '20181105AB',\n", + " '20181105AC',\n", + " '20181105AD',\n", + " '20181106AA',\n", + " '20181106AB',\n", + " '20181106AC',\n", + " '20181106AD',\n", + " '20181106AE',\n", + " '20181106AF',\n", + " '20181107AA',\n", + " '20181108AA',\n", + " '20181108AB',\n", + " '20181108AC',\n", + " '20181110AA',\n", + " '20181111AA',\n", + " '20181112AA',\n", + " '20181112AB',\n", + " '20181112AC',\n", + " '20181112AD',\n", + " '20181113AA',\n", + " '20181114AA',\n", + " '20181115AA',\n", + " '20181115AB',\n", + " '20181115AC',\n", + " '20181115AD',\n", + " '20181120AA',\n", + " '20181120AB',\n", + " '20181120AC',\n", + " '20181123AA',\n", + " '20181125AA',\n", + " '20181126AA',\n", + " '20181126AB',\n", + " '20181127AA',\n", + " '20181127AB',\n", + " '20181127AC',\n", + " '20181128AA',\n", + " '20181129AA',\n", + " '20181129AB',\n", + " '20181130AA',\n", + " '20181130AB',\n", + " '20181201AA',\n", + " '20181201AB',\n", + " '20181204AA',\n", + " '20181204AB',\n", + " '20181204AC',\n", + " '20181205AA',\n", + " '20181205AB',\n", + " '20181206AA',\n", + " '20181206AB',\n", + " '20181206AC',\n", + " '20181206AD',\n", + " '20181206AE',\n", + " '20181206AF',\n", + " '20181206AG',\n", + " '20181207AA',\n", + " '20181208AA',\n", + " '20181210AA',\n", + " '20181210AB',\n", + " '20181211AA',\n", + " '20181211AB',\n", + " '20181211AC',\n", + " 
'20181212AA',\n", + " '20181214AA',\n", + " '20181216AA',\n", + " '20181216AB',\n", + " '20181219AA',\n", + " '20181219AB',\n", + " '20181221AA',\n", + " '20181221AB',\n", + " '20181221AC',\n", + " '20181222AA',\n", + " '20181228AA',\n", + " '20181230AA',\n", + " '20190103AA',\n", + " '20190108AA',\n", + " '20190109AA',\n", + " '20190110AA',\n", + " '20190110AB',\n", + " '20190110AC',\n", + " '20190111AA',\n", + " '20190113AA',\n", + " '20190113AB',\n", + " '20190114AA',\n", + " '20190115AA',\n", + " '20190116AA',\n", + " '20190116AB',\n", + " '20190116AC',\n", + " '20190119AA',\n", + " '20190121AA',\n", + " '20190122AA',\n", + " '20190125AA',\n", + " '20190128AA',\n", + " '20190128AB',\n", + " '20190128AC',\n", + " '20190129AA',\n", + " '20190129AB',\n", + " '20190130AA',\n", + " '20190131AA',\n", + " '20190131AB',\n", + " '20190131AC',\n", + " '20190201AA',\n", + " '20190201AB',\n", + " '20190201AC',\n", + " '20190202AA',\n", + " '20190204AA',\n", + " '20190205AA',\n", + " '20190205AB',\n", + " '20190205AC',\n", + " '20190206AA',\n", + " '20190206AB',\n", + " '20190206AC',\n", + " '20190208AA',\n", + " '20190212AA',\n", + " '20190212AB',\n", + " '20190213AA',\n", + " '20190213AB',\n", + " '20190213AC',\n", + " '20190213AD',\n", + " '20190213AE',\n", + " '20190215AA',\n", + " '20190215AB',\n", + " '20190215AC',\n", + " '20190215AD',\n", + " '20190215AE',\n", + " '20190215AF',\n", + " '20190219AA',\n", + " '20190220AA',\n", + " '20190220AB',\n", + " '20190220AC',\n", + " '20190221AA',\n", + " '20190223AA',\n", + " '20190223AB',\n", + " '20190223AC',\n", + " '20190225AA',\n", + " '20190227AA',\n", + " '20190227AB',\n", + " '20190301AA',\n", + " '20190301AB',\n", + " '20190301AC',\n", + " '20190301AD',\n", + " '20190304AA',\n", + " '20190304AB',\n", + " '20190304AC',\n", + " '20190304AD',\n", + " '20190305AA',\n", + " '20190306AA',\n", + " '20190306AB',\n", + " '20190307AA',\n", + " '20190309AA',\n", + " '20190310AA',\n", + " '20190311AA',\n", + " '20190311AB',\n", + " '20190311AC',\n", + " '20190312AA',\n", + " '20190313AA',\n", + " '20190313AB',\n", + " '20190313AC',\n", + " '20190313AD',\n", + " '20190313AE',\n", + " '20190314AA',\n", + " '20190314AB',\n", + " '20190314AC',\n", + " '20190314AD',\n", + " '20190314AE',\n", + " '20190314AF',\n", + " '20190314AG',\n", + " '20190314AH',\n", + " '20190314AI',\n", + " '20190314AJ',\n", + " '20190314AK',\n", + " '20190314AL',\n", + " '20190315AA',\n", + " '20190315AB',\n", + " '20190315AC',\n", + " '20190315AD',\n", + " '20190320AA',\n", + " '20190320AB',\n", + " '20190320AC',\n", + " '20190320AD',\n", + " '20190325AA',\n", + " '20190326AA',\n", + " '20190326AB',\n", + " '20190327AA',\n", + " '20190327AB',\n", + " '20190327AC',\n", + " '20190328AA',\n", + " '20190329AA',\n", + " '20190329AB',\n", + " '20190401AA',\n", + " '20190401AB',\n", + " '20190402AA',\n", + " '20190404AA',\n", + " '20190405AA',\n", + " '20190406AA',\n", + " '20190410AA',\n", + " '20190410AB',\n", + " '20190410AC',\n", + " '20190411AA',\n", + " '20190411AB',\n", + " '20190411AC',\n", + " '20190412AA',\n", + " '20190416AA',\n", + " '20190417AA',\n", + " '20190417AB',\n", + " '20190418AA',\n", + " '20190420AA',\n", + " '20190422AA',\n", + " '20190423AA',\n", + " '20190424AA',\n", + " '20190424AB',\n", + " '20190426AA',\n", + " '20190427AA',\n", + " '20190427AB',\n", + " '20190429AA',\n", + " '20190430AA',\n", + " '20190430AB',\n", + " '20190430AC',\n", + " '20190503AA',\n", + " '20190503AB',\n", + " '20190506AA',\n", + " '20190507AA',\n", + " '20190507AB',\n", + " 
'20190512AA',\n", + " '20190513AA',\n", + " '20190513AB',\n", + " '20190513AC',\n", + " '20190514AA',\n", + " '20190515AA',\n", + " '20190515AB',\n", + " '20190515AC',\n", + " '20190515AD',\n", + " '20190515AE',\n", + " '20190516AA',\n", + " '20190517AA',\n", + " '20190520AA',\n", + " '20190522AA',\n", + " '20190522AB',\n", + " '20190522AC',\n", + " '20190522AD',\n", + " '20190522AE',\n", + " '20190522AF',\n", + " '20190524AA',\n", + " '20190524AB',\n", + " '20190526AA',\n", + " '20190526AB',\n", + " '20190527AA',\n", + " '20190527AB',\n", + " '20190528AA',\n", + " '20190528AB',\n", + " '20190528AC',\n", + " '20190529AA',\n", + " '20190529AB',\n", + " '20190529AC',\n", + " '20190530AA',\n", + " '20190530AB',\n", + " '20190530AC',\n", + " '20190531AA',\n", + " '20190603AA',\n", + " '20190604AA',\n", + " '20190604AB',\n", + " '20190604AC',\n", + " '20190604AD',\n", + " '20190604AE',\n", + " '20190605AA',\n", + " '20190605AB',\n", + " '20190605AC',\n", + " '20190606AA',\n", + " '20190606AB',\n", + " '20190606AC',\n", + " '20190606AD',\n", + " '20190607AA',\n", + " '20190608AA',\n", + " '20190609AA',\n", + " '20190611AA',\n", + " '20190612AA',\n", + " '20190613AA',\n", + " '20190613AB',\n", + " '20190614AA',\n", + " '20190615AA',\n", + " '20190616AA',\n", + " '20190616AB',\n", + " '20190616AC',\n", + " '20190617AA',\n", + " '20190618AA',\n", + " '20190620AA',\n", + " '20190620AB',\n", + " '20190620AC',\n", + " '20190620AD',\n", + " '20190621AA',\n", + " '20190621AB',\n", + " '20190624AA',\n", + " '20190625AA',\n", + " '20190625AB',\n", + " '20190625AC',\n", + " '20190625AD',\n", + " '20190625AE',\n", + " '20190626AA',\n", + " '20190701AA',\n", + " '20190703AA',\n", + " '20190707AA',\n", + " '20190707AB',\n", + " '20190708AA',\n", + " '20190708AB',\n", + " '20190709AA',\n", + " '20190709AB',\n", + " '20190710AA',\n", + " '20190711AA',\n", + " '20190711AB',\n", + " '20190711AC',\n", + " '20190712AA',\n", + " '20190713AA',\n", + " '20190714AA',\n", + " '20190716AA',\n", + " '20190716AB',\n", + " '20190717AA',\n", + " '20190717AB',\n", + " '20190717AC',\n", + " '20190718AA',\n", + " '20190718AB',\n", + " '20190718AC',\n", + " '20190719AA',\n", + " '20190719AB',\n", + " '20190722AA',\n", + " '20190722AB',\n", + " '20190722AC',\n", + " '20190723AA',\n", + " '20190724AA',\n", + " '20190724AB',\n", + " '20190724AC',\n", + " '20190724AD',\n", + " '20190725AA',\n", + " '20190726AA',\n", + " '20190729AA',\n", + " '20190729AB',\n", + " '20190730AA',\n", + " '20190731AA',\n", + " '20190731AB',\n", + " '20190731AC',\n", + " '20190731AD',\n", + " '20190731AE',\n", + " '20190802AA',\n", + " '20190804AA',\n", + " '20190806AA',\n", + " '20190807AA',\n", + " '20190807AB',\n", + " '20190808AA',\n", + " '20190808AB',\n", + " '20190810AA',\n", + " '20190810AB',\n", + " '20190812AA',\n", + " '20190812AB',\n", + " '20190813AA',\n", + " '20190813AB',\n", + " '20190814AA',\n", + " '20190814AB',\n", + " '20190814AC',\n", + " '20190815AA',\n", + " '20190816AA',\n", + " '20190819AA',\n", + " '20190819AB',\n", + " '20190821AA',\n", + " '20190822AA',\n", + " '20190822AB',\n", + " '20190822AC',\n", + " '20190823AA',\n", + " '20190825AA',\n", + " '20190826AA',\n", + " '20190826AB',\n", + " '20190827AA',\n", + " '20190828AA',\n", + " '20190828AB',\n", + " '20190828AC',\n", + " '20190829AA',\n", + " '20190901AA',\n", + " '20190901AB',\n", + " '20190901AC',\n", + " '20190903AA',\n", + " '20190903AB',\n", + " '20190904AA',\n", + " '20190905AA',\n", + " '20190905AB',\n", + " '20190906AA',\n", + " '20190906AB',\n", + " 
'20190906AC',\n", + " '20190909AA',\n", + " '20190909AB',\n", + " '20190909AC',\n", + " '20190911AA',\n", + " '20190912AA',\n", + " '20190912AB',\n", + " '20190912AC',\n", + " '20190912AD',\n", + " '20190912AE',\n", + " '20190912AF',\n", + " '20190913AA',\n", + " '20190914AA',\n", + " '20190914AB',\n", + " '20190915AA',\n", + " '20190916AA',\n", + " '20190916AB',\n", + " '20190916AC',\n", + " '20190917AA',\n", + " '20190917AB',\n", + " '20190917AC',\n", + " '20190917AD',\n", + " '20190919AA',\n", + " '20190920AA',\n", + " '20190920AB',\n", + " '20190922AA',\n", + " '20190922AB',\n", + " '20190923AA',\n", + " '20190924AA',\n", + " '20190924AB',\n", + " '20190924AC',\n", + " '20190924AD',\n", + " '20190925AA',\n", + " '20190925AB',\n", + " '20190925AC',\n", + " '20190926AA',\n", + " '20190926AB',\n", + " '20190926AC',\n", + " '20190927AA',\n", + " '20190930AA',\n", + " '20190930AB',\n", + " '20190930AC',\n", + " '20191001AA',\n", + " '20191003AA',\n", + " '20191003AB',\n", + " '20191003AC',\n", + " '20191004AA',\n", + " '20191007AA',\n", + " '20191008AA',\n", + " '20191008AB',\n", + " '20191009AA',\n", + " '20191009AB',\n", + " '20191010AA',\n", + " '20191011AA',\n", + " '20191012AA',\n", + " '20191013AA',\n", + " '20191014AA',\n", + " '20191017AA',\n", + " '20191017AB',\n", + " '20191017AC',\n", + " '20191020AA',\n", + " '20191021AA',\n", + " '20191021AB',\n", + " '20191022AA',\n", + " '20191023AA',\n", + " '20191023AB',\n", + " '20191023AC',\n", + " '20191023AD',\n", + " '20191024AA',\n", + " '20191024AB',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "create_file_tree_and_json(author_source, registry_source, metadata_directory)" ] }, { @@ -19,8 +1497,25 @@ "metadata": {}, "outputs": [], "source": [ - "create_file_tree_and_json(author_source, registry_source, target_directory)" + "audit_files(raw_files_directory)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "convert_files(raw_files_directory, metadata_directory, directories_to_process)\n", + "print('Done converting files.')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/scripts/EGAP/files_to_import_structure.py b/scripts/EGAP/files_to_import_structure.py index 89cbe3934ce..0a3d78860f0 100644 --- a/scripts/EGAP/files_to_import_structure.py +++ b/scripts/EGAP/files_to_import_structure.py @@ -3,16 +3,28 @@ import shutil import argparse from distutils.dir_util import copy_tree +import logging from nose.tools import assert_equal +logger = logging.getLogger(__name__) + + # This takes the item id from the path of the project directory for example '20121001AA Findley' -> '20121001AA' get_item_id = lambda _path: _path.split(os.sep)[-1].split(' ')[0] +def get_project_id(root, source_dir): + project_id_base = root.split(source_dir)[-1] + if ' ' in project_id_base: + project_id = project_id_base.split(' ')[0].split('/')[-1] + else: + project_id = project_id_base.split('/')[0] + return project_id + + # Check if file name starts with EGAP id for example '20121001AA_PAP.pdf' -def check_id(root, item): - project_id = get_item_id(root.split('/')[-3]) +def check_id(project_id, item): return item.startswith(project_id) @@ -20,16 +32,17 @@ def check_id(root, item): check_anon = lambda item: 'pap_anon' in item.lower() or 'anonymous' in item.lower() -def action_files_by_name(root, source, item_name): +def 
action_files_by_name(root, source_item, item_name):
     """ Pick out anonymous files and create a new folder to move them into;
     remove files that don't follow the id naming convention.
     :param root:
-    :param source:
+    :param source_item:
     :param item_name:
     :return:
     """
-    if not check_id(root, item_name):
-        path = os.path.join(root, item_name)
+    project_id = get_project_id(root, source_item)
+    path = os.path.join(root, item_name)
+    if not check_id(project_id, item_name):
         os.remove(path)
         return
@@ -38,18 +51,20 @@ def action_files_by_name(root, source, item_name):
     if not os.path.exists(destination_parent):
         os.mkdir(destination_parent)
+    destination_item = os.path.join(destination_parent, item_name)
+    shutil.move(path, destination_item)
-    destination = os.path.join(destination_parent, item_name)
-    shutil.move(source, destination)

+def audit_files(source_directory):
+    logger.info("Running audit. Source: {}".format(source_directory))
-def audit_files(source):
     including = open('including.txt', 'w+')
     ignoring = open('ignoring.txt', 'w+')
-    for root, dir, files in os.walk(source):
+    for root, directory, files in os.walk(source_directory):
         for item in files:
-            name = os.path.join(root.split('/')[-1], item)  # get file/folder name after slash
-            if not check_id(root, name):
+            project_id = get_project_id(root, source_directory)
+            name = '{}/{}'.format(root.split(source_directory)[-1], item)  # get file/folder name from just under source
+            if not check_id(project_id, item):
                 ignoring.writelines(name + '\r')
             else:
                 including.writelines(name + '\r')
@@ -57,7 +72,7 @@
     ignoring.close()
     including.close()

-    projects = set(os.listdir(source))
+    projects = set(os.listdir(source_directory))
     project_ids = set([get_item_id(folders) for folders in list(projects)])

     # check for duplicate ids
@@ -77,30 +92,34 @@ def main(files_dir, metadata_dir, id_list=None):
     :param files_dir: the source path we're picking files out of
     :param metadata_dir: a pre-made directory structure for importing projects
         that we are packing files into.
+    :param id_list: an optional list of project ids to limit what gets processed
     :return:
     """
+    logger.info("Processing files. 
Source: {} Destination: {}".format(files_dir, metadata_dir))
+
     project_dirs = os.listdir(files_dir)
     if id_list:
         project_dirs = [project for project in project_dirs if get_item_id(project) in id_list]

+    logger.info('Processing directories: {}'.format(project_dirs))
+
     # Copy whole tree to preserve file hierarchy then
     for item in project_dirs:
         item_id = get_item_id(item)
-        source = os.path.join(files_dir, item)
-        destination = os.path.join(metadata_dir, item_id, 'data', 'nonanonymous')
-        if os.path.isdir(source):
-            copy_tree(source, destination)
+        source_item = os.path.join(files_dir, item)
+        destination_item = os.path.join(metadata_dir, item_id, 'data', 'nonanonymous')
+        if os.path.isdir(source_item):
+            copy_tree(source_item, destination_item)

-    for root, dir, files in os.walk(metadata_dir):
+    for root, directory, files in os.walk(metadata_dir):
         for item in files:
             if item not in ('project.json', 'registration-schema.json'):
-                source = os.path.join(root, item)
-                action_files_by_name(root, source, item)
+                action_files_by_name(root, metadata_dir, item)

     # Check All anon files in /anonymous/ directory
-    for root, dir, files in os.walk(metadata_dir):
+    for root, directory, files in os.walk(metadata_dir):
         for item in files:
-            if item not in ('project.json', 'registration-schema.json'):
+            if item not in ('project.json', 'registration-schema.json', '.DS_Store'):
                 if check_anon(item):
                     assert '/anonymous' in root
                 else:
@@ -112,22 +131,22 @@
     parser.add_argument(
         '-source',
         '--source',
-        help='This should be the directory for the EGAP data dump, traditionally called "EGAP_REGISTRY_staging/3 Registrations/"'
+        help='The directory for the EGAP data files, traditionally called "EGAP_REGISTRY_staging/3 Registrations/"'
     )
     parser.add_argument(
         '-destination',
         '--destination',
-        help='This should be the directory of the import file structure containing the bags of data.'
+        help='The directory of the import file structure containing the bags of data.'
     )
     parser.add_argument(
         '-list',
         '--list',
-        help='This is a list of ids to import into a the new metadata directory.'
+        help='An optional list of ids to import into the new metadata directory.'
     )
     parser.add_argument(
         '-audit',
         '--audit',
-        help='This includes all files that don\'t follow the "_PAP" naming convention.'
+        help='Boolean to generate two lists of all files that should and should not be included. Needs "source".'
) args = parser.parse_args() diff --git a/scripts/EGAP/requirements.txt b/scripts/EGAP/requirements.txt new file mode 100644 index 00000000000..7e65d67ac4f --- /dev/null +++ b/scripts/EGAP/requirements.txt @@ -0,0 +1,68 @@ +appnope==0.1.0 +attrs==19.3.0 +backcall==0.1.0 +bcrypt==3.1.7 +bleach==3.1.0 +blinker==1.4 +bson==0.5.8 +cffi==1.13.1 +Click==7.0 +decorator==4.4.0 +defusedxml==0.6.0 +Django==2.2.6 +django-rest-framework==0.1.0 +djangorestframework==3.10.3 +entrypoints==0.3 +Flask==1.1.1 +furl==2.1.0 +future==0.18.1 +importlib-metadata==0.23 +ipykernel==5.1.3 +ipython==7.8.0 +ipython-genutils==0.2.0 +ipywidgets==7.5.1 +itsdangerous==1.1.0 +jedi==0.15.1 +Jinja2==2.10.3 +json5==0.8.5 +jsonschema==3.1.1 +jupyter==1.0.0 +jupyter-client==5.3.4 +jupyter-console==6.0.0 +jupyter-core==4.6.1 +jupyterlab==1.1.4 +jupyterlab-server==1.0.6 +MarkupSafe==1.1.1 +mistune==0.8.4 +more-itertools==7.2.0 +nbconvert==5.6.1 +nbformat==4.4.0 +nose==1.3.7 +notebook==6.0.1 +orderedmultidict==1.0.1 +pandocfilters==1.4.2 +parso==0.5.1 +pexpect==4.7.0 +pickleshare==0.7.5 +prometheus-client==0.7.1 +prompt-toolkit==2.0.10 +ptyprocess==0.6.0 +pycparser==2.19 +Pygments==2.4.2 +pyrsistent==0.15.4 +python-dateutil==2.8.0 +pytz==2019.3 +pyzmq==18.1.0 +qtconsole==4.5.5 +Send2Trash==1.5.0 +six==1.12.0 +sqlparse==0.3.0 +terminado==0.8.2 +testpath==0.4.2 +tornado==6.0.3 +traitlets==4.3.3 +wcwidth==0.1.7 +webencodings==0.5.1 +Werkzeug==0.16.0 +widgetsnbextension==3.5.1 +zipp==0.6.0 From c9e3449e7e91a9bc58e94a3d2f99a32a1b8b1f28 Mon Sep 17 00:00:00 2001 From: John Tordoff Date: Wed, 6 Nov 2019 13:08:14 -0500 Subject: [PATCH 8/9] fix double quotes for travis (#9194) --- scripts/EGAP/files_to_import_structure.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/EGAP/files_to_import_structure.py b/scripts/EGAP/files_to_import_structure.py index 0a3d78860f0..43b9dbfa3f1 100644 --- a/scripts/EGAP/files_to_import_structure.py +++ b/scripts/EGAP/files_to_import_structure.py @@ -56,7 +56,7 @@ def action_files_by_name(root, source_item, item_name): def audit_files(source_directory): - logger.info("Running audit. Source: {}".format(source_directory)) + logger.info('Running audit. Source: {}'.format(source_directory)) including = open('including.txt', 'w+') ignoring = open('ignoring.txt', 'w+') @@ -95,7 +95,7 @@ def main(files_dir, metadata_dir, id_list=None): :param id_list: an optional list of project ids to limit what gets processed :return: """ - logger.info("Processing files. Source: {} Destination: {}".format(files_dir, metadata_dir)) + logger.info('Processing files. Source: {} Destination: {}'.format(files_dir, metadata_dir)) project_dirs = os.listdir(files_dir) if id_list: From 5b58c5803b0f666d1f5d8be69d5ee574a1732efd Mon Sep 17 00:00:00 2001 From: "Brian J. Geiger" Date: Thu, 7 Nov 2019 09:40:18 -0500 Subject: [PATCH 9/9] Update changelog and package.json for release --- CHANGELOG | 5 +++++ package.json | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index da406c46bf7..f0b26e1e32d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,11 @@ We follow the CalVer (https://calver.org/) versioning scheme: YY.MINOR.MICRO. 
+19.31.0 (2019-11-07)
+===================
+- EGAP: Parse project structure, add contributors, add files, ingest the draft registration, and add a Jupyter notebook
+- Modify a Chronos field for proper contributor classification
+
 19.30.0 (2019-10-16)
 ===================
 - Fix weirdness around deleted nodes by not deleting OSF Storage
diff --git a/package.json b/package.json
index 8a756331088..f884cb97f9d 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "OSF",
-  "version": "19.30.0",
+  "version": "19.31.0",
   "description": "Facilitating Open Science",
   "repository": "https://github.com/CenterForOpenScience/osf.io",
   "author": "Center for Open Science",
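For completeness, the path-parsing helpers that PATCH 7 adds to scripts/EGAP/files_to_import_structure.py can be sanity-checked on their own. A minimal sketch reusing the definitions from the diff above; the example paths and filenames are illustrative (POSIX separators assumed):

```python
import os

# Helper definitions as they appear in files_to_import_structure.py above.
get_item_id = lambda _path: _path.split(os.sep)[-1].split(' ')[0]
check_anon = lambda item: 'pap_anon' in item.lower() or 'anonymous' in item.lower()

def check_id(project_id, item):
    # A file belongs to a project when its name starts with the EGAP id.
    return item.startswith(project_id)

# '20121001AA Findley' style folder names yield the bare EGAP id.
assert get_item_id('3 Registrations/20121001AA Findley') == '20121001AA'
assert check_id('20151016AA', '20151016AA_PAP.pdf')   # kept in place
assert not check_id('20151016AA', 'justafile.pdf')    # removed by the cleanup pass
assert check_anon('20151016AA_anonymous.pdf')         # moved to the anonymous folder
```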