[WIP] Feature: Python script and module to check dataset readiness for data preservation #236

Closed · wants to merge 20 commits
ldcoolp/preserve/__init__.py (1 addition, 0 deletions)
@@ -0,0 +1 @@
from .main import Preserve
ldcoolp/preserve/main.py (243 additions, 0 deletions)
@@ -0,0 +1,243 @@
from pathlib import Path

from logging import Logger
from typing import List, Dict

import pandas as pd

from redata.commons.logger import log_stdout

# LD-Cool-P specific
from figshare.figshare import Figshare
from ..config import config_default_dict
from ..curation import metadata
from ..curation.inspection import checksum
from ..curation import retrieve

# Wildcards for globbing hidden files
HIDDEN_FILES = ['*DS_Store', '.*.docx', '.*.pdf']


class Preserve:
"""
Primary class for the preparation of curated datasets for data
preservation.

This ``class`` includes a number of built-in features such as
checking against the published metadata via MD5 checksum,
saving a JSON file containing the published metadata, deleting
hidden files, and changing the curated folder to read-only.

Attributes
----------
article_id: Figshare article ID
version_no: Version number for article ID. Default: v1
"""

def __init__(self, article_id: int, version_no: int = 1,
config_dict: dict = config_default_dict,
log: Logger = log_stdout()):

self.log = log
self.article_id = article_id
self.version_no = version_no

self.fs = Figshare() # No token needed for public dataset

self.curation_dict = config_dict['curation']

self.root_directory = \
self.curation_dict[self.curation_dict['parent_dir']]
self.published_folder = self.curation_dict['folder_published']
self.data_path = self.curation_dict['folder_copy_data'] # DATA
self.original_data_path = self.curation_dict['folder_data'] # ORIGINAL_DATA
self.metadata_path = self.curation_dict['folder_metadata'] # METADATA

# Search for path
p_dir = Path(self.root_directory) / self.published_folder
list_paths = list(p_dir.glob(f'*{self.article_id}'))
if len(list_paths) == 0:
self.log.warning(
f"No curated dataset found in {self.published_folder}.")
self.log.warning("Exiting !!!")
raise ValueError
if len(list_paths) > 1:
self.log.warning(
f"More than one paths found in {self.published_folder}.")
self.log.warning("Exiting !!!")
raise ValueError

self.folder_name = Path(list_paths[0])
self.version_dir = self.folder_name / f"v{self.version_no:02}"
if self.version_dir.exists():
self.log.info("Article and version found!")
else:
self.log.warning("Version not found!")
self.log.warning("Exiting !!!")
raise OSError

# Ensure that METADATA folder exists (support for very old deposits)
m_path = self.version_dir / self.metadata_path
if not m_path.exists():
self.log.info(f"{self.metadata_path} NOT found!")
self.log.info(f"Creating: {self.metadata_path}")
m_path.mkdir()

# Retrieve Figshare metadata and save metadata
self.article_metadata = self.get_metadata()

def get_metadata(self) -> dict:
"""Retrieve Figshare metadata from public API"""
article_metadata = self.fs.get_article_details(self.article_id,
self.version_no)
return article_metadata

def save_metadata(self):
"""Write JSON file containing Figshare metadata"""
out_file_prefix = f"published_{self.article_id}"
metadata.save_metadata(self.article_metadata,
out_file_prefix,
metadata_source='FIGSHARE',
root_directory=self.version_dir,
metadata_directory=self.metadata_path,
log=self.log)

def check_files(self, save_files: bool = False) -> pd.DataFrame:
"""Performs checksum verification on each file"""

if self.article_metadata['is_embargoed']:
self.log.warning(
f"Embargoed files! File checking is not possible at this time")
self.log.warning(
f"Embargo date: {self.article_metadata['embargo_date']}")
raise SystemExit("NOTE: Embargoed datasets NOT supported at this point")
else:
summary_list = [] # Initialize
files_list: List[Dict] = self.article_metadata['files']
d_dir = self.version_dir / self.data_path
o_dir = self.version_dir / self.original_data_path
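# Search DATA first; fall back to ORIGINAL_DATA for files that were not copied over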
for n, file_dict in enumerate(files_list):
filename = file_dict['name']
glob_list = list(d_dir.glob(filename))

data_location = ''
if len(glob_list) == 0:
try:
t_path = list(o_dir.glob(filename))[0]
if not t_path.exists():
raise FileNotFoundError
else:
self.log.info(
f"{filename} found in {self.original_data_path}")
data_location = self.original_data_path
except (IndexError, FileNotFoundError):
self.log.warning(f"File not found: {filename}")
else:
t_path = glob_list[0]
self.log.info(
f"{filename} found in {self.data_path}")
data_location = self.data_path

# Only verify the checksum when the file was located above
checksum_flag = False
if data_location:
checksum_flag = \
checksum.check_md5(t_path, file_dict['supplied_md5'],
log=self.log)

summary_list.append({
'name': filename,
'data_location': data_location,
'checksum_status': checksum_flag,
})
summary_list[n].update(file_dict)

if save_files:
out_file_prefix = f'checksum_summary_{self.article_id}'
metadata.save_metadata(summary_list, out_file_prefix,
metadata_source='CHECKSUM',
root_directory=self.version_dir,
metadata_directory=self.metadata_path,
save_csv=True, log=self.log)

df = pd.DataFrame.from_dict(summary_list, orient='columns')
return df

def update_files(self, df: pd.DataFrame) -> pd.DataFrame:
"""Identify incorrect files on server and retrieve the correct ones

Note: will need to update data frame to avoid symbolic linking
:param df: pandas DataFrame from ``check_files()``
:return: Updated DataFrame with refreshed ``data_location`` entries
"""

bad_idx = df['checksum_status'] == False
df_bad_checksum = df.loc[bad_idx].copy()
if df_bad_checksum.empty:
self.log.info(f"All files are correct")
self.log.info("No file retrieval needed! :-)")
else:
for index, row in df_bad_checksum.iterrows():
self.log.info(f"{index}. Downloading: {row['name']}")
filename = self.version_dir / self.data_path / row['name']
retrieve.private_file_retrieve(row['download_url'], filename,
log=self.log)
df_bad_checksum['data_location'] = self.data_path # Need to update

df.loc[bad_idx] = df_bad_checksum
return df

def make_symbolic_links(self, df):
"""Construct symbolic links in DATA from ORIGINAL_DATA as needed

:param df: pandas DataFrame from ``check_files()``
"""

# Get list of those in ORIGINAL_DATA
df_symlink = df.loc[df['data_location'] == self.original_data_path]

if df_symlink.empty:
self.log.info(f"All files are in {self.data_path}")
self.log.info("No symbolic links are needed! :-)")
else:
for index, row in df_symlink.iterrows():
self.log.info(f"{index}. Creating symbolic link for {row['name']}")
data_path = self.version_dir / self.data_path / row['name']
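# Use a relative target so the link stays valid if the version folder is relocated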
data_path.symlink_to(f"../{self.original_data_path}/{row['name']}")

def delete_old_readme_files(self):
"""Find and remove all old README.txt files in DATA"""

d_dir = self.version_dir / self.data_path
files_find_list = list(d_dir.glob('README_????-??-??T*.txt'))
if len(files_find_list) == 0:
self.log.info("No old README.txt files found! :-)")
else:
self.log.info(f"Old README.txt files found, N={len(files_find_list)}!")
self.delete_files(files_find_list)

def delete_hidden_files(self):
"""Find and remove all hidden files. See ``HIDDEN_FILES`` wildcards"""
hidden_files_list = []
for hidden in HIDDEN_FILES:
file_find = list(self.version_dir.rglob(hidden))
if len(file_find) > 0:
hidden_files_list.extend(file_find)

if len(hidden_files_list) == 0:
self.log.info("No hidden files found! :-)")
else:
self.log.info(f"Hidden files found, N={len(hidden_files_list)}!")
self.delete_files(hidden_files_list)

return hidden_files_list

def delete_files(self, files_list: List[Path]):
"""Delete list of files if response to prompt is yes"""
for f_path in files_list:
self.log.info(f_path.relative_to(self.version_dir))
self.log.info("PROMPT: Do you you wish to delete all of these files")
src_input = input("PROMPT: Type 'yes'. Anything else will skip : ")
self.log.info(f"RESPONSE: {src_input}")
if src_input.lower() == 'yes':
for f_path in files_list:
self.log.info(f"Removing: {f_path.relative_to(self.version_dir)}")
f_path.unlink()
else:
self.log.info("Not deleting files.")
ldcoolp/scripts/preserve_checks (125 additions, 0 deletions)
@@ -0,0 +1,125 @@
#!/usr/bin/env python

from os.path import dirname, exists, join
from os import mkdir

import argparse
import configparser

from datetime import date

# LD-Cool-P specific
from ldcoolp.preserve import Preserve
from redata.commons import logger

# Config loader
from ldcoolp.config import dict_load

# Version and branch info
from ldcoolp import __version__, CODE_NAME
from redata.commons.git_info import GitInfo
from ldcoolp import __file__ as library_path

today = date.today()

library_root_path = dirname(dirname(library_path)) # Retrieve parent directory to ldcoolp


if __name__ == '__main__':
# Parse command-line arguments
parser = argparse.ArgumentParser(
description='Command-line driver for LD-Cool-P preservation checks.')
parser.add_argument('--config', required=True,
help='path to configuration file')
parser.add_argument('--article-id', required=True, type=int,
help='Figshare article ID')
parser.add_argument('--version-no', default=1, type=int,
help='Figshare article ID version no. Default: 1')
parser.add_argument('--metadata-only', action='store_true',
help='Perform checks without checking files (this is for large datasets)')
args = parser.parse_args()

if not exists(args.config):
raise FileNotFoundError(f"WARNING!!! Config file not found: {args.config}")

gi = GitInfo(library_root_path)

banner_message = f"""
This is the command-line tool that performs preservation checks for curated
datasets. It should only be run after the dataset has been moved to the
published folder.

{CODE_NAME} branch: {gi.branch}
{CODE_NAME} version: {__version__}
{CODE_NAME} commit hash: {gi.short_commit}
Created by Chun Ly
Issues? Submit a GitHub ticket: https://github.com/UAL-RE/LD-Cool-P/issues/new
"""
print(banner_message)

# Load configuration
try:
config_dict = dict_load(args.config)
except configparser.ParsingError:
exit()

curation_dict = config_dict['curation']

# Define logfile
root_directory_main = curation_dict[curation_dict['log_parent_dir']]

log_dir = join(root_directory_main, curation_dict['log_dir'])
if not exists(log_dir):
mkdir(log_dir)
logfile_prefix = 'preserve_checks'
log = logger.log_setup(log_dir, logfile_prefix)

lc = logger.LogCommons(log, logfile_prefix, gi,
code_name=CODE_NAME, version=__version__)

lc.script_start()

# Retrieve username, hostname, IP
lc.script_sys_info()

# Configuration information
log.info(f"Config file: {args.config}")

log.info(f"Checking: {args.article_id} v{args.version_no}")

# Define preservation object
try:
p = Preserve(args.article_id, args.version_no,
config_dict=config_dict, log=log)
except (ValueError, OSError):
log.warning("Unable to initialize Preserve object. Exiting.")
raise SystemExit

# Step 1: Save Figshare metadata
p.save_metadata()

if not args.metadata_only:
# Step 2: Perform checksum verification on files
checksum_df = p.check_files(save_files=True)

for handler in log.handlers:
log_file = logger.get_log_file(handler)
logger.pandas_write_buffer(checksum_df, log_file)

# Step 3: Resolve any checksum differences
checksum_df = p.update_files(checksum_df)

# Step 4: Create symbolic links
p.make_symbolic_links(checksum_df)

# Step 5: Delete old README.txt files
p.delete_old_readme_files()

# Step 6: Delete hidden files
p.delete_hidden_files()

log.info(f"Completed: {args.article_id} ...")

# Change permission to mode=666 (rw for all)
lc.log_permission()

lc.script_end()
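
Example invocation (illustrative path and article ID; assumes the
preserve_checks script is installed on the user's PATH):

preserve_checks --config /path/to/config.ini --article-id 12345678 --version-no 1

For large datasets, the same checks can be run without per-file checksum
verification:

preserve_checks --config /path/to/config.ini --article-id 12345678 --metadata-only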