This repository has been archived by the owner on Oct 8, 2022. It is now read-only.

fix IPT parse; add logging
zzeppozz committed Nov 19, 2020
1 parent d55d4e1 commit a5e1359
Showing 9 changed files with 419 additions and 288 deletions.
13 changes: 12 additions & 1 deletion docs/testing.rst
@@ -24,4 +24,15 @@ Testing T-Rex elements
* Options to search:

* curl "http://localhost:8983/solr/spcoco/select?q=*.*"
* curl "http://localhost:8983/solr/spcoco/select?q=*.*"


* Web UI for Solr admin:

* http://notyeti-192.lifemapper.org:8983/solr/#/spcoco/core-overview
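
* Equivalent match-all query from Python (a sketch only; the host, core name, and
  row count follow the curl example above)::

      import requests

      resp = requests.get(
          'http://localhost:8983/solr/spcoco/select',
          params={'q': '*:*', 'wt': 'json', 'rows': 10})
      resp.raise_for_status()
      print(resp.json()['response']['numFound'])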

Troubleshooting
----------------

* /var/solr/logs/solr.log

97 changes: 74 additions & 23 deletions src/fileop/logtools.py
@@ -15,28 +15,79 @@
LOGFILE_MAX_BYTES = 52000000
LOGFILE_BACKUP_COUNT = 5

# .............................................................................
def get_logger(name, fname):
log = logging.getLogger(name)
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(LOG_FORMAT, LOG_DATE_FORMAT)
handlers = []
handlers.append(RotatingFileHandler(fname, maxBytes=LOGFILE_MAX_BYTES,
backupCount=LOGFILE_BACKUP_COUNT))
handlers.append(logging.StreamHandler())
for fh in handlers:
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)
return log

class LMLog():
# ...............................................
def __init__(self, name, fname):
self.log = self._get_logger(name, fname)

# ...............................................
def _get_logger(self, name, fname):
log = logging.getLogger(name)
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(LOG_FORMAT, LOG_DATE_FORMAT)
handlers = []
handlers.append(RotatingFileHandler(fname, maxBytes=LOGFILE_MAX_BYTES,
backupCount=LOGFILE_BACKUP_COUNT))
handlers.append(logging.StreamHandler())
for fh in handlers:
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)
return log

# ...............................................
    def rotate_logfile(self, logpath, logname=None):
        if self.log is None:
            if logname is None:
                nm, _ = os.path.splitext(os.path.basename(__file__))
                logname = '{}.{}'.format(nm, int(time.time()))
            logfname = os.path.join(logpath, '{}.log'.format(logname))
            self.log = self._get_logger(logname, logfname)
        return self.log

# ...............................................
def log_info(self, msg, logger=None):
if logger is None:
print(msg)
else:
logger.info(msg)

# ...............................................
    def log_warn(self, msg, logger=None):
        if logger is None:
            print(msg)
        else:
            logger.warning(msg)

    # ...............................................
    def log_error(self, msg, logger=None):
        if logger is None:
            print(msg)
        else:
            logger.error(msg)

# # .............................................................................
# def get_logger(name, fname):
# log = logging.getLogger(name)
# log.setLevel(logging.DEBUG)
# formatter = logging.Formatter(LOG_FORMAT, LOG_DATE_FORMAT)
# handlers = []
# handlers.append(RotatingFileHandler(fname, maxBytes=LOGFILE_MAX_BYTES,
# backupCount=LOGFILE_BACKUP_COUNT))
# handlers.append(logging.StreamHandler())
# for fh in handlers:
# fh.setLevel(logging.DEBUG)
# fh.setFormatter(formatter)
# log.addHandler(fh)
# return log
#
# # ...............................................
# def rotate_logfile(log, logpath, logname=None):
# if log is None:
# if logname is None:
# nm, _ = os.path.splitext(os.path.basename(__file__))
# logname = '{}.{}'.format(nm, int(time.time()))
# logfname = os.path.join(logpath, '{}.log'.format(logname))
# log = get_logger(logname, logfname)
# return log
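
# .............................................................................
# Minimal usage sketch of the LMLog wrapper above (not part of this commit); the
# log path is a placeholder. The module-level get_logger() can be called the same
# way when only a bare logging.Logger is needed.
if __name__ == '__main__':
    import os
    import time

    logname = 'logtools_demo.{}'.format(int(time.time()))
    logfname = os.path.join('/tmp', '{}.log'.format(logname))

    lml = LMLog(logname, logfname)
    lml.log_info('info goes to the rotating logfile and stderr', logger=lml.log)
    lml.log_warn('with no logger argument, messages fall back to print()')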

25 changes: 0 additions & 25 deletions src/tools/readwrite.py → src/fileop/readwrite.py
@@ -226,31 +226,6 @@ def get_csv_dict_writer(datafile, delimiter, encoding, fldnames, fmode='w'):
print('Opened file {} for dict write'.format(datafile))
return writer, f

# .............................................................................
def get_logger(name, fname):
log = logging.getLogger(name)
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(LOG_FORMAT, LOG_DATE_FORMAT)
handlers = []
handlers.append(RotatingFileHandler(fname, maxBytes=LOGFILE_MAX_BYTES,
backupCount=LOGFILE_BACKUP_COUNT))
handlers.append(logging.StreamHandler())
for fh in handlers:
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)
return log

# ...............................................
def rotate_logfile(log, logpath, logname=None):
if log is None:
if logname is None:
nm, _ = os.path.splitext(os.path.basename(__file__))
logname = '{}.{}'.format(nm, int(time.time()))
logfname = os.path.join(logpath, '{}.log'.format(logname))
log = get_logger(logname, logfname)
return log


# ...............................................
def makerow(rec, outfields):
File renamed without changes.
4 changes: 2 additions & 2 deletions src/go.py
@@ -7,9 +7,9 @@
import json

from LmRex.tools.api import APIQuery, GbifAPI, IdigbioAPI
from LmRex.tools.readwrite import (
from LmRex.fileop.readwrite import (
get_csv_dict_reader, get_csv_dict_writer, get_line)
from LmRex.tools.ready_file import ready_filename, delete_file
from LmRex.fileop.ready_file import ready_filename, delete_file
import LmRex.tools.solr as spsolr
from LmRex.common.lmconstants import (
SPECIFY_ARK_PREFIX, GBIF, DWCA, ENCODING, TEST_SPECIFY7_SERVER,
58 changes: 37 additions & 21 deletions src/spcoco/dwca.py
@@ -6,9 +6,9 @@
import zipfile

from LmRex.tools.api import APIQuery, GbifAPI, IdigbioAPI
from LmRex.tools.readwrite import (
from LmRex.fileop.readwrite import (
get_csv_dict_reader, get_csv_dict_writer, get_line)
from LmRex.tools.ready_file import ready_filename, delete_file
from LmRex.fileop.ready_file import ready_filename, delete_file
import LmRex.tools.solr as spsolr
from LmRex.common.lmconstants import (
SPECIFY_ARK_PREFIX, GBIF, DWCA, ENCODING, TEST_SPECIFY7_SERVER,
@@ -59,11 +59,11 @@ def get_dwca_urls(rss_url, isIPT):
ds_key_val = str(INCR_KEY)
else:
ds_key_val = ds_id_elt.text
datasets[ds_key_val] = {'url': url_elt.text}
datasets[ds_key_val] = {'url': url_elt.text, 'name': ds_key_val}
return datasets
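
# ......................................................
# Hedged sketch of the RSS parsing that get_dwca_urls performs; the element names
# (<item>, <link>, <title>) reflect a typical IPT RSS feed and are assumptions
# here, since the full function body is elided above.
def _list_dwca_urls_sketch(rss_url):
    import xml.etree.ElementTree as ET
    import requests
    root = ET.fromstring(requests.get(rss_url).content)
    datasets = {}
    for item in root.iter('item'):
        url_elt = item.find('link')
        name_elt = item.find('title')
        if url_elt is not None:
            key = name_elt.text if name_elt is not None else url_elt.text
            datasets[key] = {'url': url_elt.text, 'name': key}
    return datasets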

# ......................................................
def download_dwca(url, baseoutpath, overwrite=False):
def download_dwca(url, baseoutpath, overwrite=False, logger=None):
if url.endswith('.zip'):
_, fname = os.path.split(url)
basename, _ = os.path.splitext(fname)
@@ -90,26 +90,33 @@ def download_dwca(url, baseoutpath, overwrite=False):
reason = response.reason
except AttributeError:
reason = 'Unknown Error'
print(('Failed on URL {}, code = {}, reason = {} ({})'.format(
url, ret_code, reason, str(e))))
log_error('Failed on URL {}, code = {}, reason = {} ({})'.format(
url, ret_code, reason, str(e)), logger)

output = response.content
with open(outfilename, 'wb') as outf:
outf.write(output)
return outfilename

# .............................................................................
class DWCA:
class DwCArchive:
"""Class to download and read a Darwin Core Archive"""

# ......................................................
def __init__(self, zipfile_or_directory, outpath=None):
def __init__(self, zipfile_or_directory, outpath=None, logger=None):
"""
Args:
zipfile_or_directory: Full path to zipfile or directory containing
Darwin Core Archive
outpath: file location for output data and log files
logger: LMLog object for logging processing information
Note:
Produces data requiring http post to contain
headers={'Content-Type': 'text/csv'}
"""
if os.path.exists(zipfile_or_directory):
self.logger = logger
# DWCA is zipped
if (os.path.isfile(zipfile_or_directory) and
zipfile_or_directory.endswith('.zip')):
@@ -137,9 +144,9 @@ def _is_guid(self, idstr):
return False

# ......................................................
    def _get_date(self, dwc_rec):
        try:
            yr = dwc_rec['year']
int(yr)
except:
coll_date = ''
@@ -157,7 +164,7 @@ def _get_date(self, dwc_rec):


# ......................................................
def read_recs_for_solr(self, fileinfo, ds_uuid, outpath, overwrite=True):
def rewrite_recs_for_solr(self, fileinfo, ds_uuid, outpath, overwrite=True):
"""
Note:
Produces data requiring http post to contain
@@ -190,8 +197,9 @@ def read_recs_for_solr(self, fileinfo, ds_uuid, outpath, overwrite=True):
occ_uuid = dwc_rec[fileinfo[DWCA.UUID_KEY]]
if not self._is_guid(occ_uuid):
if count > 1:
print('Line {} does not contain a GUID in id field'
.format(count))
                        log_warn(
                            'Line {} does not contain a GUID in id field'
                            .format(count), self.logger)
else:
coll_date = self._get_date(dwc_rec)
who_val = dwc_rec['datasetName']
@@ -214,9 +222,11 @@ def read_recs_for_solr(self, fileinfo, ds_uuid, outpath, overwrite=True):
specify_record_server, ds_uuid, occ_uuid)
wtr.writerow(solr_rec)
except Exception as e:
print('Rec {}: failed {}'.format(count, e))
                    log_error('Rec {}: failed {}'.format(count, e), self.logger)
except Exception as e:
print ('Failed to read/write file {}: {}'.format(core_fname, e))
log_warn(
'Failed to read/write file {}: {}'.format(core_fname, e),
self.logger)
finally:
inf.close()
outf.close()
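
    # ......................................................
    # Hedged companion sketch, not part of this commit: the docstrings above note
    # that the rewritten CSV must be posted with a 'text/csv' Content-Type. The
    # Solr host and collection name below are placeholders.
    def _post_csv_to_solr_sketch(self, csv_fname,
                                 solr_url='http://localhost:8983/solr/spcoco'):
        import requests
        with open(csv_fname, 'rb') as csvf:
            resp = requests.post(
                '{}/update?commit=true'.format(solr_url),
                headers={'Content-Type': 'text/csv'}, data=csvf)
        resp.raise_for_status()
        return resp.status_code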
@@ -233,15 +243,17 @@ def extract_from_zip(self, zip_fname, extract_path=None):
if ext in ['.xml', '.csv', '.txt']:
zfile.extract(zinfo, path=extract_path)
else:
print('Unexpected filename {} in zipfile {}'.format(
zinfo.filename, zip_fname))
log_warn('Unexpected filename {} in zipfile {}'.format(
zinfo.filename, zip_fname), self.logger)


# ......................................................
def read_dataset_uuid(self, meta_fname):
idstr = None
if os.path.split(meta_fname)[1] != 'eml.xml':
print ('Expected filename eml.xml at {}'.format(meta_fname))
log_error(
'Expected filename eml.xml at {}'.format(meta_fname),
self.logger)
return ''
tree = ET.parse(meta_fname)
root = tree.getroot()
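        # Hedged illustration, not the elided code that follows: IPT's eml.xml
        # typically carries the dataset GUID as the first segment of the root
        # element's packageId attribute, e.g. '<uuid>/v2.1'.
        package_id = root.attrib.get('packageId', '')
        guid_candidate = package_id.split('/')[0]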
@@ -282,7 +294,9 @@ def read_core_fileinfo(self, meta_fname):
fieldname_index_map: dict of fields and corresponding column indices
"""
if os.path.split(meta_fname)[1] != 'meta.xml':
print ('Expected filename meta.xml at {}'.format(meta_fname))
log_error(
'Expected filename meta.xml at {}'.format(meta_fname),
self.logger)
return ''
fileinfo = {}
field_idxs = {}
@@ -311,7 +325,7 @@ def read_core_fileinfo(self, meta_fname):
# plus fieldname --> uuid_idx
field_idxs[DWCA.UUID_KEY] = uuid_idx
field_idxs[uuid_idx] = DWCA.UUID_KEY
all_idxs = [int(uuid_idx)]
all_idxs = set([int(uuid_idx)])
# Rest of fields and indices
field_elts = core_elt.findall('{}field'.format(DWCA.NS))
startidx = len(DWCA.NS)-1
@@ -326,12 +340,14 @@ def read_core_fileinfo(self, meta_fname):
# Correct UUID fieldname
if idx == uuid_idx:
uuid_fldname = term
all_idxs.append(int(idx))
field_idxs.pop(DWCA.UUID_KEY)
all_idxs.add(int(idx))
field_idxs[idx] = term
field_idxs[term] = idx
fileinfo[DWCA.UUID_KEY] = uuid_fldname
fileinfo[DWCA.FLDMAP_KEY] = field_idxs
# CSV file fieldnames ordered by column index
all_idxs = list(all_idxs)
all_idxs.sort()
ordered_fldnames = []
for i in all_idxs:
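
# ......................................................
# Hedged sketch of the field/index mapping that read_core_fileinfo builds from
# meta.xml; the namespace and attribute names follow the Darwin Core text metafile
# schema and may not match the elided code above exactly.
def _core_field_indexes_sketch(meta_fname):
    import xml.etree.ElementTree as ET
    ns = '{http://rs.tdwg.org/dwc/text/}'
    core = ET.parse(meta_fname).getroot().find('{}core'.format(ns))
    idxs = {'id': int(core.find('{}id'.format(ns)).get('index'))}
    for fld in core.findall('{}field'.format(ns)):
        # Short fieldname is the last path segment of the term URI.
        term = fld.get('term').rsplit('/', 1)[-1]
        idxs[term] = int(fld.get('index'))
    # Fieldnames in column order, mirroring ordered_fldnames above.
    return [name for name, _ in sorted(idxs.items(), key=lambda kv: kv[1])]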
