From 5c330f1cd0aed76020cbfbcdc561bd3719cc1865 Mon Sep 17 00:00:00 2001 From: Christian Panse Date: Fri, 23 Aug 2019 14:58:21 +0200 Subject: [PATCH] fixed #1 --- fgcz/fcc.py | 125 +++++++++++++++++++++++----------------------------- 1 file changed, 55 insertions(+), 70 deletions(-) diff --git a/fgcz/fcc.py b/fgcz/fcc.py index 2826a8c..f3b6767 100644 --- a/fgcz/fcc.py +++ b/fgcz/fcc.py @@ -1,16 +1,16 @@ #!/usr/bin/python # -*- coding: latin1 -*- -# $HeadURL: https://github.com/fgcz/PyFGCZ/fcc.py $ +# $HeadURL: http://fgcz-svn.uzh.ch/repos/fgcz/stable/proteomics/fcc/fcc.py $ # $Id: fcc.py 7518 2015-05-27 15:20:12Z cpanse $ # $Date: 2015-05-27 17:20:12 +0200 (Wed, 27 May 2015) $ # $Author: cpanse $ -# Copyright 2008-2017 +# Copyright 2008-2015 # Christian Panse # Simon Barkow-Oesterreicher -# Witold Eryk Wolski +# Witold Eric Wolski # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -80,7 +80,7 @@ 2012-12-04 handles dirs as files, e.g. conversion of waters.com instruments raw folders (SB,CP) 2015-07-07 on github.com """ -__version__ = "https://github.com/fgcz/PyFGCZ" +__version__ = "https://github.com/fgcz/fcc" import os import urllib @@ -98,10 +98,9 @@ import logging import logging.handlers import hashlib -import yaml -def create_logger(name="fcc", address=("fgcz-ms.uzh.ch", 514)): +def create_logger(name="fcc", address=("130.60.193.21", 514)): """ create a logger object """ @@ -121,7 +120,7 @@ def create_logger(name="fcc", address=("fgcz-ms.uzh.ch", 514)): class FgczCrawl(object): - def __init__(self, pattern=None, max_time_diff=None): + def __init__(self, pattern=None, max_time_diff=None, min_time_diff=300): """ """ self.para = {} @@ -137,7 +136,7 @@ def __init__(self, pattern=None, max_time_diff=None): self.regex_list = map(lambda p: re.compile(p), self.pattern_list) - self.para['min_time_diff'] = 300 + self.para['min_time_diff'] = min_time_diff if not max_time_diff is None: self.para['max_time_diff'] = max_time_diff else: @@ -163,12 +162,14 @@ def dfs_(self, path, idx): res.append(new_path) - res = filter(lambda f: time.time() - os.path.getmtime(f) > self.para[ - 'min_time_diff'] and time.time() - os.path.getmtime(f) < self.para['max_time_diff'], res) + + #logger.info("min_time_diff={}".format(self.para['min_time_diff'])) + res = filter(lambda f: time.time() - os.path.getctime(f) > self.para[ + 'min_time_diff'] and time.time() - os.path.getctime(f) < self.para['max_time_diff'], res) res = filter(lambda f: os.path.getsize(f) > self.para['min_size'] or os.path.isdir(f), res) - + return res @property @@ -254,7 +255,7 @@ def parseConfig(xml): rule = dict() try: converter = converterDict[i.attributes['converterID'].value] - for a in ("converterID", "converterDir", "converterCmd", "converterOptions", "toFileExt", "fromFileExt", "hostname"): + for a in ("converterDir", "converterCmd", "converterOptions", "toFileExt", "fromFileExt", "hostname"): rule[a] = converter[a] # hard constraints @@ -294,7 +295,7 @@ def getDetailsFromFilePath(filePath): return fileDetails -def matchFileToRules(fileDetails, rulesList, myHostname = None): +def matchFileToRules(fileDetails, rulesList, myHostname=None): """ returns rules that are matched to instrument RAW-files. NOTE: date cmp function assumes YYYYMMDD! @@ -310,7 +311,7 @@ def matchFileToRules(fileDetails, rulesList, myHostname = None): logger.debug("skipping" + filename + "because of file size is 0.") return matchedRules - timediff = time.time() - os.path.getmtime(filename) + timediff = time.time() - os.path.getctime(filename) # TODO(cp): should be a variable if timediff < 300: @@ -327,6 +328,8 @@ def matchFileToRules(fileDetails, rulesList, myHostname = None): regex = re.compile(".*{0}.*".format(rule["keyword"])) regex2 = re.compile(".*{0}.*".format(rule["converterDir"])) + + if (((fileDetails["project"] == rule["project"]) or ('' == rule["project"])) and (fileDetails["omics"] == rule["omics"]) and ((fileDetails["instrument"] == rule["instrument"]) or ('' == rule["instrument"])) and @@ -334,14 +337,30 @@ def matchFileToRules(fileDetails, rulesList, myHostname = None): (fileDetails["date"] >= rule["beginDate"]) and (fileDetails["date"] <= rule["endDate"]) and (fileDetails["extension"] == rule["fromFileExt"]) and - (regex.match(fileDetails["filePath"])) and - (re.search(myHostname, rule["hostname"]))): - if (regex2.match(fileDetails["filePath"])): + bool(regex.match(fileDetails["filePath"])) and + bool(re.search(myHostname, rule["hostname"]))): + + if (bool(regex2.match(fileDetails["filePath"]))): logger.debug("skipping '" + filename + "' because of recursion warning." + str( rule["converterDir"]) + " is already in the path.") continue + + + """ + print filename, timediff, \ + (fileDetails["project"] == rule["project"] or ('' == rule["project"])), \ + fileDetails["omics"] == rule["omics"], \ + ((fileDetails["instrument"] == rule["instrument"]) or ('' == rule["instrument"])), \ + ((fileDetails["user"] == rule["user"]) or ('' == rule["user"])), \ + (fileDetails["date"] >= rule["beginDate"]), \ + (fileDetails["date"] <= rule["endDate"]), \ + (fileDetails["extension"] == rule["fromFileExt"]), \ + (bool(regex.match(fileDetails["filePath"])) and bool((re.search(myHostname, rule["hostname"])))), \ + bool(re.search(myHostname, rule["hostname"])), \ + bool(regex2.match(fileDetails["filePath"])) + """ + matchedRules.append(rule) - # print rule except: pass return matchedRules @@ -359,20 +378,21 @@ def usage(): class Fcc: """ """ - parameters = {'config_url': "http://fgcz-ms.uzh.ch/config/fcc_config.xml", 'readme_url': "http://fgcz-r-021.uzh.ch/config/fcc_readme.txt", + parameters = {'config_url': "http://fgcz-ms.uzh.ch/config/fcc_config.xml", 'crawl_pattern': ['/srv/www/htdocs/Data2San/', 'p[0-9]{2,4}', 'Metabolomics', '(GCT)_[0-9]', '[a-z]{3,18}_[0-9]{8}(_[-a-zA-Z0-9_]{0,100}){0,1}', '[-a-zA-Z0-9_]+.(raw|RAW|wiff|wiff\.scan)'], - 'nCPU': 1, + 'nCPU': None, 'max_time_diff': 60 * 60 * 24 * 7 * 4, - 'sleepDuration': 300, + 'sleepDuration': 600, + 'min_time_diff': 300, 'loop': False, 'exec': False} myProcessId = os.getpid() - parameters['hostname'] = "{0}".format(socket.gethostbyaddr(socket.gethostname())[0].split('.')[0]) + myHostname = "{0}".format(socket.gethostbyaddr(socket.gethostname())[0].split('.')[0]) signal.signal(signal.SIGINT, signal_handler) myRootDir = None @@ -413,7 +433,7 @@ def read_config(self, url=''): logger.error("The XML config file is missing or malformed. Error: ") logger.error(sys.exc_info()[1]) print ("Unexpected error:", sys.exc_info()[1]) - raise + sys.exit(1) # TODO(cp): use lxml try: @@ -422,19 +442,6 @@ def read_config(self, url=''): logger.error("could not parse xml configuration") return None - """ - write all considered cmds into a file - """ - def update_processed_cmd(self, filename = r'C:\FGCZ\fcc\cmds_conducted.yaml'): - if self.parameters['exec']: - try: - os.rename(filename, "{}.bak".format(filename)) - except: - pass - with open(filename, "w") as f: - yaml.dump(self.processedCmdMD5Dict, f, default_flow_style=False) - - def process(self, file): """ computes a match and executes cmd (add to spool dir) @@ -450,8 +457,11 @@ def process(self, file): fileDir = os.path.dirname(file) fileDetails = getDetailsFromFilePath(file) + # print self.myHostname, fileDetails - matchingRules = matchFileToRules(fileDetails, self.rulesList, myHostname = self.parameters['hostname']) + matchingRules = matchFileToRules(fileDetails, self.rulesList, myHostname=self.myHostname) + + #print matchingRules if len(matchingRules) > 0: logger.debug( @@ -465,34 +475,13 @@ def process(self, file): """ create the directory in the python way, """ - # if not os.path.exists(converterDir) and self.parameters['exec']: - if not os.path.exists(converterDir): + if not os.path.exists(converterDir) and self.parameters['exec']: try: os.mkdir(converterDir) except: logger.error( "mkdir {0} failed.".format(converterDir)) - raise - - readme_filename = os.path.normpath("{0}/README.txt".format(converterDir)) - readme_content = """ -the files contained in this directory have been generated using fcc applying rule #{0}. - -more information can be found using the following url: - -http://fgcz-data.uzh.ch/config/fcc_config.xml#converterID-{0} - -or by contacting Christian Panse - -if you use these files in your publication please cite: -http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3614436/ - - -""".format(mrule["converterID"]) - - if os.path.isfile(readme_filename) is False: - with open(readme_filename, "w") as readmef: - readmef.write(readme_content) + sys.exit(1) toFileName = os.path.normpath( "{0}/{1}{2}".format(converterDir, @@ -500,6 +489,7 @@ def process(self, file): os.path.basename(file))[0], mrule["toFileExt"])) + # print "DEBUG", toFileName if not os.path.exists(toFileName): if mrule["project"] in countDict: countDict[mrule["project"]] = countDict[ @@ -509,31 +499,25 @@ def process(self, file): candCmdLine = createSystemBatch( file, toFileName, mrule) + checksum = hashlib.md5() checksum.update(candCmdLine.encode("utf-8")) candCmdLineMD5 = checksum.hexdigest() if not candCmdLineMD5 in self.processedCmdMD5Dict: - self.processedCmdMD5Dict[candCmdLineMD5] = candCmdLine - self.update_processed_cmd() if self.parameters['exec']: self.pool.map_async(myExecWorker0, [ candCmdLine ], callback=lambda i: logger.info("callback {0}".format(i))) logger.info("added|cmd='{}' to pool".format(candCmdLine)) - else: - # TODO(cp): make this generic working - with open("C:\\FGCZ\\fcc\Cmds-problems.txt", "a") as cmdf: - cmdf.write("{0}\t{1}\n".format(candCmdLineMD5, candCmdLine)) + def run(self): """ :return: """ - - - crawler = FgczCrawl(pattern=self.parameters['crawl_pattern'], max_time_diff=self.parameters['max_time_diff']) + crawler = FgczCrawl(pattern=self.parameters['crawl_pattern'], max_time_diff=self.parameters['max_time_diff'], min_time_diff=self.parameters['min_time_diff']) if not os.path.exists(os.path.normpath(self.parameters['crawl_pattern'][0])): logger.error("{0} does not exsist.".format(self.parameters('crawl_pattern')[0])) @@ -559,14 +543,15 @@ def run(self): except: logger.error("could not create pool.") - print sys.exc_info() sys.exit(1) while True: + logger.info("number of pending jobs in queue is {}.".format(len(self.pool._cache.keys()))) self.rulesList = self.read_config(self.parameters['config_url']) logger.debug("found {0} rules in {1}".format(len(self.rulesList), self.parameters['config_url'])) + logger.info("computing rule versus file matching ...") tStart = time.time()