Showing 1 changed file (fcc.py) with 55 additions and 70 deletions.
@@ -1,16 +1,16 @@
 #!/usr/bin/python
 # -*- coding: latin1 -*-

-# $HeadURL: https://github.com/fgcz/PyFGCZ/fcc.py $
+# $HeadURL: http://fgcz-svn.uzh.ch/repos/fgcz/stable/proteomics/fcc/fcc.py $
 # $Id: fcc.py 7518 2015-05-27 15:20:12Z cpanse $
 # $Date: 2015-05-27 17:20:12 +0200 (Wed, 27 May 2015) $
 # $Author: cpanse $


-# Copyright 2008-2017
+# Copyright 2008-2015
 # Christian Panse <[email protected]>
 # Simon Barkow-Oesterreicher
-# Witold Eryk Wolski <[email protected]>
+# Witold Eric Wolski <[email protected]>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -80,7 +80,7 @@
 2012-12-04 handles dirs as files, e.g. conversion of waters.com instruments raw folders (SB,CP)
 2015-07-07 on github.com
 """
-__version__ = "https://github.com/fgcz/PyFGCZ"
+__version__ = "https://github.com/fgcz/fcc"

 import os
 import urllib
@@ -98,10 +98,9 @@
 import logging
 import logging.handlers
 import hashlib
-import yaml


-def create_logger(name="fcc", address=("fgcz-ms.uzh.ch", 514)):
+def create_logger(name="fcc", address=("130.60.193.21", 514)):
 """
 create a logger object
 """
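Only the signature of create_logger changes here; its body is not part of the hunk. As context, a minimal sketch of a syslog-backed logger matching that signature, assuming the standard library's logging.handlers.SysLogHandler (the format string is illustrative, not taken from fcc.py):

    import logging
    import logging.handlers

    def create_logger(name="fcc", address=("130.60.193.21", 514)):
        """Return a logger that forwards records to a remote syslog daemon."""
        logger = logging.getLogger(name)
        logger.setLevel(logging.INFO)
        # SysLogHandler speaks the syslog protocol, by default over UDP port 514.
        handler = logging.handlers.SysLogHandler(address=address)
        handler.setFormatter(logging.Formatter("%(name)s %(levelname)s: %(message)s"))
        logger.addHandler(handler)
        return logger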
@@ -121,7 +120,7 @@ def create_logger(name="fcc", address=("fgcz-ms.uzh.ch", 514)):
 class FgczCrawl(object):


-def __init__(self, pattern=None, max_time_diff=None):
+def __init__(self, pattern=None, max_time_diff=None, min_time_diff=300):
 """
 """
 self.para = {}
@@ -137,7 +136,7 @@ def __init__(self, pattern=None, max_time_diff=None):

 self.regex_list = map(lambda p: re.compile(p), self.pattern_list)

-self.para['min_time_diff'] = 300
+self.para['min_time_diff'] = min_time_diff
 if not max_time_diff is None:
 self.para['max_time_diff'] = max_time_diff
 else:
@@ -163,12 +162,14 @@ def dfs_(self, path, idx):
 res.append(new_path)


-res = filter(lambda f: time.time() - os.path.getmtime(f) > self.para[
-'min_time_diff'] and time.time() - os.path.getmtime(f) < self.para['max_time_diff'], res)

+#logger.info("min_time_diff={}".format(self.para['min_time_diff']))
+res = filter(lambda f: time.time() - os.path.getctime(f) > self.para[
+'min_time_diff'] and time.time() - os.path.getctime(f) < self.para['max_time_diff'], res)
 res = filter(lambda f: os.path.getsize(f) >
 self.para['min_size'] or os.path.isdir(f), res)


 return res

 @property
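The crawler's age filter switches from os.path.getmtime to os.path.getctime while keeping the same window between min_time_diff and max_time_diff. A standalone sketch of that filtering idea, using the 300-second and four-week defaults that appear elsewhere in the file (note that getctime reports inode change time on Unix and creation time on Windows):

    import os
    import time

    def filter_by_age(paths, min_time_diff=300, max_time_diff=60 * 60 * 24 * 7 * 4):
        """Keep only paths whose ctime-based age falls inside the (min, max) window."""
        now = time.time()
        return [p for p in paths
                if min_time_diff < now - os.path.getctime(p) < max_time_diff]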
@@ -254,7 +255,7 @@ def parseConfig(xml):
 rule = dict()
 try:
 converter = converterDict[i.attributes['converterID'].value]
-for a in ("converterID", "converterDir", "converterCmd", "converterOptions", "toFileExt", "fromFileExt", "hostname"):
+for a in ("converterDir", "converterCmd", "converterOptions", "toFileExt", "fromFileExt", "hostname"):
 rule[a] = converter[a]

 # hard constraints
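With "converterID" dropped from the loop, a rule now inherits only the conversion-related attributes of its converter. The loop amounts to copying selected keys from one dict into another; a sketch with a made-up converter entry (real entries come from fcc_config.xml):

    # Hypothetical converter entry; field names follow the loop above.
    converter = {"converterDir": "mzXML", "converterCmd": "ReAdW.exe",
                 "converterOptions": "--mzXML", "toFileExt": ".mzXML",
                 "fromFileExt": ".RAW", "hostname": "fgcz-i-180"}

    rule = {}
    for a in ("converterDir", "converterCmd", "converterOptions",
              "toFileExt", "fromFileExt", "hostname"):
        rule[a] = converter[a]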
@@ -294,7 +295,7 @@ def getDetailsFromFilePath(filePath):
 return fileDetails


-def matchFileToRules(fileDetails, rulesList, myHostname = None):
+def matchFileToRules(fileDetails, rulesList, myHostname=None):
 """
 returns rules that are matched to instrument RAW-files.
 NOTE: date cmp function assumes YYYYMMDD!
@@ -310,7 +311,7 @@ def matchFileToRules(fileDetails, rulesList, myHostname = None):
 logger.debug("skipping" + filename + "because of file size is 0.")
 return matchedRules

-timediff = time.time() - os.path.getmtime(filename)
+timediff = time.time() - os.path.getctime(filename)

 # TODO(cp): should be a variable
 if timediff < 300:
@@ -327,21 +328,39 @@ def matchFileToRules(fileDetails, rulesList, myHostname = None):
 regex = re.compile(".*{0}.*".format(rule["keyword"]))
 regex2 = re.compile(".*{0}.*".format(rule["converterDir"]))

 if (((fileDetails["project"] == rule["project"]) or ('' == rule["project"])) and
 (fileDetails["omics"] == rule["omics"]) and
 ((fileDetails["instrument"] == rule["instrument"]) or ('' == rule["instrument"])) and
 ((fileDetails["user"] == rule["user"]) or ('' == rule["user"])) and
 (fileDetails["date"] >= rule["beginDate"]) and
 (fileDetails["date"] <= rule["endDate"]) and
 (fileDetails["extension"] == rule["fromFileExt"]) and
-(regex.match(fileDetails["filePath"])) and
-(re.search(myHostname, rule["hostname"]))):
-if (regex2.match(fileDetails["filePath"])):
+bool(regex.match(fileDetails["filePath"])) and
+bool(re.search(myHostname, rule["hostname"]))):
+
+if (bool(regex2.match(fileDetails["filePath"]))):
 logger.debug("skipping '" + filename + "' because of recursion warning." + str(
 rule["converterDir"]) + " is already in the path.")
 continue

+"""
+print filename, timediff, \
+(fileDetails["project"] == rule["project"] or ('' == rule["project"])), \
+fileDetails["omics"] == rule["omics"], \
+((fileDetails["instrument"] == rule["instrument"]) or ('' == rule["instrument"])), \
+((fileDetails["user"] == rule["user"]) or ('' == rule["user"])), \
+(fileDetails["date"] >= rule["beginDate"]), \
+(fileDetails["date"] <= rule["endDate"]), \
+(fileDetails["extension"] == rule["fromFileExt"]), \
+(bool(regex.match(fileDetails["filePath"])) and bool((re.search(myHostname, rule["hostname"])))), \
+bool(re.search(myHostname, rule["hostname"])), \
+bool(regex2.match(fileDetails["filePath"]))
+"""

 matchedRules.append(rule)
 # print rule
 except:
 pass
 return matchedRules
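re.match and re.search return a match object or None, so wrapping them in bool() leaves the if statement's outcome unchanged; it mainly makes the commented-out diagnostic print above emit True/False instead of match-object reprs. A small illustration (pattern and path are invented):

    import re

    regex = re.compile(".*Metabolomics.*")   # stand-in for a rule keyword
    path = "/srv/www/htdocs/Data2San/p1000/Metabolomics/GCT_1/example_20150707/sample.RAW"

    print(regex.match(path))         # match object or None
    print(bool(regex.match(path)))   # True or False, easier to read in a debug dump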
@@ -359,20 +378,21 @@ def usage():
 class Fcc:
 """
 """
-parameters = {'config_url': "http://fgcz-ms.uzh.ch/config/fcc_config.xml", 'readme_url': "http://fgcz-r-021.uzh.ch/config/fcc_readme.txt",
+parameters = {'config_url': "http://fgcz-ms.uzh.ch/config/fcc_config.xml",
 'crawl_pattern': ['/srv/www/htdocs/Data2San/',
 'p[0-9]{2,4}', 'Metabolomics',
 '(GCT)_[0-9]',
 '[a-z]{3,18}_[0-9]{8}(_[-a-zA-Z0-9_]{0,100}){0,1}',
 '[-a-zA-Z0-9_]+.(raw|RAW|wiff|wiff\.scan)'],
-'nCPU': 1,
+'nCPU': None,
 'max_time_diff': 60 * 60 * 24 * 7 * 4,
-'sleepDuration': 300,
+'sleepDuration': 600,
+'min_time_diff': 300,
 'loop': False,
 'exec': False}

 myProcessId = os.getpid()
-parameters['hostname'] = "{0}".format(socket.gethostbyaddr(socket.gethostname())[0].split('.')[0])
+myHostname = "{0}".format(socket.gethostbyaddr(socket.gethostname())[0].split('.')[0])

 signal.signal(signal.SIGINT, signal_handler)
 myRootDir = None
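The short host name moves from the parameters dict into its own class attribute; the expression itself is unchanged. For reference, a sketch of what it computes (the example value is illustrative):

    import socket

    # gethostbyaddr() returns (fqdn, aliaslist, ipaddrlist); the first label of
    # the FQDN is the short host name, e.g. "fgcz-r-021" for "fgcz-r-021.uzh.ch".
    myHostname = socket.gethostbyaddr(socket.gethostname())[0].split('.')[0]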
@@ -413,7 +433,7 @@ def read_config(self, url=''):
 logger.error("The XML config file is missing or malformed. Error: ")
 logger.error(sys.exc_info()[1])
 print ("Unexpected error:", sys.exc_info()[1])
-raise
+sys.exit(1)

 # TODO(cp): use lxml
 try:
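Replacing raise with sys.exit(1) turns a missing or malformed config into a clean process exit rather than an uncaught exception in the caller. Only fragments of read_config are visible in this diff; a condensed sketch of fetching and parsing such an XML config with the standard library, written for Python 2 like fcc.py itself (error handling simplified):

    import sys
    import urllib
    from xml.dom import minidom

    def read_config(url="http://fgcz-ms.uzh.ch/config/fcc_config.xml"):
        try:
            # Download the config and parse it into a DOM document.
            return minidom.parseString(urllib.urlopen(url).read())
        except Exception as e:
            print("The XML config file is missing or malformed: {0}".format(e))
            sys.exit(1)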
@@ -422,19 +442,6 @@ def read_config(self, url=''):
 logger.error("could not parse xml configuration")
 return None

-"""
-write all considered cmds into a file
-"""
-def update_processed_cmd(self, filename = r'C:\FGCZ\fcc\cmds_conducted.yaml'):
-if self.parameters['exec']:
-try:
-os.rename(filename, "{}.bak".format(filename))
-except:
-pass
-with open(filename, "w") as f:
-yaml.dump(self.processedCmdMD5Dict, f, default_flow_style=False)

 def process(self, file):
 """
 computes a match and executes cmd (add to spool dir)
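With update_processed_cmd gone (along with the yaml import above), the MD5-to-command bookkeeping lives only in memory for the lifetime of the process. For comparison, the removed helper boiled down to rotating the previous dump and writing the dict as YAML; a condensed, generic sketch (generic filename, and PyYAML's safe_dump used here instead of dump):

    import os
    import yaml  # PyYAML

    def dump_processed_cmds(cmd_dict, filename="cmds_conducted.yaml"):
        """Rotate the previous dump and write the md5 -> command mapping as YAML."""
        if os.path.exists(filename):
            os.rename(filename, "{0}.bak".format(filename))
        with open(filename, "w") as f:
            yaml.safe_dump(cmd_dict, f, default_flow_style=False)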
@@ -450,8 +457,11 @@ def process(self, file):
 fileDir = os.path.dirname(file)
 fileDetails = getDetailsFromFilePath(file)

+# print self.myHostname, fileDetails

-matchingRules = matchFileToRules(fileDetails, self.rulesList, myHostname = self.parameters['hostname'])
+matchingRules = matchFileToRules(fileDetails, self.rulesList, myHostname=self.myHostname)

+#print matchingRules

 if len(matchingRules) > 0:
 logger.debug(
@@ -465,41 +475,21 @@ def process(self, file):
 """
 create the directory in the python way,
 """
-# if not os.path.exists(converterDir) and self.parameters['exec']:
-if not os.path.exists(converterDir):
+if not os.path.exists(converterDir) and self.parameters['exec']:
 try:
 os.mkdir(converterDir)
 except:
 logger.error(
 "mkdir {0} failed.".format(converterDir))
 raise

-readme_filename = os.path.normpath("{0}/README.txt".format(converterDir))
-readme_content = """
-the files contained in this directory have been generated using fcc applying rule #{0}.
-more information can be found using the following url:
-http://fgcz-data.uzh.ch/config/fcc_config.xml#converterID-{0}
-or by contacting Christian Panse <[email protected]>
-if you use these files in your publication please cite:
-http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3614436/
-""".format(mrule["converterID"])
-
-if os.path.isfile(readme_filename) is False:
-with open(readme_filename, "w") as readmef:
-readmef.write(readme_content)
-sys.exit(1)

 toFileName = os.path.normpath(
 "{0}/{1}{2}".format(converterDir,
 os.path.splitext(
 os.path.basename(file))[0],
 mrule["toFileExt"]))

-# print "DEBUG", toFileName
 if not os.path.exists(toFileName):
 if mrule["project"] in countDict:
 countDict[mrule["project"]] = countDict[
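The output directory is now only created when the 'exec' parameter is set, so a dry run no longer touches the file system, and the per-rule README.txt is no longer written at all (its 'readme_url' companion parameter was already dropped above). A minimal sketch of the guarded directory creation (parameter dict and target path are placeholders):

    import os
    import logging

    logger = logging.getLogger("fcc")
    parameters = {"exec": False}          # placeholder; in fcc this comes from the parameters dict
    converterDir = "/tmp/fcc-demo-mzXML"  # placeholder target directory

    # Only touch the file system when execution is actually enabled.
    if not os.path.exists(converterDir) and parameters["exec"]:
        try:
            os.mkdir(converterDir)
        except OSError:
            logger.error("mkdir {0} failed.".format(converterDir))
            raise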
@@ -509,31 +499,25 @@ def process(self, file):

 candCmdLine = createSystemBatch(
 file, toFileName, mrule)

 checksum = hashlib.md5()
 checksum.update(candCmdLine.encode("utf-8"))
 candCmdLineMD5 = checksum.hexdigest()

 if not candCmdLineMD5 in self.processedCmdMD5Dict:

 self.processedCmdMD5Dict[candCmdLineMD5] = candCmdLine
-self.update_processed_cmd()
 if self.parameters['exec']:
 self.pool.map_async(myExecWorker0, [ candCmdLine ],
 callback=lambda i: logger.info("callback {0}".format(i)))
 logger.info("added|cmd='{}' to pool".format(candCmdLine))
-else:
-# TODO(cp): make this generic working
-with open("C:\\FGCZ\\fcc\Cmds-problems.txt", "a") as cmdf:
-cmdf.write("{0}\t{1}\n".format(candCmdLineMD5, candCmdLine))


 def run(self):
 """
 :return:
 """

-crawler = FgczCrawl(pattern=self.parameters['crawl_pattern'], max_time_diff=self.parameters['max_time_diff'])
+crawler = FgczCrawl(pattern=self.parameters['crawl_pattern'], max_time_diff=self.parameters['max_time_diff'], min_time_diff=self.parameters['min_time_diff'])

 if not os.path.exists(os.path.normpath(self.parameters['crawl_pattern'][0])):
 logger.error("{0} does not exsist.".format(self.parameters('crawl_pattern')[0]))
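The surrounding logic in compact form: hash the candidate command line, remember the digest, and hand genuinely new commands to a multiprocessing pool. A self-contained sketch with a trivial stand-in for myExecWorker0:

    import hashlib
    import multiprocessing


    def exec_worker(cmd):
        """Stand-in for myExecWorker0; the real worker runs the conversion command."""
        return "would run: {0}".format(cmd)


    if __name__ == "__main__":
        pool = multiprocessing.Pool(processes=1)
        processed = {}  # md5 hex digest -> command line

        candidates = ["convert a.raw a.mzXML", "convert a.raw a.mzXML", "convert b.raw b.mzXML"]
        for cmd in candidates:
            digest = hashlib.md5(cmd.encode("utf-8")).hexdigest()
            if digest in processed:
                continue                           # identical command already queued
            processed[digest] = cmd
            pool.map_async(exec_worker, [cmd],
                           callback=lambda res: None)  # fcc logs the callback result here

        pool.close()
        pool.join()

map_async returns immediately; the callback runs in the parent process once the worker finishes, which is where fcc emits its "callback ..." log line.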
@@ -559,14 +543,15 @@ def run(self):

 except:
 logger.error("could not create pool.")
 print sys.exc_info()
 sys.exit(1)

 while True:
+logger.info("number of pending jobs in queue is {}.".format(len(self.pool._cache.keys())))
 self.rulesList = self.read_config(self.parameters['config_url'])
 logger.debug("found {0} rules in {1}".format(len(self.rulesList), self.parameters['config_url']))


 logger.info("computing rule versus file matching ...")
 tStart = time.time()
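The pending-job count added in this last hunk reads self.pool._cache, which is not public API: in CPython's multiprocessing.Pool it is a dict mapping job ids to not-yet-collected result objects, emptied as results arrive. A tiny sketch of the same measurement (worker and inputs are made up):

    import multiprocessing
    import time


    def slow(x):
        time.sleep(1)
        return x


    if __name__ == "__main__":
        pool = multiprocessing.Pool(processes=1)
        pool.map_async(slow, [1, 2, 3])
        # One map_async call contributes one entry to the internal _cache dict,
        # so this prints 1 until the whole map job has been collected.
        print(len(pool._cache.keys()))
        pool.close()
        pool.join()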