Skip to content

Commit

Permalink
fixed #1
Browse files Browse the repository at this point in the history
  • Loading branch information
cpanse committed Aug 23, 2019
1 parent 9f3dab9 commit 5c330f1
Showing 1 changed file with 55 additions and 70 deletions.
125 changes: 55 additions & 70 deletions fgcz/fcc.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
#!/usr/bin/python
# -*- coding: latin1 -*-

# $HeadURL: https://github.com/fgcz/PyFGCZ/fcc.py $
# $HeadURL: http://fgcz-svn.uzh.ch/repos/fgcz/stable/proteomics/fcc/fcc.py $
# $Id: fcc.py 7518 2015-05-27 15:20:12Z cpanse $
# $Date: 2015-05-27 17:20:12 +0200 (Wed, 27 May 2015) $
# $Author: cpanse $


# Copyright 2008-2017
# Copyright 2008-2015
# Christian Panse <[email protected]>
# Simon Barkow-Oesterreicher
# Witold Eryk Wolski <[email protected]>
# Witold Eric Wolski <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -80,7 +80,7 @@
2012-12-04 handles directories as files, e.g., conversion of raw folders from Waters (waters.com) instruments (SB, CP)
2015-07-07 on github.com
"""
__version__ = "https://github.com/fgcz/PyFGCZ"
__version__ = "https://github.com/fgcz/fcc"

import os
import urllib
Expand All @@ -98,10 +98,9 @@
import logging
import logging.handlers
import hashlib
import yaml


def create_logger(name="fcc", address=("fgcz-ms.uzh.ch", 514)):
def create_logger(name="fcc", address=("130.60.193.21", 514)):
"""
create a logger object
"""
Expand All @@ -121,7 +120,7 @@ def create_logger(name="fcc", address=("fgcz-ms.uzh.ch", 514)):
class FgczCrawl(object):


def __init__(self, pattern=None, max_time_diff=None):
def __init__(self, pattern=None, max_time_diff=None, min_time_diff=300):
"""
"""
self.para = {}
Expand All @@ -137,7 +136,7 @@ def __init__(self, pattern=None, max_time_diff=None):

self.regex_list = map(lambda p: re.compile(p), self.pattern_list)

self.para['min_time_diff'] = 300
self.para['min_time_diff'] = min_time_diff
if not max_time_diff is None:
self.para['max_time_diff'] = max_time_diff
else:
Expand All @@ -163,12 +162,14 @@ def dfs_(self, path, idx):
res.append(new_path)


res = filter(lambda f: time.time() - os.path.getmtime(f) > self.para[
'min_time_diff'] and time.time() - os.path.getmtime(f) < self.para['max_time_diff'], res)

#logger.info("min_time_diff={}".format(self.para['min_time_diff']))
res = filter(lambda f: time.time() - os.path.getctime(f) > self.para[
'min_time_diff'] and time.time() - os.path.getctime(f) < self.para['max_time_diff'], res)
res = filter(lambda f: os.path.getsize(f) >
self.para['min_size'] or os.path.isdir(f), res)


return res

@property
Expand Down Expand Up @@ -254,7 +255,7 @@ def parseConfig(xml):
rule = dict()
try:
converter = converterDict[i.attributes['converterID'].value]
for a in ("converterID", "converterDir", "converterCmd", "converterOptions", "toFileExt", "fromFileExt", "hostname"):
for a in ("converterDir", "converterCmd", "converterOptions", "toFileExt", "fromFileExt", "hostname"):
rule[a] = converter[a]

# hard constraints
Expand Down Expand Up @@ -294,7 +295,7 @@ def getDetailsFromFilePath(filePath):
return fileDetails


def matchFileToRules(fileDetails, rulesList, myHostname = None):
def matchFileToRules(fileDetails, rulesList, myHostname=None):
"""
Returns the rules that match the given instrument RAW file.
NOTE: the date comparison assumes dates in YYYYMMDD format!
Expand All @@ -310,7 +311,7 @@ def matchFileToRules(fileDetails, rulesList, myHostname = None):
logger.debug("skipping" + filename + "because of file size is 0.")
return matchedRules

timediff = time.time() - os.path.getmtime(filename)
timediff = time.time() - os.path.getctime(filename)

# TODO(cp): the 300-second threshold below should be a variable
if timediff < 300:
Expand All @@ -327,21 +328,39 @@ def matchFileToRules(fileDetails, rulesList, myHostname = None):
regex = re.compile(".*{0}.*".format(rule["keyword"]))
regex2 = re.compile(".*{0}.*".format(rule["converterDir"]))



if (((fileDetails["project"] == rule["project"]) or ('' == rule["project"])) and
(fileDetails["omics"] == rule["omics"]) and
((fileDetails["instrument"] == rule["instrument"]) or ('' == rule["instrument"])) and
((fileDetails["user"] == rule["user"]) or ('' == rule["user"])) and
(fileDetails["date"] >= rule["beginDate"]) and
(fileDetails["date"] <= rule["endDate"]) and
(fileDetails["extension"] == rule["fromFileExt"]) and
(regex.match(fileDetails["filePath"])) and
(re.search(myHostname, rule["hostname"]))):
if (regex2.match(fileDetails["filePath"])):
bool(regex.match(fileDetails["filePath"])) and
bool(re.search(myHostname, rule["hostname"]))):

if (bool(regex2.match(fileDetails["filePath"]))):
logger.debug("skipping '" + filename + "' because of recursion warning." + str(
rule["converterDir"]) + " is already in the path.")
continue


"""
print filename, timediff, \
(fileDetails["project"] == rule["project"] or ('' == rule["project"])), \
fileDetails["omics"] == rule["omics"], \
((fileDetails["instrument"] == rule["instrument"]) or ('' == rule["instrument"])), \
((fileDetails["user"] == rule["user"]) or ('' == rule["user"])), \
(fileDetails["date"] >= rule["beginDate"]), \
(fileDetails["date"] <= rule["endDate"]), \
(fileDetails["extension"] == rule["fromFileExt"]), \
(bool(regex.match(fileDetails["filePath"])) and bool((re.search(myHostname, rule["hostname"])))), \
bool(re.search(myHostname, rule["hostname"])), \
bool(regex2.match(fileDetails["filePath"]))
"""

matchedRules.append(rule)
# print rule
except:
pass
return matchedRules
Expand All @@ -359,20 +378,21 @@ def usage():
class Fcc:
"""
"""
parameters = {'config_url': "http://fgcz-ms.uzh.ch/config/fcc_config.xml", 'readme_url': "http://fgcz-r-021.uzh.ch/config/fcc_readme.txt",
parameters = {'config_url': "http://fgcz-ms.uzh.ch/config/fcc_config.xml",
'crawl_pattern': ['/srv/www/htdocs/Data2San/',
'p[0-9]{2,4}', 'Metabolomics',
'(GCT)_[0-9]',
'[a-z]{3,18}_[0-9]{8}(_[-a-zA-Z0-9_]{0,100}){0,1}',
'[-a-zA-Z0-9_]+.(raw|RAW|wiff|wiff\.scan)'],
'nCPU': 1,
'nCPU': None,
'max_time_diff': 60 * 60 * 24 * 7 * 4,
'sleepDuration': 300,
'sleepDuration': 600,
'min_time_diff': 300,
'loop': False,
'exec': False}

myProcessId = os.getpid()
parameters['hostname'] = "{0}".format(socket.gethostbyaddr(socket.gethostname())[0].split('.')[0])
myHostname = "{0}".format(socket.gethostbyaddr(socket.gethostname())[0].split('.')[0])

signal.signal(signal.SIGINT, signal_handler)
myRootDir = None
Expand Down Expand Up @@ -413,7 +433,7 @@ def read_config(self, url=''):
logger.error("The XML config file is missing or malformed. Error: ")
logger.error(sys.exc_info()[1])
print ("Unexpected error:", sys.exc_info()[1])
raise
sys.exit(1)

# TODO(cp): use lxml
try:
Expand All @@ -422,19 +442,6 @@ def read_config(self, url=''):
logger.error("could not parse xml configuration")
return None

"""
write all considered cmds into a file
"""
def update_processed_cmd(self, filename = r'C:\FGCZ\fcc\cmds_conducted.yaml'):
if self.parameters['exec']:
try:
os.rename(filename, "{}.bak".format(filename))
except:
pass
with open(filename, "w") as f:
yaml.dump(self.processedCmdMD5Dict, f, default_flow_style=False)


def process(self, file):
"""
computes a match and executes cmd (add to spool dir)
Expand All @@ -450,8 +457,11 @@ def process(self, file):
fileDir = os.path.dirname(file)
fileDetails = getDetailsFromFilePath(file)

# print self.myHostname, fileDetails

matchingRules = matchFileToRules(fileDetails, self.rulesList, myHostname = self.parameters['hostname'])
matchingRules = matchFileToRules(fileDetails, self.rulesList, myHostname=self.myHostname)

#print matchingRules

if len(matchingRules) > 0:
logger.debug(
Expand All @@ -465,41 +475,21 @@ def process(self, file):
"""
create the directory in the python way,
"""
# if not os.path.exists(converterDir) and self.parameters['exec']:
if not os.path.exists(converterDir):
if not os.path.exists(converterDir) and self.parameters['exec']:
try:
os.mkdir(converterDir)
except:
logger.error(
"mkdir {0} failed.".format(converterDir))
raise

readme_filename = os.path.normpath("{0}/README.txt".format(converterDir))
readme_content = """
the files contained in this directory have been generated using fcc applying rule #{0}.
more information can be found using the following url:
http://fgcz-data.uzh.ch/config/fcc_config.xml#converterID-{0}
or by contacting Christian Panse <[email protected]>
if you use these files in your publication please cite:
http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3614436/
""".format(mrule["converterID"])

if os.path.isfile(readme_filename) is False:
with open(readme_filename, "w") as readmef:
readmef.write(readme_content)
sys.exit(1)

toFileName = os.path.normpath(
"{0}/{1}{2}".format(converterDir,
os.path.splitext(
os.path.basename(file))[0],
mrule["toFileExt"]))

# print "DEBUG", toFileName
if not os.path.exists(toFileName):
if mrule["project"] in countDict:
countDict[mrule["project"]] = countDict[
Expand All @@ -509,31 +499,25 @@ def process(self, file):

candCmdLine = createSystemBatch(
file, toFileName, mrule)

checksum = hashlib.md5()
checksum.update(candCmdLine.encode("utf-8"))
candCmdLineMD5 = checksum.hexdigest()

if not candCmdLineMD5 in self.processedCmdMD5Dict:

self.processedCmdMD5Dict[candCmdLineMD5] = candCmdLine
self.update_processed_cmd()
if self.parameters['exec']:
self.pool.map_async(myExecWorker0, [ candCmdLine ],
callback=lambda i: logger.info("callback {0}".format(i)))
logger.info("added|cmd='{}' to pool".format(candCmdLine))
else:
# TODO(cp): make this work generically (the log path below is hard-coded)
with open("C:\\FGCZ\\fcc\Cmds-problems.txt", "a") as cmdf:
cmdf.write("{0}\t{1}\n".format(candCmdLineMD5, candCmdLine))


def run(self):
"""
:return:
"""


crawler = FgczCrawl(pattern=self.parameters['crawl_pattern'], max_time_diff=self.parameters['max_time_diff'])
crawler = FgczCrawl(pattern=self.parameters['crawl_pattern'], max_time_diff=self.parameters['max_time_diff'], min_time_diff=self.parameters['min_time_diff'])

if not os.path.exists(os.path.normpath(self.parameters['crawl_pattern'][0])):
logger.error("{0} does not exsist.".format(self.parameters('crawl_pattern')[0]))
Expand All @@ -559,14 +543,15 @@ def run(self):

except:
logger.error("could not create pool.")
print sys.exc_info()
sys.exit(1)

while True:
logger.info("number of pending jobs in queue is {}.".format(len(self.pool._cache.keys())))
self.rulesList = self.read_config(self.parameters['config_url'])
logger.debug("found {0} rules in {1}".format(len(self.rulesList), self.parameters['config_url']))



logger.info("computing rule versus file matching ...")
tStart = time.time()

Expand Down

0 comments on commit 5c330f1

Please sign in to comment.