Skip to content

Commit

Permalink
Added support for wildcard characters in path specifications; multiple directories may now also be specified
Browse files Browse the repository at this point in the history
  • Loading branch information
kbessonov1984 committed Jul 29, 2024
1 parent f8cd859 commit 74edb81
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 14 deletions.
12 changes: 11 additions & 1 deletion ectyper/commandLineOptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,19 @@ def checkdbversion():
"--input",
help="Location of E. coli genome file(s). Can be a single file, a \
comma-separated list of files, or a directory",
required=True
required=True,
nargs="+"
)

parser.add_argument(
"-d",
"--maxdepth",
help="Maximum number of directories to descend when searching an input directory of files",
default=1e6,
type=int,
required=False
)

parser.add_argument(
"-c",
"--cores",
Expand Down
2 changes: 1 addition & 1 deletion ectyper/ectyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def run_program():
os.makedirs(temp_dir, exist_ok=True)

LOG.info("Gathering genome files list ...")
input_files_list = genomeFunctions.get_files_as_list(args.input)
input_files_list = genomeFunctions.get_files_as_list(args.input, args.maxdepth)
raw_genome_files = decompress_gunzip_files(input_files_list, temp_dir)

LOG.info(f"Identifying genome file types on {len(raw_genome_files)} inputs ...")
Expand Down
44 changes: 32 additions & 12 deletions ectyper/genomeFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,14 @@

LOG = logging.getLogger(__name__)

def get_relative_directory_level(path = '.', init_min_level=0):
    """
    Compute the nesting level of a path relative to a baseline level.

    The level is the number of path separators (os.sep) in the absolute
    form of the path, minus the baseline, plus one when the path is an
    existing directory.

    Args:
        path: file or directory path; defaults to the current directory
        init_min_level: baseline separator count subtracted from the result

    Returns:
        int: the relative level of the path
    """
    separator_count = os.path.abspath(path).count(os.sep)
    # An absolute directory path carries no trailing separator, so a
    # directory is counted one level deeper than its separator count.
    directory_adjustment = 1 if os.path.isdir(path) else 0
    return separator_count - init_min_level + directory_adjustment


def get_files_as_list(file_or_directory):
def get_files_as_list(files_or_directories, max_depth_level):
"""
Creates a list of files from either the given file, or all files within the
directory specified (where each file name is its absolute path).
Expand All @@ -30,26 +35,41 @@ def get_files_as_list(file_or_directory):
"""

files_list = []
if file_or_directory:
if os.path.isdir(file_or_directory):
LOG.info("Gathering genomes from directory " + file_or_directory)
init_min_dir_level = min([os.path.abspath(p).count(os.sep)+1 if os.path.isdir(p) else os.path.abspath(p).count(os.sep) for p in files_or_directories])

for file_or_directory in sorted([os.path.abspath(p) for p in files_or_directories]):

dir_level_current = get_relative_directory_level(file_or_directory, init_min_dir_level)
LOG.info(f"Gathering genomes from directory {file_or_directory} at level {dir_level_current} ...")

if dir_level_current > max_depth_level:
LOG.info(f"Directory level exceeded ({dir_level_current} > {max_depth_level}), skipping directory {file_or_directory} ...")
continue

# if single directory is specified
if os.path.isdir(file_or_directory):
# Create a list containing the file names
for root, dirs, files in os.walk(file_or_directory):
for root, dirs, files in os.walk(os.path.abspath(file_or_directory)):
dir_level = get_relative_directory_level(root, init_min_dir_level)
LOG.info(f"In '{root}' level {dir_level} identified {len(dirs)} sub-directory(ies) and {len(files)} file(s) ...")
if dir_level > max_depth_level:
continue
for filename in files:
files_list.append(os.path.join(root, filename))
LOG.info(f"Identified {len(files_list)} genomes in {file_or_directory}")
# check if input is concatenated file locations separated by , (comma)
elif ',' in file_or_directory:
LOG.info("Using genomes in the input list separated by ','")
LOG.info("Using file paths in the input list separated by the ',' symbol ...")
missing_inputs_count = 0
for filename in file_or_directory.split(','):
if os.path.exists(os.path.abspath(filename)):
if os.path.exists(os.path.abspath(filename)) == True and os.path.isdir(filename) == False:
files_list.append(os.path.abspath(filename))
elif os.path.isdir(os.path.abspath(filename)):
LOG.warning(f"Provided {filename} is a directory and not a file. Only paths to files are acceptable in ',' separated list ...")
else:
LOG.warning(f"File {filename} not found in the ',' separated list")
missing_inputs_count += 1
LOG.info(f"Total of {len(files_list)} genomes identified with a valid path and {missing_inputs_count} missing")
LOG.info(f"Total of {len(files_list)} files identified with a valid path and {missing_inputs_count} are missing ...")
# a path to a file is specified
else:
LOG.info("Checking existence of file " + file_or_directory)
input_abs_file_path = os.path.abspath(file_or_directory)
Expand All @@ -62,8 +82,8 @@ def get_files_as_list(file_or_directory):
if not files_list:
LOG.critical("No files were found for the ectyper run")
raise FileNotFoundError("No files were found to run on")

sorted_files = sorted(files_list)
LOG.info(f"Overall identified {len(files_list)} file(s) to process ...");
sorted_files = sorted(list(set(files_list)))
LOG.debug(sorted_files)
return sorted_files

Expand Down

0 comments on commit 74edb81

Please sign in to comment.