Skip to content

Commit

Permalink
Feature/v1.7.0 -- adds parquet support (#114)
Browse files Browse the repository at this point in the history
* working parquet

* incorp Mauro's updates (pNegECDF and infer switch) into v1.7.0 (#112)

* Feature/v1.6.3 (#108)

* GEO download much improved and should handle more datasets; passes tests

* version fix

* changed test dataset to 450k, b/c 27k not fully supported

* correct both addressA and addressB for OOB probes when channel swapping (#111)

* correct both addressA and addressB for OOB probes

* cleaner solution to getting address A and B from lookup

* tests passing; changed ref values slightly

* Addition of negative control based pvalue calculation (#109)

* Update pOOBAH pval calculation

* added feature to output negative control based pvalue - pNegECDF_pval

Co-authored-by: Chavez, Mauro <[email protected]>

Co-authored-by: Mauro Chavez <[email protected]>
Co-authored-by: Chavez, Mauro <[email protected]>

Co-authored-by: Mauro Chavez <[email protected]>
Co-authored-by: Chavez, Mauro <[email protected]>
  • Loading branch information
3 people authored Jul 8, 2022
1 parent 9828e71 commit 8a540ce
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 120 deletions.
8 changes: 7 additions & 1 deletion docs/release-history.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
# Release History

## v1.7.0
- added support for `parquet` file formats (as an alternative to pickle that is readable
by other languages)
- run_pipeline (beta, M, noob, raw_meth/unmeth, samplesheet_data_frame) as parquet
- minor fixes to GEO download processing

## v1.6.2
- Minor bug fixes

## v1.6.1
- samplesheet: ensures csv and meta_data pickle match
Expand Down
12 changes: 10 additions & 2 deletions methylprep/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,13 @@ def cli_process(cmd_args):
help='If specified, exports a pickled dataframe of the poobah p-values per sample.'
)

parser.add_argument(
'-f', '--file_format',
required=False,
default='pickle',
help='Specify `parquet` instead of default `pickle`'
)

parser.add_argument(
'--minfi',
required=False,
Expand Down Expand Up @@ -285,9 +292,10 @@ def cli_process(cmd_args):
export_poobah=args.export_poobah,
quality_mask=(not args.no_quality_mask),
sesame=(not args.minfi), # default 'sesame' method can be turned off using --minfi,
pneg_ecdf=args.pneg_ecdf
pneg_ecdf=args.pneg_ecdf,
file_format=args.file_format
)


def cli_beta_bakery(cmd_args):
parser = DefaultParser(
Expand Down
19 changes: 18 additions & 1 deletion methylprep/download/geo.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,8 @@ def download_geo_processed(geo_id, working, verbose=False, use_headers=False):
# delete gzip
if Path(unzipped_file).exists():
saved_file_path.unlink()
if Path(unzipped_file).stat().st_size < 100000:
LOGGER.warning(f"Series Matrix file size ({Path(unzipped_file).stat().st_size/1000}K) is too small to contain beta_values.")
import methylcheck
data = methylcheck.read_geo_processed.read_series_matrix(unzipped_file, include_headers_df=True)
if verbose:
Expand All @@ -739,7 +741,20 @@ def download_geo_processed(geo_id, working, verbose=False, use_headers=False):
with open(Path(Path(unzipped_file).parent, f"{geo_id}_series_summary.json"), 'w', encoding='utf8') as f:
json.dump(data['series_dict'],f)
if isinstance(data.get('df'), pd.DataFrame): # betas
data['df'].to_pickle(Path(Path(unzipped_file).parent, f"{geo_id}_beta_values.pkl"))
if len(data['df']) == 0:
LOGGER.error(f"beta values DataFrame is empty: {data['df'].shape}")
downloaded_files = False # the processed data file (series matrix) was empty, so setting to False
else:
data['df'].to_pickle(Path(Path(unzipped_file).parent, f"{geo_id}_beta_values.pkl"))
# if not compressing later, move this file out of tempfolder
if compress != True:
current = Path(unzipped_file).parent
parent = Path(unzipped_file).parent.parent
# shutil.move(unzipped_file, str(parent)) --- the .txt file is no longer needed. everything is repackaged.
shutil.move(str(Path(current, f"{geo_id}_samplesheet.csv")), str(parent))
shutil.move(str(Path(current, f"{geo_id}_series_summary.json")), str(parent))
shutil.move(str(Path(current, f"{geo_id}_beta_values.pkl")), str(parent))
continue
except Exception as e:
LOGGER.info(f"Series_matrix download failed: {e}, trying other saved files")
import traceback
Expand Down Expand Up @@ -1090,6 +1105,8 @@ def samplesheet_from_series_matrix(df):
OVERWRITE_WARNINGS[key.strip()] += 1
else:
new[key.strip()] = value.strip()
elif item == '':
continue
else:
LOGGER.warning(f"Characteristic '{item}' not understood")
if len(OVERWRITE_WARNINGS) > 0:
Expand Down
4 changes: 2 additions & 2 deletions methylprep/models/samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,6 @@ def _build_and_verify_path(self, filename, alt_filename=None, allow_compressed=F
LOGGER.warning(f'Multiple ({len(file_matches)}) files matched {alt_filename} -- saved path to first one: {file_matches[0]}')
return file_matches[0]

def get_export_filepath(self):
def get_export_filepath(self, extension='csv'):
""" Called by run_pipeline to find the folder/filename to export data as CSV, but CSV file doesn't exist yet."""
return self.get_filepath('csv', 'processed', verify=False)
return self.get_filepath(extension, 'processed', verify=False)
Loading

0 comments on commit 8a540ce

Please sign in to comment.