Skip to content

Commit

Permalink
Added a number of tests for the preprocessing step
Browse files Browse the repository at this point in the history
  • Loading branch information
weiju committed Mar 11, 2024
1 parent 9934d85 commit ed9ef03
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 55 deletions.
14 changes: 5 additions & 9 deletions miner/miner.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,20 +162,16 @@ def save_response_content(response, destination):
# Functions used for pre-processing data
# =============================================================================

def removeNullRows(df):

def remove_null_rows(df):
    """Drop rows that sum to zero from a frame whose minimum value is 0.

    Rows are only filtered when the smallest value in ``df`` is exactly 0.
    If the minimum is anything else (for example, the frame contains
    negative values), ``df`` is returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data frame to filter.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when nothing is filtered, otherwise the subset of rows
        whose row sum is greater than 0.
    """
    # An empty frame has no rows to drop, and np.percentile is not
    # well-defined on empty input, so hand it back untouched.
    if df.empty:
        return df

    # The 0th percentile is the overall minimum across all cells.
    minimum = np.percentile(df, 0)
    if minimum != 0:
        return df

    # With a minimum of 0 no value is negative, so a row sums to 0 only
    # when every entry in it is 0.
    return df.loc[df.sum(axis=1) > 0, :]

def convertToEnsembl(df,conversionTable,input_format=None):

from collections import Counter

# Index Conversion table on ENSG notation
conversionTableEnsg = conversionTable.copy()
conversionTableEnsg.index = conversionTableEnsg.iloc[:,0]
Expand Down Expand Up @@ -554,7 +550,7 @@ def zscore(expressionData):
print("completed z-transformation.")
return transform

def correctBatchEffects(df, do_preprocess_tpm):
def correct_batch_effects(df, do_preprocess_tpm):

zscoredExpression = zscore(df)
means = []
Expand All @@ -573,16 +569,16 @@ def correctBatchEffects(df, do_preprocess_tpm):

def preprocess(filename, mapfile, convert_ids=True, do_preprocess_tpm=True):
# Loads `filename` with readFileToDf, filters it with remove_null_rows, passes the
# result (together with `do_preprocess_tpm`) to correct_batch_effects and, when
# `convert_ids` is true, hands the outcome plus `mapfile` to identifierConversion.
#
# Returns a 2-tuple (expressionData, conversionTable) when `convert_ids` is true,
# otherwise only the frame returned by correct_batch_effects.
# NOTE(review): the return shape depends on `convert_ids`; callers must unpack accordingly.
rawExpression = readFileToDf(filename)
# NOTE(review): this text is a rendered diff. The next two calls use the
# pre-rename camelCase names and the two after them use the snake_case names;
# presumably only the snake_case pair is present in the committed file -- confirm.
rawExpressionZeroFiltered = removeNullRows(rawExpression)
zscoredExpression = correctBatchEffects(rawExpressionZeroFiltered, do_preprocess_tpm)
rawExpressionZeroFiltered = remove_null_rows(rawExpression)
zscoredExpression = correct_batch_effects(rawExpressionZeroFiltered, do_preprocess_tpm)
if convert_ids:
# identifierConversion yields both the converted data and the table it used.
expressionData, conversionTable = identifierConversion(zscoredExpression, mapfile)
return expressionData, conversionTable
else:
return zscoredExpression

# =============================================================================
# Functions used for clustering
# Functions used for clustering
# =============================================================================

def pearson_array(array,vector):
Expand Down
90 changes: 44 additions & 46 deletions test/preprocess_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,65 +3,63 @@
import os
import pytest

#import pandas as pd
import pandas as pd
from miner import miner

"""

class PreprocessTest(unittest.TestCase):
def test_remove_null_rows_min_0_remove_ok():
    """Minimum is 0 and one row is all zeros: that row must be dropped."""
    rows = [[0, 1, 2], [1, 2, 3], [0, 0, 0], [4, 5, 6]]
    filtered = miner.remove_null_rows(pd.DataFrame(rows))
    n_rows = filtered.shape[0]
    assert 3 == n_rows, "wrong number of rows"

def test_remove_null_rows_min_0_remove_ok(self):
df = pd.DataFrame([[0, 1, 2], [1, 2, 3], [0, 0, 0], [4, 5, 6]])
df2 = miner.remove_null_rows(df)
self.assertEqual(3, df2.shape[0], "wrong number of rows")
def test_remove_null_rows_min_0_unchanged():
    """Minimum is 0 but no row is all zeros: every row must be kept."""
    rows = [[0, 1, 2], [1, 2, 3], [1, 0, 1], [4, 5, 6]]
    filtered = miner.remove_null_rows(pd.DataFrame(rows))
    n_rows = filtered.shape[0]
    assert 4 == n_rows, "wrong number of rows"

def test_remove_null_rows_min_0_unchanged(self):
df = pd.DataFrame([[0, 1, 2], [1, 2, 3], [1, 0, 1], [4, 5, 6]])
df2 = miner.remove_null_rows(df)
self.assertEqual(4, df2.shape[0], "wrong number of rows")
def test_remove_null_rows_min_negative_unchanged():
    """Minimum is negative: nothing is filtered, even the all-zero row."""
    rows = [[0, 1, -2], [1, 2, 3], [0, 0, 0], [4, 5, 6]]
    filtered = miner.remove_null_rows(pd.DataFrame(rows))
    n_rows = filtered.shape[0]
    assert 4 == n_rows, "wrong number of rows"

def test_remove_null_rows_min_negative_unchanged(self):
df = pd.DataFrame([[0, 1, -2], [1, 2, 3], [0, 0, 0], [4, 5, 6]])
df2 = miner.remove_null_rows(df)
self.assertEqual(4, df2.shape[0], "wrong number of rows")
EPS = 0.001

# NOTE(review): rendered diff -- the pytest-style function and the unittest-style
# method of the same name are interleaved below; the `self.assert...` lines belong
# to the method version, the bare `assert` lines to the function version.
def test_correct_batch_effects_tpm():
# large means to trigger the TPM function
# NOTE(review): do_preprocess_tpm is passed as False on the call below even though
# the comment above talks about triggering the TPM function -- confirm which code
# path this test is meant to exercise.
df = pd.DataFrame([[4, 1, 2], [1, 2, 3], [4, 5, 6]])
df2 = miner.correct_batch_effects(df, False)
assert (3, 3) == df2.shape
# Column 0 of the result, compared with an absolute tolerance of EPS.
assert abs(df2.values[0, 0] - 1.0910894511799618) < EPS
assert abs(df2.values[1, 0] - (-1.0)) < EPS
assert abs(df2.values[2, 0] - (-1.0)) < EPS

def test_correct_batch_effects_tpm(self):
# large means to trigger the TPM function
df = pd.DataFrame([[4, 1, 2], [1, 2, 3], [4, 5, 6]])
df2 = miner.correct_batch_effects(df, False)
# NOTE(review): assertEquals / assertAlmostEquals are deprecated unittest aliases
# (assertEqual / assertAlmostEqual are the current names).
self.assertEquals((3, 3), df2.shape)
self.assertAlmostEquals(df2.values[0, 0], 1.0910894511799618)
self.assertAlmostEquals(df2.values[1, 0], -1.0)
self.assertAlmostEquals(df2.values[2, 0], -1.0)
# Column 1 of the result.
assert abs(df2.values[0, 1] - (-0.8728715609439697)) < EPS
assert abs(df2.values[1, 1] - 0.0) < EPS
assert abs(df2.values[2, 1] - 0.0) < EPS

self.assertAlmostEquals(df2.values[0, 1], -0.8728715609439697)
self.assertAlmostEquals(df2.values[1, 1], 0.0)
self.assertAlmostEquals(df2.values[2, 1], 0.0)
# Column 2 of the result.
assert abs(df2.values[0, 2] - (-0.2182178902359925)) < EPS
assert abs(df2.values[1, 2] - 1.0) < EPS
assert abs(df2.values[2, 2] - 1.0) < EPS

self.assertAlmostEquals(df2.values[0, 2], -0.2182178902359925)
self.assertAlmostEquals(df2.values[1, 2], 1.0)
self.assertAlmostEquals(df2.values[2, 2], 1.0)

def test_correct_batch_effects_no_tpm(self):
# small means standard deviation
df = pd.DataFrame([[0.1, 0.1, 0.1], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]])
df2 = miner.correct_batch_effects(df, False)
self.assertEquals((3, 3), df2.shape)
for i in range(3):
for j in range(3):
self.assertAlmostEquals(df2.values[i, j], -0.8164965809277261)
def test_correct_batch_effects_no_tpm():
    """Check correct_batch_effects on a constant 3x3 frame of 0.1.

    Called with do_preprocess_tpm=False; the result must keep its (3, 3)
    shape and every entry must lie within EPS of -0.8164965809277261.
    """
    constant = pd.DataFrame([[0.1] * 3 for _ in range(3)])
    result = miner.correct_batch_effects(constant, False)
    assert (3, 3) == result.shape

    expected = -0.8164965809277261
    for row in result.values:
        for value in row:
            assert abs(value - expected) < EPS

def test_preprocess_main_simple(self):
exp, conv_table = miner.preprocess('testdata/exp_data-001.csv', 'testdata/conv_table-001.tsv')
self.assertEquals((10, 3), exp.shape)
for i in range(3):
for j in range(3):
self.assertAlmostEquals(exp.values[i, j], -0.8164965809277261)
"""

def test_dummy():
pass
"""
def test_preprocess_main_simple():
exp, conv_table = miner.preprocess('testdata/exp_data-001.csv', 'testdata/conv_table-001.tsv')
assert (10, 3) == exp.shape
for i in range(3):
for j in range(3):
assert abs(exp.values[i, j] - (-0.8164965809277261)) < EPS
"""

def test_has_testdir():
    """Check that 'miner_mindata' exists relative to the working directory."""
    test_dir = 'miner_mindata'
    assert os.path.exists(test_dir)

0 comments on commit ed9ef03

Please sign in to comment.