From 85de9640c44e1a7460ee078c9328e4a25f397397 Mon Sep 17 00:00:00 2001 From: weiju Date: Wed, 29 May 2024 12:54:22 -0700 Subject: [PATCH] unit test for get_axes() * refactorings and tests for get_axes() * pass through random_state * separate data set for cluster() function --- miner/miner.py | 25 ++++++++++++++----------- test/mechinf_test.py | 22 +++++++++++++++++++++- testdata/init_clusters-001.json | 1 + testdata/ref_axes-000.json | 1 + 4 files changed, 37 insertions(+), 12 deletions(-) create mode 100644 testdata/init_clusters-001.json create mode 100644 testdata/ref_axes-000.json diff --git a/miner/miner.py b/miner/miner.py index a15cdbc..bd7e53d 100644 --- a/miner/miner.py +++ b/miner/miner.py @@ -685,7 +685,7 @@ def pearson_array(array, vector): return np.sum(product_array,axis=1)/float(product_array.shape[1]-1) -def getAxes(clusters, expressionData): +def get_axes(clusters, expressionData, random_state): axes = {} for key in list(clusters.keys()): genes = clusters[key] @@ -959,7 +959,7 @@ def combineClusters(axes,clusters,threshold=0.925): return revisedClusters -def reconstruction(decomposedList,expressionData,threshold=0.925): +def reconstruction(decomposedList,expressionData, random_state, threshold=0.925): if len(decomposedList) == 0: return decomposedList @@ -968,16 +968,17 @@ def reconstruction(decomposedList,expressionData,threshold=0.925): return decomposedList clusters = {i:decomposedList[i] for i in range(len(decomposedList))} - axes = getAxes(clusters,expressionData) + axes = get_axes(clusters, expressionData, random_state) recombine = combineClusters(axes,clusters,threshold) return recombine -def recursive_alignment(geneset,expressionData,minNumberGenes=6,pct_threshold=80): +def recursive_alignment(geneset,expressionData,minNumberGenes=6, + pct_threshold=80, random_state=12): recDecomp = recursive_decomposition(geneset,expressionData,minNumberGenes,pct_threshold) if len(recDecomp) == 0: return [] - reconstructed = reconstruction(recDecomp,expressionData) + reconstructed = reconstruction(recDecomp,expressionData, random_state) reconstructedList = [reconstructed[i] for i in list(reconstructed.keys()) if len(reconstructed[i])>minNumberGenes] reconstructedList.sort(key = lambda s: -len(s)) return reconstructedList @@ -1020,7 +1021,7 @@ def cluster(expressionData, minNumberGenes=6, minNumberOverExpSamples=4, maxSamp cluster2 = np.array(df.index[np.where(pearson < lowpass)[0]]) for clst in [cluster1, cluster2]: - pdc = recursive_alignment(clst, expressionData=df, minNumberGenes=minNumberGenes, pct_threshold=pct_threshold) + pdc = recursive_alignment(clst, expressionData=df, minNumberGenes=minNumberGenes, pct_threshold=pct_threshold, random_state=random_state) if len(pdc) == 0: continue elif len(pdc) == 1: @@ -1193,19 +1194,21 @@ def membershipToIncidence(membershipDictionary,expressionData): return incidence -def processCoexpressionLists(lists,expressionData,threshold=0.925): - reconstructed = reconstruction(lists,expressionData,threshold) +def processCoexpressionLists(lists,expressionData, random_state, threshold=0.925): + reconstructed = reconstruction(lists,expressionData, random_state, threshold) reconstructedList = [reconstructed[i] for i in reconstructed.keys()] reconstructedList.sort(key = lambda s: -len(s)) return reconstructedList -def reviseInitialClusters(clusterList,expressionData,threshold=0.925): - coexpressionLists = processCoexpressionLists(clusterList,expressionData,threshold) + +def reviseInitialClusters(clusterList, expressionData, random_state=12, threshold=0.925): + coexpressionLists = processCoexpressionLists(clusterList, expressionData, random_state, threshold) coexpressionLists.sort(key= lambda s: -len(s)) for iteration in range(5): previousLength = len(coexpressionLists) - coexpressionLists = processCoexpressionLists(coexpressionLists,expressionData,threshold) + coexpressionLists = processCoexpressionLists(coexpressionLists, expressionData, + random_state, threshold) newLength = len(coexpressionLists) if newLength == previousLength: break diff --git a/test/mechinf_test.py b/test/mechinf_test.py index 4af7213..73c3918 100644 --- a/test/mechinf_test.py +++ b/test/mechinf_test.py @@ -15,7 +15,7 @@ def test_cluster(): exp = pd.read_csv('testdata/exp_data_preprocessed-002.csv', header=0, index_col=0) - with open("testdata/init_clusters-002.json") as infile: + with open("testdata/init_clusters-001.json") as infile: ref_init_clusters = json.load(infile) init_clusters = miner.cluster(exp, minNumberGenes=6, @@ -23,11 +23,31 @@ def test_cluster(): maxSamplesExcluded=0.5, random_state=12, overExpressionThreshold=80) + #with open("init_clusters-002.json", "w") as outfile: + # json.dump(init_clusters, outfile) + for cluster in init_clusters: assert(len(cluster) >= 6) #assert(len(ref_init_clusters) == len(init_clusters)) +def test_get_axes(): + cluster = [] + with open("testdata/cluster1-00.txt") as infile: + for line in infile: + cluster.append(line.strip()) + exp = pd.read_csv('testdata/exp_data_preprocessed-002.csv', header=0, + index_col=0) + with open("testdata/ref_axes-000.json") as infile: + ref_axes = json.load(infile) + + axes = miner.get_axes({"1": cluster}, exp, random_state=12) + json_axes = {} + for key, arr in axes.items(): + json_axes[key] = list(arr) + assert(ref_axes == json_axes) + + def test_recursive_decomposition(): cluster = [] with open("testdata/cluster1-00.txt") as infile: diff --git a/testdata/init_clusters-001.json b/testdata/init_clusters-001.json new file mode 100644 index 0000000..71148b2 --- /dev/null +++ b/testdata/init_clusters-001.json @@ -0,0 +1 @@ +[["ENSG00000076928", "ENSG00000187244", "ENSG00000159374", "ENSG00000143333", "ENSG00000116863", "ENSG00000008441", "ENSG00000196923", "ENSG00000247596", "ENSG00000055070", "ENSG00000042753"], ["ENSG00000006756", "ENSG00000040487", "ENSG00000101605", "ENSG00000175556", "ENSG00000179820", "ENSG00000198799", "ENSG00000166484", "ENSG00000019995", "ENSG00000092148"], ["ENSG00000111644", "ENSG00000187961", "ENSG00000122912", "ENSG00000198556", "ENSG00000006282", "ENSG00000012171", "ENSG00000138111", "ENSG00000099866", "ENSG00000061656"], ["ENSG00000175467", "ENSG00000163171", "ENSG00000134184", "ENSG00000114503", "ENSG00000083635", "ENSG00000167004", "ENSG00000029363", "ENSG00000067596", "ENSG00000126883"], ["ENSG00000124588", "ENSG00000179826", "ENSG00000164751", "ENSG00000243958", "ENSG00000162598", "ENSG00000130513", "ENSG00000138463", "ENSG00000162594", "ENSG00000281618"], ["ENSG00000108639", "ENSG00000154065", "ENSG00000206113", "ENSG00000135912", "ENSG00000140521", "ENSG00000108578", "ENSG00000187840", "ENSG00000172828", "ENSG00000145220"], ["ENSG00000092140", "ENSG00000183579", "ENSG00000084652", "ENSG00000100395", "ENSG00000113648", "ENSG00000107833", "ENSG00000197771", "ENSG00000112715", "ENSG00000069998"], ["ENSG00000134028", "ENSG00000204923", "ENSG00000135631", "ENSG00000141068", "ENSG00000139428", "ENSG00000118640", "ENSG00000171045", "ENSG00000152056", "ENSG00000103642"], ["ENSG00000012174", "ENSG00000072506", "ENSG00000129562", "ENSG00000170522", "ENSG00000163170", "ENSG00000175137", "ENSG00000153029", "ENSG00000117262"], ["ENSG00000073578", "ENSG00000162148", "ENSG00000164654", "ENSG00000177042", "ENSG00000125046", "ENSG00000140522", "ENSG00000153790", "ENSG00000151292"], ["ENSG00000165309", "ENSG00000176927", "ENSG00000056586", "ENSG00000077458", "ENSG00000009830", "ENSG00000135913", "ENSG00000156313", "ENSG00000133131"], ["ENSG00000137145", "ENSG00000012174", "ENSG00000101935", "ENSG00000162599", "ENSG00000165704", "ENSG00000176399", "ENSG00000283620", "ENSG00000109572"], ["ENSG00000262795", "ENSG00000107651", "ENSG00000141068", "ENSG00000113389", "ENSG00000073578", "ENSG00000124214", "ENSG00000105856", "ENSG00000114030"], ["ENSG00000166855", "ENSG00000172671", "ENSG00000176920", "ENSG00000089009", "ENSG00000155506", "ENSG00000149187", "ENSG00000104112", "ENSG00000161021"], ["ENSG00000197776", "ENSG00000178922", "ENSG00000147316", "ENSG00000138468", "ENSG00000100393", "ENSG00000263874", "ENSG00000134186", "ENSG00000130518"], ["ENSG00000188596", "ENSG00000169203", "ENSG00000183571", "ENSG00000184702", "ENSG00000138468", "ENSG00000151923", "ENSG00000156502", "ENSG00000167103"], ["ENSG00000164758", "ENSG00000020426", "ENSG00000105851", "ENSG00000257341", "ENSG00000005471", "ENSG00000181218", "ENSG00000163466", "ENSG00000135919"], ["ENSG00000132744", "ENSG00000163468", "ENSG00000175550", "ENSG00000198795", "ENSG00000072501", "ENSG00000182010", "ENSG00000135776", "ENSG00000183864"], ["ENSG00000183570", "ENSG00000132185", "ENSG00000172164", "ENSG00000142765", "ENSG00000182508", "ENSG00000163376", "ENSG00000198793", "ENSG00000066855"], ["ENSG00000130517", "ENSG00000155330", "ENSG00000188021", "ENSG00000100483", "ENSG00000109220", "ENSG00000124214", "ENSG00000111647"], ["ENSG00000154889", "ENSG00000204923", "ENSG00000149716", "ENSG00000250565", "ENSG00000134278", "ENSG00000266173", "ENSG00000135632"], ["ENSG00000088205", "ENSG00000164414", "ENSG00000179826", "ENSG00000267228", "ENSG00000135775", "ENSG00000133138", "ENSG00000076321"], ["ENSG00000101935", "ENSG00000174780", "ENSG00000198554", "ENSG00000166851", "ENSG00000281306", "ENSG00000145220", "ENSG00000058085"], ["ENSG00000180667", "ENSG00000137309", "ENSG00000101871", "ENSG00000123933", "ENSG00000064726", "ENSG00000111832", "ENSG00000167470"], ["ENSG00000149792", "ENSG00000168763", "ENSG00000169989", "ENSG00000156509", "ENSG00000084754", "ENSG00000185158", "ENSG00000167536"], ["ENSG00000147883", "ENSG00000196550", "ENSG00000163374", "ENSG00000165304", "ENSG00000236843", "ENSG00000222009", "ENSG00000101871"], ["ENSG00000120075", "ENSG00000100399", "ENSG00000278437", "ENSG00000155719", "ENSG00000236177", "ENSG00000188026", "ENSG00000159450"], ["ENSG00000089009", "ENSG00000106633", "ENSG00000225526", "ENSG00000155714", "ENSG00000184787", "ENSG00000151150", "ENSG00000104112"], ["ENSG00000104808", "ENSG00000182013", "ENSG00000147883", "ENSG00000105401", "ENSG00000164756", "ENSG00000130758", "ENSG00000198792"], ["ENSG00000173193", "ENSG00000163378", "ENSG00000164542", "ENSG00000160710", "ENSG00000247595", "ENSG00000143641", "ENSG00000148225"], ["ENSG00000154262", "ENSG00000037749", "ENSG00000154263", "ENSG00000154265", "ENSG00000161021", "ENSG00000188596", "ENSG00000168297"], ["ENSG00000172785", "ENSG00000120498", "ENSG00000112695", "ENSG00000255767", "ENSG00000103037", "ENSG00000169989", "ENSG00000154269"], ["ENSG00000189283", "ENSG00000213401", "ENSG00000163467", "ENSG00000217442", "ENSG00000130751", "ENSG00000160712", "ENSG00000187024"], ["ENSG00000138942", "ENSG00000188419", "ENSG00000175550", "ENSG00000072501", "ENSG00000134183", "ENSG00000198246", "ENSG00000163463"], ["ENSG00000116478", "ENSG00000276126", "ENSG00000182010", "ENSG00000204118", "ENSG00000106404", "ENSG00000213401", "ENSG00000183248"], ["ENSG00000176928", "ENSG00000144724", "ENSG00000177045", "ENSG00000188620", "ENSG00000178927", "ENSG00000124215", "ENSG00000011478"], ["ENSG00000126882", "ENSG00000140526", "ENSG00000240021", "ENSG00000086015", "ENSG00000148229", "ENSG00000108576", "ENSG00000183354"], ["ENSG00000099864", "ENSG00000172167", "ENSG00000167476", "ENSG00000026652", "ENSG00000164418", "ENSG00000105855", "ENSG00000112531"], ["ENSG00000180660", "ENSG00000198794", "ENSG00000184012", "ENSG00000105402", "ENSG00000183578", "ENSG00000128849", "ENSG00000278545"]] \ No newline at end of file diff --git a/testdata/ref_axes-000.json b/testdata/ref_axes-000.json new file mode 100644 index 0000000..1ed0919 --- /dev/null +++ b/testdata/ref_axes-000.json @@ -0,0 +1 @@ +{"1": [-3.2053257675159794, 0.7336668967026231, 10.015249539735647, -5.0157843801320166, 0.1244238925363025, -4.566642689742843, -6.028487634563582, -2.6422967842808855, -2.239835807761678, 1.2655478068534982, -4.211017432635741, -0.06750486152444292, 1.7540481217371204, -0.44550924423364835, -4.626940669175253, -5.182963010663376, 1.3226698164424027, 4.922411509397451, -4.885202241021158, 0.10525237041097457, -1.5431771377769472, 4.934620252680376, 8.16253928472123, -5.07468280252522, 6.829680037240505, -0.3491153464575949, -0.9475711407150105, 5.279577667050634, -3.611202917336161, 1.8087327684635888, 8.585837685613258, 1.584075541276682, -0.2997117040966497, 0.980476986775055, 5.06611161477977, -1.4452838354211912, -2.7739358915838768, 1.171808594937817, 10.376468379412415, -4.563303351956499, -0.5674493289612625, -0.6419440162633384, -0.48251381266449095, 2.7894684583848774, 0.8671551269244898, -2.6772737717059147, -4.848581084979227, -1.248192596725568, -4.488373089657147]}