diff --git a/cLoops/.cModel.py.swp b/cLoops/.cModel.py.swp
deleted file mode 100644
index cefd1d4..0000000
Binary files a/cLoops/.cModel.py.swp and /dev/null differ
diff --git a/cLoops/cDBSCAN.py b/cLoops/cDBSCAN.py
index c04ac46..328ae03 100644
--- a/cLoops/cDBSCAN.py
+++ b/cLoops/cDBSCAN.py
@@ -1,15 +1,13 @@
 #--coding:utf-8 --
 """
 """
+import bisect
 
 
 class cDBSCAN:
     """
-    The major class of the cDBSCAN algorithm, belong to CAO Yaqiang, CHEN Xingwei & AI Daosheng.
-    Algorithm complexity analysis is as following:
-    buildGrids: 2N,N for find the minal X and Y, N for coordinates convertion.
-    removeNoiseGrids:
-
+    The major class of the cDBSCAN algorithm, belongs to CAO Yaqiang, CHEN Xingwei, AI Daosheng and CHEN Zhaoxiong.
+    2018-02-02: all of CHEN Zhaoxiong's improvements; results differ very little from the previous version.
     """
 
     def __init__(self, mat, eps, minPts):
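
For orientation before the line-level changes: the class is driven entirely from __init__ (build the grid, query it, keep only the labels). A minimal usage sketch; the [pointId, X, Y] matrix layout, the eps/minPts arguments and the labels dict come from this diff, while the toy coordinates and the printed result are illustrative:

    from cLoops.cDBSCAN import cDBSCAN

    # each row of mat is [pointId, X, Y]; eps is the city-block radius,
    # minPts is the density cutoff
    mat = [[0, 10, 20], [1, 12, 21], [2, 11, 19], [3, 500, 800]]
    c = cDBSCAN(mat, eps=5, minPts=3)
    # labels maps pointId -> clusterId; noise points are simply absent
    print(c.labels)  # -> {0: 0, 1: 0, 2: 0}; point 3 is dropped as noise
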
- """ - #: noise cells without neighbors - tode = set() - #: noise cells with neighbors - tode2 = set() - for cell in self.Gs.keys(): - if len(self.Gs2[cell]) < self.minPts: - tode2.add(cell) - #KNN to noise cells with neighbors - for cell in tode2: - cells = self.getNearbyGrids(cell) - ncells = set(cells) & tode2 - #all neighbor cells are noise - if len(cells) == len(ncells): - tode.add(cell) - for cell in tode: - for p in self.Gs[cell]: - del self.ps[p] - del self.Gs[cell] - - def callClusters(self): - """ - Algorithm 4: Do DBSCAN clustering by go through all points in the sets. - """ - #: clustering id, noise is -2 and unclassified point is -1. + #modification + #rotate coordinate system by 45 degree + x = d[1] - d[2] + y = d[1] + d[2] + nx = int(x / self.cw) + 1 + ny = int(y / self.cw) + 1 + Grid.setdefault((nx, ny), []) + #gird types {0: sparse cell, 1: crowded cell, + #2: core cell (crowded cell assigned to a cluster) + #-1: noise cell or edge cell} + Grid[(nx, ny)].append([x, y, d[0], -1]) + self.Grid = Grid + for index, cell in Grid.iteritems(): + Gorder['x'][index] = Grid[index] + nearpnum = len(cell) + if nearpnum >= self.minPts: + #crowded cell + Gtype[index] = 1 + continue + for near_index in self.getNearbyCells(index): + nearpnum += len(Grid[near_index]) + if nearpnum < self.minPts: + #noise cell or edge cell + Gtype[index] = -1 + else: + #sparse cell + Gtype[index] = 0 + #get pt orders + noisecell = [] + for index in Grid: + noiseflag = all([Gtype[near_index] == -1 for near_index in self.getNearbyCells(index)]) + if Gtype[index] == -1 and noiseflag: + #check noise cell + #noise cell should not waste time sorting + noisecell.append(index) + continue + Gorder['x'][index].sort(key = lambda x: x[0]) + Gorder['y'][index] = sorted(Grid[index], key = lambda x: x[1]) + #remove noise cell + for index in noisecell: + del Grid[index] + del Gtype[index] + self.Grid = Grid + self.Gtype = Gtype + self.Gorder = Gorder + + + def queryGrid(self): clusterId = 0 - for key in self.ps: - if self.ps[key][-1] == -1: - if self.expandCluster(key, clusterId): - clusterId += 1 - #remove the noise and unclassified points - labels = {} - cs = {} - for p in self.ps.keys(): - c = self.ps[p][-1] - if c == -2: + clusters = {} + for index, cell in self.Grid.iteritems(): + #omit edge cells and core cells + if self.Gtype[index] in [-1, 2]: continue - labels[p] = c - if c not in cs: - cs[c] = [] - cs[c].append(p) - for key in cs.keys(): - if len(cs[key]) < self.minPts: - for p in cs[key]: - del labels[p] - self.labels = labels - - def expandCluster(self, pointKey, clusterId): - """ - Search connection for given point to others. 
-
-        @param pointKey: the key in self.dataPoints
-        @type pointKey:
-
-        @param clusterId: the cluster id for the current
-        @type clusterId: int
+            border_pts = {}
+            #start cell check
+            clusters[clusterId] = []
+            if self.Gtype[index] == 1:
+                #a crowded cell is always a core cell of some cluster
+                border_pts[index] = self.Grid[index]
+            else:
+                #sparse cell: first-round check
+                #omit pts already assigned to another cluster
+                pts = [x for x in cell if x[-1] == -1]
+                adjacent_pts, flag = self.getSparseCellNeighbor(pts, index)
+                if flag:
+                    #at least one pt in the cell meets the requirement
+                    for p in pts:
+                        p[-1] = clusterId
+                    clusters[clusterId].extend(pts)
+                    border_pts = adjacent_pts
+                else:
+                    #skip this cell if no core pt found
+                    continue
+            #Breadth First Search over the border cells
+            while len(border_pts) > 0:
+                nindex = sorted(border_pts.keys())[0]
+                #print nindex
+                #print sorted(border_pts.keys())
+                ncell = self.Grid[nindex]
+                if self.Gtype[nindex] == 1:
+                    #crowded cell process
+                    self.Gtype[nindex] = 2
+                    for p in ncell:
+                        p[-1] = clusterId
+                    clusters[clusterId].extend(ncell)
+                    self.updatePtDict(border_pts, self.getCrowdedCellNeighbor(nindex))
+                elif self.Gtype[nindex] == 0:
+                    #sparse cell process
+                    adjacent_pts, flag = self.getSparseCellNeighbor(border_pts[nindex], nindex)
+                    if flag:
+                        #at least one pt in the cell meets the requirement
+                        for p in ncell:
+                            if p[-1] == -1:
+                                p[-1] = clusterId
+                                clusters[clusterId].append(p)
+                        self.updatePtDict(border_pts, adjacent_pts)
+                    else:
+                        #all queried pts are border pts of the cluster
+                        for p in border_pts[nindex]:
+                            p[-1] = clusterId
+                        clusters[clusterId].extend(border_pts[nindex])
+                else:
+                    #edge cell process
+                    #can't expand to nearby cells
+                    for p in border_pts[nindex]:
+                        p[-1] = clusterId
+                    clusters[clusterId].extend(border_pts[nindex])
+                del border_pts[nindex]
 
-        @return: bool
-        """
-        seeds = self.regionQuery(pointKey)
-        if len(seeds) < self.minPts:
-            self.ps[pointKey][-1] = -2
-            return False
-        else:
-            for key in seeds:
-                self.ps[key][-1] = clusterId
-            while len(seeds) > 0:
-                currentP = seeds[0]
-                result = self.regionQuery(currentP)
-                if len(result) >= self.minPts:
-                    for key in result:
-                        if self.ps[key][-1] in [-1, -2]:
-                            if self.ps[key][-1] == -1:
-                                seeds.append(key)
-                            self.ps[key][-1] = clusterId
-                del (seeds[0])
-            return True
-
-    def regionQuery(self, pointKey):
-        """
-        Find the related points to the queried point, city block distance is used.
-
-        @param pointKey: the key in self.dataPoints
-        @type pointKey:
+            #release pts if the cluster size is too small
+            if len(clusters[clusterId]) < self.minPts:
+                for p in clusters[clusterId]:
+                    p[-1] = -1
+                del clusters[clusterId]
+            else:
+                clusterId += 1
+        self.labels = {}
+        #f = open("cD_v4.txt", 'w')
+        for cid, cluster_pts in clusters.iteritems():
+            #f.write("\t".join(map(lambda x: str(x[-2]), sorted(clusters[cid], key=lambda x: x[-2]))) + "\n")
+            for p in cluster_pts:
+                self.labels[p[-2]] = cid
+        #f.close()
+
+
+    def getCrowdedCellNeighbor(self, index):
+        adj_pts = {}
+        for axis in ['x', 'y']:
+            for delta in [-1, 1]:
+                if axis == 'x':
+                    newindex = (index[0] + delta, index[1])
+                else:
+                    newindex = (index[0], index[1] + delta)
+                if newindex not in self.Grid or self.Gtype[newindex] == 2:
+                    #a crowded cell can't expand back into a core cell
+                    continue
+                if delta == -1:
+                    edgept = self.Gorder[axis][index][0]
+                else:
+                    edgept = self.Gorder[axis][index][-1]
+                newresult = [x for x in self.binSearchAdjPt(newindex, edgept, axis, delta)
+                             if x[-1] == -1]
+                if newresult: adj_pts[newindex] = newresult
-        @return: list
-        """
-        p = self.ps[pointKey]
-        x = p[2]
-        y = p[3]
-        #scan square and get nearby points.
-        result = [pointKey]
-        for q in self.Gs2[(x, y)]:
-            if q == pointKey:
+        edge_pts = self.findEdgePts(index)
+        for delta in [(-1, -1), (-1, 1), (1, -1), (1, 1)]:
+            newindex = (index[0] + delta[0], index[1] + delta[1])
+            if newindex not in self.Grid or self.Gtype[newindex] == 2:
+                #a crowded cell can't expand back into a core cell
                 continue
-            if self.getDist(pointKey, q) <= self.eps:
-                result.append(q)
-        return result
+            for p in edge_pts[delta]:
+                newresult = self.overlapPtList(
+                    self.binSearchAdjPt(newindex, p, 'x', delta[0]),
+                    self.binSearchAdjPt(newindex, p, 'y', delta[1]))
+                if self.Gtype[newindex] == 1 and len(newresult) > 0:
+                    #in a crowded cell one hit means the whole cell is reachable
+                    adj_pts[newindex] = self.Grid[newindex]
+                    break
+                if newindex in adj_pts:
+                    pre_ids = set([x[-2] for x in adj_pts[newindex]])
+                    #for x in newresult:
+                    #    if x[-2] not in pre_ids and x[-1] == -1:
+                    #        adj_pts[newindex].append(x)
+                    adj_pts[newindex].extend([x for x in newresult
+                                              if x[-2] not in pre_ids and x[-1] == -1])
+                else:
+                    newresult = [x for x in newresult if x[-1] == -1]
+                    if newresult: adj_pts[newindex] = newresult
+        return adj_pts
+
+    def findEdgePts(self, index):
+        #find the edge pts of a crowded cell
+        order = {'x':self.Gorder['x'][index], 'y':self.Gorder['y'][index]}
+        upleft = [order['x'][0]]
+        downleft = [order['x'][0]]
+        upflag = True
+        downflag = True
+        for i in order['x'][1:]:
+            if upflag:
+                j = upleft[-1]
+                if i[1] > j[1]:
+                    if i[0] == j[0]:
+                        upleft[-1] = i
+                    else:
+                        upleft.append(i)
+                if i[1] == order['y'][-1][1]:
+                    upflag = False
+            if downflag:
+                j = downleft[-1]
+                if i[1] < j[1]:
+                    if i[0] == j[0]:
+                        downleft[-1] = i
+                    else:
+                        downleft.append(i)
+                if i[1] == order['y'][0][1]:
+                    downflag = False
+            if not (upflag or downflag):
+                break
+        upright = [order['x'][-1]]
+        downright = [order['x'][-1]]
+        upflag = True
+        downflag = True
+        for i in order['x'][-1::-1]:
+            if upflag:
+                j = upright[-1]
+                if i[1] > j[1]:
+                    if i[0] == j[0]:
+                        upright[-1] = i
+                    else:
+                        upright.append(i)
+                if i[1] == order['y'][-1][1]:
+                    upflag = False
+            if downflag:
+                j = downright[-1]
+                if i[1] < j[1]:
+                    if i[0] == j[0]:
+                        downright[-1] = i
+                    else:
+                        downright.append(i)
+                if i[1] == order['y'][0][1]:
+                    downflag = False
+            if not (upflag or downflag):
+                break
+        return {(-1, -1):downleft, (-1, 1):upleft, (1, -1):downright, (1, 1):upright}
+
+    def getSparseCellNeighbor(self, seedpts, index):
+        cell_pt_num = len(self.Grid[index])
+        adj_pts_order = {}
+        totalresult = {}
+        #work on a copy of the seed pts so the caller's list stays intact
+        #pts = copy.copy(seedpts)
+        pts = seedpts[:]
+        flag = False
+        while pts:
+            p = pts.pop()
+            p_adjacent = {}
+            for delta in [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]:
+                newindex = (index[0] + delta[0], index[1] + delta[1])
+                if newindex not in self.Grid:
+                    continue
+                if delta[1] == 0:
+                    #only need to compare the x axis
+                    p_adjacent[newindex] = self.binSearchAdjPt(newindex, p, 'x', delta[0])
+                elif delta[0] == 0:
+                    #only need to compare the y axis
+                    p_adjacent[newindex] = self.binSearchAdjPt(newindex, p, 'y', delta[1])
+                else:
+                    #find the overlap of both axes
+                    p_adjacent[newindex] = self.overlapPtList(
+                        self.binSearchAdjPt(newindex, p, 'x', delta[0]),
+                        self.binSearchAdjPt(newindex, p, 'y', delta[1]))
+            n = sum([len(cellpts) for cellpts in p_adjacent.values()])
+            if n + cell_pt_num >= self.minPts:
+                #meets the minPts requirement
+                self.updatePtDict(totalresult, p_adjacent, checkPt=True)
+                if not flag:
+                    #if any pt meets the requirement, other unassigned pts not
+                    #already in seedpts should be added to the check list
+                    seedPtIds = set([x[-2] for x in seedpts])
+                    pts.extend([x for x in self.Grid[index]
+                                if x[-1] == -1 and x[-2] not in seedPtIds])
+                    flag = True
+        return totalresult, flag
+
+    def updatePtDict(self, dictA, dictB, checkPt=False):
+        #update dictA by adding the items of dictB
+        for index, pts in dictB.iteritems():
+            if checkPt:
+                pts = [x for x in pts if x[-1] == -1]
+            if pts:
+                if index in dictA:
+                    pre_ids = [x[-2] for x in dictA[index]]
+                    #for p in pts:
+                    #    if p[-2] not in pre_ids:
+                    #        dictA[index].append(p)
+                    dictA[index].extend([x for x in pts
+                                         if x[-2] not in pre_ids])
+                else:
+                    dictA[index] = pts
+
+    def binSearchAdjPt(self, index, q_pt, axis, delta):
+        #binary search along one axis for the pts of a cell
+        #that lie within eps of q_pt
+        pts = self.Gorder[axis][index]
+        if delta == 0:
+            return pts
+        axispos = self.axisindex[axis]
+        posarray = [p[axispos] for p in pts]
+        xpos = q_pt[axispos] + self.eps * delta
+        if delta == 1:
+            index = bisect.bisect_right(posarray, xpos)
+            return pts[0:index]
+        elif delta == -1:
+            index = bisect.bisect_left(posarray, xpos)
+            return pts[index:]
+
+    def overlapPtList(self, ptlistA, ptlistB):
+        ptdictA = {x[-2]: x for x in ptlistA}
+        newkeys = set(ptdictA.keys()) & set([x[-2] for x in ptlistB])
+        return [ptdictA[id] for id in newkeys]
+
+
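
A note on the geometry that buildGrid and getNearbyCells rely on: mapping (X, Y) to (X - Y, X + Y) rotates the plane by 45 degrees (and scales it by sqrt(2)), which turns every city-block eps-ball into an axis-aligned square. All eps-neighbors of a point therefore sit in the 3x3 block of cells scanned by getNearbyCells, and two points sharing one cell of width eps are always mutual neighbors, which is why a crowded cell (>= minPts points) can be claimed by a cluster wholesale. A self-contained check of the identity; the helper names are illustrative, not part of the module:

    def rotate(x, y):
        # city-block distance in the original frame equals
        # Chebyshev (max-axis) distance in the rotated frame
        return x - y, x + y

    def cityblock(p, q):
        return abs(p[0] - q[0]) + abs(p[1] - q[1])

    def chebyshev(p, q):
        return max(abs(p[0] - q[0]), abs(p[1] - q[1]))

    p, q = (3, 7), (6, 2)
    assert cityblock(p, q) == chebyshev(rotate(*p), rotate(*q)) == 8
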
diff --git a/cLoops/cModel.py b/cLoops/cModel.py
index d252d97..0dcc61f 100644
--- a/cLoops/cModel.py
+++ b/cLoops/cModel.py
@@ -2,13 +2,7 @@
 #--coding:utf-8 --
 """
 Statistical significance is tested for every chromosome using the local permuted background.
-2017-03-24: try except added for estimating reads in a region just in case.
-2017-06-07: fix small bug as all self-ligation clusters due to distance cutoff ; also change the PETs of a chromosome to the filtered numbers according to distance cutoff.
-2017-06-13: modified ES caculation in case the mean ES is zero, especially for larger window size shifting
-2017-06-15: modified combined p-values as geometric mean
-2017-06-20: modified the hypergeometric test rab to rab-1, all as following to avoid p-values too small; if no noise nearby, the ES is set to rab;combined-p-value is cancled; pending change N to N' share the same distance distribution,this will lead to too much burden on computation.
-2017-07-19: modified HiChIP significant cutoff, for all datasets, eps=5000,minPts=20 without twice option is a good parameter.
-2017-08-02: modified genomecoverage model, not using HTSeq now to save memories, also, much faster than previouse method
+2018-02-01: improved data structure for genome coverage; significance calling is much faster and uses less memory than the previous version; loop boundaries changed slightly.
 """
 __date__ = "2017-03-15"
 __modified__ = ""
@@ -27,6 +21,25 @@ from cLoops.utils import cFlush
 
 
+def getCorLink(cs):
+    """
+    @param cs: [1,2,3,4], a list of the x or y coordinates
+    @rtype: dict; key is a coordinate, value is the closest next coordinate and the indices of the points at this coordinate
+    """
+    ts = {}
+    for i,c in enumerate(cs):
+        if c not in ts:
+            ts[c] = []
+        ts[c].append(i)
+    keys = sorted(ts.keys())
+    for i in xrange(len(keys)-1):
+        ni = keys[i]
+        nj = keys[i+1]
+        ts[ni] = {"next":nj,"points":ts[ni]}
+    ts[nj] = {"next":None,"points":ts[nj]}
+    return ts
+
+
 def getGenomeCoverage(f, cut=0):
     """
     Build the genomic model for random access. Could use a lot of memory.
@@ -37,35 +50,35 @@ def getGenomeCoverage(f, cut=0):
     j = mat.shape[0]
     if j == 0:
         return None, 0
-    m = max([np.max(mat[:, 1]), np.max(mat[:, 2])])
-    model = [False] * (m + 1000000)  #+10 just in case boundary escape
-    for t in mat:
-        if model[t[1]] == False:
-            model[t[1]] = []
-        if model[t[2]] == False:
-            model[t[2]] = []
-        model[t[1]].append(t[0])
-        model[t[2]].append(0 - t[0])
-    return model, j * 2
+    xs = getCorLink(mat[:,1])
+    ys = getCorLink(mat[:,2])
+    return [xs,ys],j*2
 
 
-def getCounts(iv, model):
-    """
-    Get reads ids for a region.
-    """
-    rs = []
-    for i in xrange(iv[0], iv[1]):
-        if model[i] != False:
-            rs.extend(model[i])
-    rs = list(np.abs(list(rs)))
-    return rs
+def getCounts(iv,ts):
+    ps = []
+    pos = None
+    for i in xrange(iv[0],iv[1]):
+        if i in ts:
+            pos = i
+            break
+    while pos <= iv[1] and pos!=None:
+        ps.extend(ts[pos]["points"])
+        pos = ts[pos]["next"]
+    return set(ps)
 
 
-def getPETsforRegions(iva, ivb, model):
-    ra = getCounts(iva, model)
-    rb = getCounts(ivb, model)
-    rab = set(ra).intersection(set(rb))
-    return len(ra), len(rb), len(rab)
+
+def getPETsforRegions(iva,ivb,model):
+    raSource = getCounts(iva,model[0])
+    raTarget = getCounts(iva,model[1])
+    rbSource = getCounts(ivb,model[0])
+    rbTarget = getCounts(ivb,model[1])
+    ra = len(raSource.union(raTarget))
+    rb = len(rbSource.union(rbTarget))
+    rab = len(raSource.intersection(rbTarget))
+    return ra,rb,rab
+
 
 
 def getNearbyPairRegions(iva, ivb, win=6):
@@ -105,28 +118,28 @@ def getMultiplePsFdr(iva, ivb, model, N, win=6):
     ivas, ivbs = getNearbyPairRegions(iva, ivb, win=win)
     hyps, rabs, nbps = [], [], []
     for na in ivas:
-        try:
-            nra = getCounts(na, model)
-        except:
-            continue
+        nraSource = getCounts(na,model[0])
+        nraTarget = getCounts(na,model[1])
+        nra = nraSource.union(nraTarget)
         nralen = float(len(nra))
-        if nralen == 0:
+        if nralen < 1:
            continue
        for nb in ivbs:
-            try:
-                nrb = getCounts(nb, model)
-            except:
-                continue
-            if len(nrb) == 0:
+            nrbSource = getCounts(nb,model[0])
+            nrbTarget = getCounts(nb,model[1])
+            nrb = nrbSource.union(nrbTarget)
+            nrblen = len(nrb)
+            if nrblen < 1:
                 continue
-            nrab = len(set(nra).intersection(set(nrb)))
+            nrab = float(len(nra.intersection(nrb)))
+            #nrab = float(len(nraSource.intersection(nrbTarget)))
             #collect the value for the Poisson test
             rabs.append(nrab)
             #collect the nearby hypergeometric test result
-            nhyp = hypergeom.sf(nrab - 1.0, N, nralen, len(nrb))
+            nhyp = hypergeom.sf(nrab - 1.0, N, nralen, nrblen)
             hyps.append(nhyp)
             #collect the probability for the following binomial test
-            den = nrab / (nralen * len(nrb))
+            den = nrab / (nralen * nrblen)
             nbps.append(den)
     if len(rabs) == 0:
         return ra, rb, rab, np.inf, 0.0, hyp, 0.0, 0.0, 0.0,
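
The hunk above keeps the local permuted background but now reads region counts from the linked coordinate model; enrichment of each shifted region pair is still scored with scipy's hypergeometric survival function. A toy call with invented counts, mirroring hypergeom.sf(nrab - 1.0, N, nralen, nrblen) above:

    from scipy.stats import hypergeom

    # invented counts: N PETs on the chromosome in total, ra touching
    # anchor a, rb touching anchor b, rab linking both anchors
    N, ra, rb, rab = 10000, 120, 90, 15
    # sf at rab - 1 gives P(X >= rab): the chance of drawing at least
    # rab marked PETs when rb of N are drawn and ra are marked
    p = hypergeom.sf(rab - 1, N, ra, rb)
    print(p)
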
@@ -179,8 +192,10 @@
     for r in records:
         chrom = r[0]
         key = "%s-%s-%s" % (r[0], r[3], i)
-        iva = [r[1] - 1, r[2] + 1]
-        ivb = [r[4] - 1, r[5] + 1]
+        #iva = [r[1] - 1, r[2] + 1]
+        #ivb = [r[4] - 1, r[5] + 1]
+        iva = [r[1], r[2]]
+        ivb = [r[4], r[5]]
         #filter loops
         distance = abs(sum(ivb) / 2.0 - sum(iva) / 2.0)
         if distance < discut:
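
Taken together, getCorLink and getCounts replace the old dense per-base coverage list with a sorted, linked coordinate map: getCounts scans forward only until it hits the first occupied coordinate of the interval, then follows the "next" pointers instead of stepping base by base. A small walk-through (Python 2, like the module; the coordinates are invented, the two functions are exactly the ones added above):

    cs = [100, 100, 250, 400]  # x coordinates of four PETs, indices 0-3
    ts = getCorLink(cs)
    # ts[100] == {"next": 250, "points": [0, 1]}; ts[400]["next"] is None
    print(getCounts([90, 260], ts))  # set([0, 1, 2]); PET 3 lies outside
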