forked from riteshsharma29/Python_find_duplicate_records
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_duplicate.py
79 lines (54 loc) · 1.84 KB
/
find_duplicate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/python
# -*- coding: utf-8 -*-
import codecs,sys
import pandas as pd
def find_duplicate(filename):
    """Report every repeated line of a UTF-8 text file.

    Blank lines are ignored. Each occurrence of a line after its first one is
    written to ``log.txt`` (created in the current working directory and
    overwritten on every call) beneath a header line, and is also printed to
    standard output.

    Args:
        filename: Path of the UTF-8 encoded text file to scan.

    Returns:
        The repeated lines in the order they were encountered. A line that
        occurs three times therefore appears twice in the result.
    """
    seen = set()  # set membership is constant-time, unlike scanning a list
    dup = []

    # The ``with`` block closes both files even if reading or writing fails.
    with codecs.open(filename, encoding='utf-8') as source, \
            codecs.open("log.txt", 'w', encoding='utf-8') as log_file:
        log_file.write("Below values are duplicate" + '\n' + '\n')

        for rec in source:
            # codecs.open works in binary mode, so Windows line endings
            # arrive as '\r\n'. Strip both characters so that CRLF files
            # compare correctly and their blank lines are really skipped.
            rec = rec.rstrip('\r\n')
            if not rec:
                continue
            if rec in seen:
                dup.append(rec)
            else:
                seen.add(rec)

        for vals in dup:
            log_file.write(vals + '\n')
            # The parenthesised form prints the same on Python 2 and 3.
            print(vals)

    return dup
# Script entry point: scan sample.txt for repeated lines as soon as the file
# is executed. The string below is a no-op note kept for the reader.
'''Change sample.txt with your file
call find_duplicate function'''
find_duplicate(filename="sample.txt")
#################################################################################################################
def find_duplicate_excel(excelfile, sheet, colheadr):
    """Report every repeated value in one column of an Excel worksheet.

    Empty cells are ignored. Each occurrence of a value after its first one
    is written to ``excel_log.txt`` (created in the current working directory
    and overwritten on every call) beneath a header line, and is also printed
    to standard output. Values are compared as text with embedded newlines
    removed.

    Args:
        excelfile: Path of the workbook to read.
        sheet: Name of the worksheet inside the workbook.
        colheadr: Header of the column whose values are checked.

    Returns:
        The repeated values (as text) in the order they were encountered.

    Raises:
        KeyError: If ``colheadr`` is not a column of the worksheet.
    """
    # The sheet is passed positionally because pandas renamed the keyword
    # from ``sheetname`` to ``sheet_name``; the position is the same in both.
    df = pd.read_excel(excelfile, sheet)

    # Read the column the caller asked for and drop empty (NaN) cells, which
    # are not text and cannot be compared or logged.
    colvalues = df[colheadr].dropna().values

    seen = set()  # set membership is constant-time, unlike scanning a list
    dup = []

    # The ``with`` block closes the log even if writing fails part-way.
    with codecs.open("excel_log.txt", 'w', encoding='utf-8') as log_file:
        log_file.write("Below values are duplicate in excel file's " + str(colheadr) + " column" + '\n' + '\n')

        for cell in colvalues:
            # Formatting turns numeric or date cells into text so they can
            # be cleaned, compared and written like ordinary strings.
            rec = u"{0}".format(cell).replace('\n', "")
            if not rec:
                continue
            if rec in seen:
                dup.append(rec)
            else:
                seen.add(rec)

        for vals in dup:
            log_file.write(vals + '\n')
            # The parenthesised form prints the same on Python 2 and 3.
            print(vals)

    return dup
# Script entry point: check the 'Movies' column of sheet 'test' in
# sample.xlsx as soon as the file is executed. The string below is a no-op
# note kept for the reader.
'''Change sample.xlsx with your file
call find_duplicate_excel function
first parameter - excel filename
second parameter - excel sheet name
third parameter -column header consisting values
'''
find_duplicate_excel(excelfile='sample.xlsx', sheet='test', colheadr='Movies')