-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathfind_non_matching.awk
126 lines (118 loc) · 3.4 KB
/
find_non_matching.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/awk -f
#
# script matches expressions/words from a matching file against a field of the input file
#
# when you run this script pass the name of the file with the list of words to match
# to it. you also need to specify which field (column) of the input file you want
# to have checked. fields need to e seperated by a semicolon (;)
#
# example usage:
#
# ./find_non_matching.awk -v fieldnumber=4 -v matchingfile=match.txt inputfile.csv
#
# or
#
# ./find_non_matching.awk -v nonumbers=1 -v fieldnumber=4 -v matchingfile=match.txt inputfile.csv
#
# or
#
# ./find_non_matching.awk -v reverse=1 -v fieldnumber=4 -v matchingfile=match.txt inputfile.csv
#
# process description: the script reads the entries of the matchfile match.txt
# and looks in each row of the input file for an equivalent value in the field/column specified.
# at the end of the script the non-matched values will be output.
#
# you may reverse the process by looking for data that matches instead of looking for non-matching
# data by specifying a value for the reverse variable (-v reverse=1)
#
# specify the variable noheader=1 if the input file has no header row. per default the process will
# skip the first row.
#
#
# note: if you don't want to have the number of occurrence of each non-match value displayed
# then pass a variable of "nonumbers=1" to the script
#
#
# http://datamelt.com
#
# last update: 2010-07-27
#
#
# before we processs the main file, we load the matching file into memory in an array and
# set some variables;
BEGIN {
    # define input and output field separators
    FS=";";
    OFS=";";
    # counter for total input lines
    totalcounter=0;
    # counter for nonmatching/matching values
    counter=0;
    # counter for entries in the matching file
    entrycounter=0;
    # check if a matching file was specified otherwise exit the script.
    # the message goes to stderr so it cannot get mixed into redirected results
    if(!matchingfile)
    {
        print "\nerror: no matching file specified" > "/dev/stderr";
        exit 1;
    }
    # check if a usable fieldnumber variable was specified otherwise
    # use the first column. a negative number can never address a field
    if(!fieldnumber || fieldnumber+0<0)
    {
        fieldnumber=1;
    }
    # loop over all lines in the matching file and add each value to an array.
    # getline yields 1 for every line read, 0 at end of file and -1 on an error,
    # so the status is kept to tell "file is empty" apart from "file is unreadable"
    while ( (readstatus = (getline matching < matchingfile)) > 0 )
    {
        if(matching !="" && substr(matching,1,1)!="#") # only if not empty and not a comment
        {
            # the value is stored lowercased; the array element counts how
            # often the same value appears in the matching file
            entrycounter++;
            ++entry[tolower(matching)];
        }
    }
    # a missing or unreadable matching file must not look like an empty one
    if(readstatus < 0)
    {
        print "\nerror: cannot read matching file: " matchingfile > "/dev/stderr";
        exit 1;
    }
    # the list is in memory now, release the file
    close(matchingfile);
}
# process the data rows of the input file. the first record is treated as a
# header row and skipped, unless the noheader variable is set to "1"
NR > 1 || noheader==1 {
    totalcounter++;
    # rows with an empty field are counted as input lines but never collected
    if($fieldnumber!="")
    {
        # use the "in" operator for the lookup: a plain entry[key] reference
        # would create an empty element for every distinct input value and
        # let the lookup table grow with the size of the input file.
        # the key is lowercased for the lookup only
        is_listed=(tolower($fieldnumber) in entry);
        # in normal mode we keep the values that are not listed,
        # in reverse mode we keep the values that are listed
        if((reverse && is_listed) || (!reverse && !is_listed))
        {
            counter++;
            # collected under the original spelling of the field
            ++foundarray[$fieldnumber];
        }
    }
}
END {
    # awk runs the END rule even after an "exit" in the BEGIN rule. without a
    # matching file nothing was processed, so skip the summary and keep the
    # failure exit status
    if(!matchingfile)
    {
        exit 1;
    }
    print "";
    # output will be each collected value and, unless nonumbers is set, the
    # number of occurrences in the input file.
    # the distinct values are counted in the same loop because length(array)
    # is not available in every awk implementation
    distinctcounter=0;
    for (item in foundarray)
    {
        distinctcounter++;
        if(!nonumbers)
        {
            print item ": " foundarray[item];
        }
        else
        {
            print item;
        }
    }
    print ""
    print "total entries in matching file: " entrycounter;
    print "total lines in input file: " totalcounter;
    print "total elements found in input file: " distinctcounter;
    # counter holds matching lines in reverse mode, non-matching lines otherwise
    if (reverse)
    {
        print "total matching lines in input file: " counter;
    }
    else
    {
        print "total non-matching lines in input file: " counter;
    }
}