-
Notifications
You must be signed in to change notification settings - Fork 0
/
FileIndexer.java
70 lines (60 loc) · 2.32 KB
/
FileIndexer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import java.util.*;
import java.io.*;
/**
 * A command-line tool that takes one or more files as input and prints the
 * top 10 most common words found across all of them.
 *
 * <p>Words are runs of ASCII letters and digits, compared case-insensitively.
 * Files that cannot be read are reported on standard error and skipped; any
 * words counted from such a file before the failure are kept.
 *
 * @version 1.00 15 Oct 2016
 * @author Mustaqeem Khowaja
 */
public class FileIndexer {
    // Tokens are delimited by runs of non-alphanumeric (ASCII) characters
    private static final String REGEX = "[^a-zA-Z0-9]+";
    // Maximum number of words to print
    private static final int N = 10;

    /**
     * Entry point: counts the words in every file named in {@code args} and
     * prints the most frequent ones, one per line, as
     * {@code rank.<TAB>word<TAB>count}.
     *
     * @param args paths of the input files to index
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Please include one or more input files.");
            // Nothing to index, so stop instead of falling through.
            return;
        }

        // Collection to store word tokens as they are read
        Map<String, Word> wordMap = new HashMap<>();
        for (String path : args) {
            try {
                countWords(path, wordMap);
            } catch (IOException e) {
                // Best effort: report the unreadable file and move on to the next one.
                System.err.println("Could not read file " + path);
            }
        }

        printTopWords(wordMap.values());
    }

    /**
     * Reads the file at {@code path} line by line and adds every word it
     * contains to {@code wordMap}, incrementing the count of words already present.
     *
     * @param path    path of the file to read
     * @param wordMap accumulator mapping each lower-cased token to its {@code Word}
     * @throws IOException if the file cannot be opened or read
     */
    private static void countWords(String path, Map<String, Word> wordMap) throws IOException {
        // try-with-resources closes the reader even when readLine() fails part-way.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(path)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Locale.ROOT keeps ASCII letters ASCII (e.g. "I" -> "i") regardless of
                // the default locale, so the ASCII-only REGEX does not split words.
                String[] tokens = line.toLowerCase(Locale.ROOT).split(REGEX);
                for (String token : tokens) {
                    // split() yields an empty leading token when the line starts with a delimiter
                    if (token.isEmpty()) {
                        continue;
                    }
                    Word curWord = wordMap.get(token);
                    if (curWord == null) {
                        curWord = new Word(token);
                        wordMap.put(token, curWord);
                    }
                    curWord.incrementCount();
                }
            }
        }
    }

    /**
     * Prints at most {@code N} words from {@code words} in the order defined by
     * {@code Word}'s natural ordering.
     *
     * @param words all distinct words that were counted
     */
    private static void printTopWords(Collection<Word> words) {
        // NOTE(review): presumably Word's natural ordering is descending by count.
        // A TreeSet discards elements that compare as equal, so confirm that
        // Word.compareTo breaks ties (e.g. on the word text); otherwise words with
        // the same count are silently dropped.
        SortedSet<Word> sortedWordSet = new TreeSet<Word>(words);
        int rank = 1;
        for (Word word : sortedWordSet) {
            if (rank > N) {
                break;
            }
            System.out.println(rank + ".\t" + word.Word() + "\t" + word.Count());
            rank++;
        }
    }
}