-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
db: search different forms of a query
Now we also search [fuzzed fuzzes fuzzing fuzzer fuzzers fuzz] for fuzzing
- Loading branch information
Showing
13 changed files
with
41,942 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,4 +25,6 @@ rawdata/ | |
|
||
secret/ | ||
data_crawler/failed.json | ||
test.py | ||
test.py | ||
|
||
*.gob |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,16 @@ | ||
FROM golang:1.22 AS build | ||
WORKDIR /src | ||
COPY . . | ||
RUN make index | ||
RUN make lemmas | ||
RUN make server | ||
RUN make index | ||
|
||
FROM alpine:latest | ||
WORKDIR / | ||
COPY --from=build /src/server /server | ||
COPY --from=build /src/db.cspapers.org /db.cspapers.org | ||
COPY ./default.server.config /default.server.config | ||
COPY --from=build /src/lemma.gob /lemma.gob | ||
COPY --from=build /src/lemmaInv.gob /lemmaInv.gob | ||
CMD ["/server", "-config", "/default.server.config"] | ||
EXPOSE 8000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
lemmas: | ||
go run ./lemmas | ||
|
||
index: | ||
go run ./api.cspapers.org/index -config default.index.config | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
package wordforms | ||
|
||
import ( | ||
"encoding/gob" | ||
"os" | ||
) | ||
|
||
// Lookup tables backing GetWordForms. Both are (re)populated by LoadLemmas.
var (
	// lemma maps a word to the key under which its forms are stored in lemmaInv.
	lemma map[string]string
	// lemmaInv maps a key from lemma to the list of word forms recorded for it.
	lemmaInv map[string][]string
)

// LoadLemmas resets the package-level lookup tables and fills them from two
// gob-encoded files: lemmaPath must decode into a map[string]string and
// lemmaInvPath into a map[string][]string.
//
// The files are read in that order. LoadLemmas panics if either file cannot
// be opened or decoded.
func LoadLemmas(lemmaPath string, lemmaInvPath string) {
	lemma = make(map[string]string)
	lemmaInv = make(map[string][]string)

	decodeGobFile(lemmaPath, &lemma)
	decodeGobFile(lemmaInvPath, &lemmaInv)
}

// decodeGobFile gob-decodes the file at path into dst (a pointer to the
// destination value), panicking if the file cannot be opened or decoded.
func decodeGobFile(path string, dst any) {
	f, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	// Deferred so the handle is released even when Decode fails and we panic;
	// a caller that recovers must not be left with a leaked descriptor.
	defer f.Close()

	if err := gob.NewDecoder(f).Decode(dst); err != nil {
		panic(err)
	}
}
|
||
func GetWordForms(word string) []string { | ||
lem, ok := lemma[word] | ||
wf := []string{} | ||
|
||
if ok { | ||
wf = append(wf, lemmaInv[lem]...) | ||
wf = append(wf, lem) | ||
} else { | ||
wf = append(wf, lemmaInv[word]...) | ||
wf = append(wf, word) | ||
} | ||
|
||
return wf | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
package wordforms_test | ||
|
||
import ( | ||
"os" | ||
"reflect" | ||
"testing" | ||
|
||
"github.com/swkim101/cspapers.org/api.cspapers.org/wordforms" | ||
) | ||
|
||
func TestMain(m *testing.M) { | ||
wordforms.LoadLemmas("../../lemma.gob", "../../lemmaInv.gob") | ||
code := m.Run() | ||
os.Exit(code) | ||
} | ||
|
||
func TestWords(t *testing.T) { | ||
{ | ||
got := wordforms.GetWordForms("tests") | ||
expected := []string{"tested", "testing", "tests", "test"} | ||
if !reflect.DeepEqual(got, expected) { | ||
t.Fatalf("expected %v, got %v", expected, got) | ||
} | ||
} | ||
{ | ||
got := wordforms.GetWordForms("fuzzing") | ||
expected := []string{"fuzzed", "fuzzes", "fuzzing", "fuzzer", "fuzzers", "fuzz"} | ||
if !reflect.DeepEqual(got, expected) { | ||
t.Fatalf("expected %v, got %v", expected, got) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
package main | ||
|
||
import ( | ||
"bufio" | ||
"encoding/gob" | ||
"log" | ||
"os" | ||
"strings" | ||
) | ||
|
||
var lemma map[string]string | ||
var lemmaInv map[string][]string | ||
|
||
func main() { | ||
lemma = make(map[string]string) | ||
lemmaInv = make(map[string][]string) | ||
|
||
readFile("lemmas/lemmatization-en.txt") | ||
readFile("lemmas/lemmatization-cspapers.txt") | ||
|
||
lemmaFile, err := os.Create("lemma.gob") | ||
if err != nil { | ||
panic(err) | ||
} | ||
defer lemmaFile.Close() | ||
enc := gob.NewEncoder(lemmaFile) | ||
enc.Encode(lemma) | ||
|
||
lemmaInvFile, err := os.Create("lemmaInv.gob") | ||
if err != nil { | ||
panic(err) | ||
} | ||
defer lemmaInvFile.Close() | ||
enc = gob.NewEncoder(lemmaInvFile) | ||
enc.Encode(lemmaInv) | ||
} | ||
|
||
func readFile(filename string) { | ||
file, err := os.Open(filename) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
defer file.Close() | ||
|
||
scanner := bufio.NewScanner(file) | ||
for scanner.Scan() { | ||
line := strings.TrimSpace(scanner.Text()) | ||
if len(line) == 0 { | ||
continue | ||
} | ||
words := strings.Fields(line) | ||
left := words[0] | ||
right := words[1] | ||
lemma[right] = left | ||
|
||
if a, ok := lemmaInv[left]; ok { | ||
lemmaInv[left] = append(a, right) | ||
} else { | ||
lemmaInv[left] = []string{right} | ||
} | ||
} | ||
|
||
if err := scanner.Err(); err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
fuzz fuzzer | ||
fuzz fuzzers |
Oops, something went wrong.