Skip to content

Commit

Permalink
db: search different forms of a query
Browse files Browse the repository at this point in the history
Now we also search [fuzzed fuzzes fuzzing fuzzer fuzzers fuzz] for fuzzing
  • Loading branch information
swkim101 committed Nov 19, 2024
1 parent 357ba23 commit 14dbc2a
Show file tree
Hide file tree
Showing 13 changed files with 41,942 additions and 6 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,6 @@ rawdata/

secret/
data_crawler/failed.json
test.py
test.py

*.gob
5 changes: 4 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
FROM golang:1.22 AS build
WORKDIR /src
COPY . .
RUN make index
RUN make lemmas
RUN make server
RUN make index

FROM alpine:latest
WORKDIR /
COPY --from=build /src/server /server
COPY --from=build /src/db.cspapers.org /db.cspapers.org
COPY ./default.server.config /default.server.config
COPY --from=build /src/lemma.gob /lemma.gob
COPY --from=build /src/lemmaInv.gob /lemmaInv.gob
CMD ["/server", "-config", "/default.server.config"]
EXPOSE 8000
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
lemmas:
go run ./lemmas

index:
go run ./api.cspapers.org/index -config default.index.config

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ PR is welcome
Referred to csrankings.org for organizing conferences.
Thanks to https://github.com/michmech/lemmatization-lists for lemma data.
## Disclaimer
cspapers.org is served exactly as implemented in this repository. I do not modify the source code or data during or after distribution.
Expand Down
10 changes: 10 additions & 0 deletions api.cspapers.org/db/bleve/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/swkim101/cspapers.org/api.cspapers.org/db/dbimpl"
"github.com/swkim101/cspapers.org/api.cspapers.org/log"
"github.com/swkim101/cspapers.org/api.cspapers.org/types"
"github.com/swkim101/cspapers.org/api.cspapers.org/wordforms"
)

func init() {
Expand Down Expand Up @@ -134,6 +135,15 @@ func search(req *types.SearchRequest) *types.SearchResponse {
fq.SetFuzziness(2)
keywordQuery = append(keywordQuery, fq)
}
/* try different word forms */
if 3 < len(word) {
for _, wf := range wordforms.GetWordForms(word) {
qsTitle := fmt.Sprintf("title:%v^2", wf)
qsAbs := fmt.Sprintf("abstract:%v", wf)
keywordQuery = append(keywordQuery, bleve.NewQueryStringQuery(qsTitle))
keywordQuery = append(keywordQuery, bleve.NewQueryStringQuery(qsAbs))
}
}
}
}

Expand Down
2 changes: 2 additions & 0 deletions api.cspapers.org/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

"github.com/swkim101/cspapers.org/api.cspapers.org/log"
"github.com/swkim101/cspapers.org/api.cspapers.org/types"
"github.com/swkim101/cspapers.org/api.cspapers.org/wordforms"
)

var (
Expand All @@ -28,6 +29,7 @@ func main() {
if err != nil {
panic(err)
}
wordforms.LoadLemmas(cfg.Lemma, cfg.LemmaInv)
runServer(cfg)
}

Expand Down
8 changes: 5 additions & 3 deletions api.cspapers.org/server/serverConfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ import (

type serverConfig struct {
config.Config
Host string `json:"host"`
Port int `json:"port"`
Prefix string `json:"prefix"`
Host string `json:"host"`
Port int `json:"port"`
Prefix string `json:"prefix"`
Lemma string `json:"lemma"`
LemmaInv string `json:"lemmaInv"`

dbimpl *dbimpl.Type `json:"-"`
}
Expand Down
51 changes: 51 additions & 0 deletions api.cspapers.org/wordforms/wordforms.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package wordforms

import (
"encoding/gob"
"os"
)

// Lookup tables used by GetWordForms; both are populated by LoadLemmas.
var (
	// lemma maps a word form to its base form.
	lemma map[string]string
	// lemmaInv maps a base form to the word forms recorded for it.
	lemmaInv map[string][]string
)

// LoadLemmas reads the gob-encoded tables stored at lemmaPath and
// lemmaInvPath into the package-level lemma and lemmaInv maps. Both maps are
// re-created on every call, so data from an earlier call is discarded.
//
// It panics if either file cannot be opened or decoded.
func LoadLemmas(lemmaPath string, lemmaInvPath string) {
	lemma = make(map[string]string)
	lemmaInv = make(map[string][]string)

	decodeGobFile(lemmaPath, &lemma)
	decodeGobFile(lemmaInvPath, &lemmaInv)
}

// decodeGobFile gob-decodes the file at path into dst and panics on any
// error. The file is closed through defer so the handle is released even when
// decoding panics and a caller recovers.
func decodeGobFile(path string, dst any) {
	f, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	if err := gob.NewDecoder(f).Decode(dst); err != nil {
		panic(err)
	}
}

// GetWordForms returns the word forms related to word.
//
// When word has an entry in the lemma table, its base form is used as the
// root; otherwise word itself is the root. The result lists every form
// recorded for the root in lemmaInv, followed by the root as the last element,
// so it always contains at least one entry.
func GetWordForms(word string) []string {
	root := word
	if base, known := lemma[word]; known {
		root = base
	}

	related := lemmaInv[root]
	// Copy into a fresh slice so callers never share storage with the table.
	forms := make([]string, 0, len(related)+1)
	forms = append(forms, related...)
	return append(forms, root)
}
32 changes: 32 additions & 0 deletions api.cspapers.org/wordforms/wordforms_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package wordforms_test

import (
"os"
"reflect"
"testing"

"github.com/swkim101/cspapers.org/api.cspapers.org/wordforms"
)

// TestMain loads the lemma tables from the gob files two directories above
// this package before running the tests, then exits with the status that
// m.Run reports.
func TestMain(m *testing.M) {
	wordforms.LoadLemmas("../../lemma.gob", "../../lemmaInv.gob")
	os.Exit(m.Run())
}

// TestWords checks that GetWordForms returns the expected forms, in order,
// for a handful of sample words. It stops at the first mismatch.
func TestWords(t *testing.T) {
	cases := []struct {
		word     string
		expected []string
	}{
		{
			word:     "tests",
			expected: []string{"tested", "testing", "tests", "test"},
		},
		{
			word:     "fuzzing",
			expected: []string{"fuzzed", "fuzzes", "fuzzing", "fuzzer", "fuzzers", "fuzz"},
		},
	}

	for _, tc := range cases {
		got := wordforms.GetWordForms(tc.word)
		if reflect.DeepEqual(got, tc.expected) {
			continue
		}
		t.Fatalf("expected %v, got %v", tc.expected, got)
	}
}
4 changes: 3 additions & 1 deletion default.server.config
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@
"type": "bleve",
"host": "localhost",
"port": 8000,
"prefix": "/"
"prefix": "/",
"lemma": "lemma.gob",
"lemmaInv": "lemmaInv.gob"
}
67 changes: 67 additions & 0 deletions lemmas/lemmas.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package main

import (
"bufio"
"encoding/gob"
"log"
"os"
"strings"
)

// Tables built by readFile and serialized by main.
var (
	// lemma maps a word form to its base form.
	lemma map[string]string
	// lemmaInv maps a base form to the word forms listed for it.
	lemmaInv map[string][]string
)

// main builds the lemma tables from the lemmatization lists under lemmas/
// and writes them, gob-encoded, to lemma.gob and lemmaInv.gob in the current
// working directory.
func main() {
	lemma = make(map[string]string)
	lemmaInv = make(map[string][]string)

	readFile("lemmas/lemmatization-en.txt")
	readFile("lemmas/lemmatization-cspapers.txt")

	// Fail the run on any write error so a truncated or empty table is never
	// mistaken for a successful build.
	if err := writeGob("lemma.gob", lemma); err != nil {
		log.Fatalf("writing lemma.gob: %v", err)
	}
	if err := writeGob("lemmaInv.gob", lemmaInv); err != nil {
		log.Fatalf("writing lemmaInv.gob: %v", err)
	}
}

// writeGob gob-encodes v into the file at path, creating or truncating it.
// It returns the first error from creating, encoding, or closing the file.
func writeGob(path string, v any) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer func() {
		// A failed Close can mean the data never reached disk, so report it
		// unless an earlier error is already being returned.
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()
	return gob.NewEncoder(f).Encode(v)
}

// readFile loads one lemmatization list into the package-level tables.
//
// Every non-blank line is split on whitespace: the first field is taken as
// the base form and the second as one of its word forms; any further fields
// are ignored. For each pair the form is recorded in lemma and appended to the
// base form's entry in lemmaInv. A line with only one field is logged and
// skipped instead of indexing past the end of the field slice.
//
// The process exits through log.Fatal if the file cannot be opened or read.
func readFile(filename string) {
	file, err := os.Open(filename)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for lineNo := 1; scanner.Scan(); lineNo++ {
		words := strings.Fields(scanner.Text())
		if len(words) == 0 {
			continue // blank or whitespace-only line
		}
		if len(words) < 2 {
			log.Printf("%s:%d: skipping malformed line %q", filename, lineNo, scanner.Text())
			continue
		}

		left, right := words[0], words[1]
		lemma[right] = left
		// append on a missing key starts from a nil slice, so no existence
		// check is needed.
		lemmaInv[left] = append(lemmaInv[left], right)
	}

	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
}
2 changes: 2 additions & 0 deletions lemmas/lemmatization-cspapers.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
fuzz fuzzer
fuzz fuzzers
Loading

0 comments on commit 14dbc2a

Please sign in to comment.