From 66aa534018401c3deef628a64349696208f659fc Mon Sep 17 00:00:00 2001 From: anjali9791 Date: Wed, 4 Oct 2023 17:51:51 +0530 Subject: [PATCH] feat: add english stemmer in es schema (#59) * feat: add english stemmer in es schema * add test case --------- Co-authored-by: anjali.agarwal --- internal/store/elasticsearch/es_test.go | 44 +++++++++++++++++++++++++ internal/store/elasticsearch/schema.go | 8 ++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/internal/store/elasticsearch/es_test.go b/internal/store/elasticsearch/es_test.go index ab7bb6e7..4290f905 100644 --- a/internal/store/elasticsearch/es_test.go +++ b/internal/store/elasticsearch/es_test.go @@ -122,6 +122,50 @@ func TestElasticsearch(t *testing.T) { analyzedTokens = append(analyzedTokens, tok.Token) } + if reflect.DeepEqual(expectTokens, analyzedTokens) == false { + return fmt.Errorf("expected analyzer to tokenize %q as %v, was %v", textToAnalyze, expectTokens, analyzedTokens) + } + return nil + }, + }, + { + Title: "created index should be able to correctly identify stemmed tokens", + Service: daggerService, + Validate: func(esClient *store.Client, cli *elasticsearch.Client, indexName string) error { + textToAnalyze := "walking" + analyzerPath := fmt.Sprintf("/%s/_analyze", indexName) + analyzerPayload := fmt.Sprintf(`{"analyzer": "my_analyzer", "text": %q}`, textToAnalyze) + + //nolint:noctx + req, err := http.NewRequest(http.MethodPost, analyzerPath, strings.NewReader(analyzerPayload)) + if err != nil { + return fmt.Errorf("error creating analyzer request: %w", err) + } + req.Header.Add("content-type", "application/json") + + res, err := cli.Perform(req) + if err != nil { + return fmt.Errorf("invoke analyzer: %w", err) + } + defer res.Body.Close() + if res.StatusCode != http.StatusOK { + return fmt.Errorf("elasticsearch returned non-200 response: %d", res.StatusCode) + } + var response struct { + Tokens []struct { + Token string `json:"token"` + } `json:"tokens"` + } + err = json.NewDecoder(res.Body).Decode(&response) + if err != nil { + return fmt.Errorf("error decoding response: %w", err) + } + expectTokens := []string{"walk"} + analyzedTokens := []string{} + for _, tok := range response.Tokens { + analyzedTokens = append(analyzedTokens, tok.Token) + } + if reflect.DeepEqual(expectTokens, analyzedTokens) == false { return fmt.Errorf("expected analyzer to tokenize %q as %v, was %v", textToAnalyze, expectTokens, analyzedTokens) } diff --git a/internal/store/elasticsearch/schema.go b/internal/store/elasticsearch/schema.go index 7710c8e0..18b7d398 100644 --- a/internal/store/elasticsearch/schema.go +++ b/internal/store/elasticsearch/schema.go @@ -15,9 +15,15 @@ var indexSettingsTemplate = `{ "my_analyzer": { "type": "custom", "tokenizer": "my_tokenizer", - "filter": ["lowercase"] + "filter": ["lowercase", "english_stemmer"] } }, + "filter": { + "english_stemmer": { + "type": "stemmer", + "name": "english" + } + }, "tokenizer": { "my_tokenizer": { "type": "pattern",