diff --git a/.gitignore b/.gitignore index 245a3504..c74fef74 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ data/taxonomies data/**/*.jsonl products*.jsonl.gz data/searchalicious-openapi.yml +data/searchalicious-config-schema.yml +data/searchalicious-settings-schema.yml # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Makefile b/Makefile index 3d832ee2..97d894fc 100644 --- a/Makefile +++ b/Makefile @@ -164,6 +164,14 @@ generate-custom-elements: _ensure_network @echo "🔎 Generating custome-elements.json …" ${DOCKER_COMPOSE} run --rm search_nodejs npm run analyze +generate-config-schema: _ensure_network + @echo "🔎 Generating config-schema.yml …" + ${DOCKER_COMPOSE} run --rm api python3 -m app export-config-schema /opt/search/data/searchalicious-config-schema.yml + +generate-settings-schema: _ensure_network + @echo "🔎 Generating settings-schema.yml …" + ${DOCKER_COMPOSE} run --rm api python3 -m app export-settings-schema /opt/search/data/searchalicious-settings-schema.yml + #-------# # Tests # #-------# diff --git a/app/cli/main.py b/app/cli/main.py index ebf79af9..bd62c4f4 100644 --- a/app/cli/main.py +++ b/app/cli/main.py @@ -225,7 +225,7 @@ def export_openapi( exists=None, file_okay=True, dir_okay=False, - help="Path of target_path the YAML or JSON data file", + help="Path of the YAML or JSON data file", ) ): """Export OpenAPI specification to a file.""" @@ -248,5 +248,51 @@ def export_openapi( print(f"spec written to {target_path}") +def export_schema( + class_: type["app.config.Config"] | type["app.config.Settings"], target_path: Path +): + """Export schema to a file.""" + import json + + import yaml + + from app.config import ConfigGenerateJsonSchema + + schema = class_.model_json_schema(schema_generator=ConfigGenerateJsonSchema) + + print("writing json schema") + with open(target_path, "w") as f: + if str(target_path).endswith(".json"): + json.dump(schema, f, indent=2) + else: + yaml.safe_dump(schema, f, sort_keys=False) + + print(f"schema written to {target_path}") + + +schema_target_path = typer.Argument( + exists=None, + file_okay=True, + dir_okay=False, + help="Path of the YAML or JSON data file", +) + + +@cli.command() +def export_config_schema(target_path: Path = schema_target_path): + """Export Configuration JSON schema to a file.""" + from app.config import Config + + export_schema(Config, target_path) + + +@cli.command() +def export_settings_schema(target_path: Path = schema_target_path): + """Export Settings JSON schema to a file.""" + from app.config import Settings + + export_schema(Settings, target_path) + + def main() -> None: cli() diff --git a/app/config.py b/app/config.py index 3182059e..6ce2adcc 100644 --- a/app/config.py +++ b/app/config.py @@ -1,6 +1,6 @@ -import json import logging from enum import StrEnum, auto +from inspect import cleandoc as cd_ from pathlib import Path from typing import Annotated, Any @@ -11,8 +11,17 @@ log = logging.getLogger(__name__) +ES_DOCS_URL = "https://www.elastic.co/guide/en/elasticsearch/reference/current" + class LoggingLevel(StrEnum): + """Accepted logging levels + + * NOTSET - means no logs + * DEBUG / INFO / WARNING / ERROR / CRITICAL + - match standard Python logging levels + """ + NOTSET = "NOTSET" DEBUG = "DEBUG" INFO = "INFO" @@ -41,17 +50,77 @@ class ScriptType(StrEnum): class Settings(BaseSettings): - # Path of the search-a-licious yaml configuration file - config_path: Path | None = None - redis_reader_timeout: int = 5 - elasticsearch_url: str = "http://localhost:9200" - redis_host: str =
"localhost" - redis_port: int = 6379 - sentry_dns: str | None = None - log_level: LoggingLevel = LoggingLevel.INFO - taxonomy_cache_dir: Path = Path("data/taxonomies") - # User-Agent used when fetching resources (taxonomies) or documents - user_agent: str = "search-a-licious" + """Settings for Search-a-licious + + The most important settings is `config_path`. + + Those settings can be overridden through environment + by using the name in capital letters. + If you use docker compose, a good way to do that + is to modify those values in your .env file. + """ + + config_path: Annotated[ + Path | None, + Field( + description=cd_( + """Path to the search-a-licious yaml configuration file. + + See [Explain configuration file](../explain-configuration/) for more information + """ + ) + ), + ] = None + elasticsearch_url: Annotated[ + str, + Field( + description=cd_( + """URL to the ElasticSearch instance + + Bare in mind this is from inside the container. + """ + ) + ), + ] = "http://localhost:9200" + redis_host: Annotated[ + str, + Field( + description=cd_( + """Host for the Redis instance containing event stream + + Bare in mind this is from inside the container. + """ + ) + ), + ] = "localhost" + redis_port: Annotated[ + int, + Field(description="Port for the redis host instance containing event stream"), + ] = 6379 + redis_reader_timeout: Annotated[ + int, Field(description="timeout in seconds to read redis event stream") + ] = 5 + sentry_dns: Annotated[ + str | None, + Field( + description="Sentry DNS to report incident, if None no incident is reported" + ), + ] = None + log_level: Annotated[ + LoggingLevel, Field(description=f"Log level. {LoggingLevel.__doc__}") + ] = LoggingLevel.INFO + taxonomy_cache_dir: Annotated[ + Path, + Field( + description="Directory where to store taxonomies before ingestion to ElasticSearch" + ), + ] = Path("data/taxonomies") + user_agent: Annotated[ + str, + Field( + description="User-Agent used when fetching resources (taxonomies) or documents" + ), + ] = "search-a-licious" settings = Settings() @@ -105,17 +174,65 @@ def generate(self, schema, mode="validation"): class TaxonomySourceConfig(BaseModel): - name: Annotated[str, Field(description="name of the taxonomy")] + """Configuration on how to fetch a particular taxonomy.""" + + name: Annotated[ + str, + Field( + description=cd_( + """Name of the taxonomy + + This is the name you will use in the configuration (and the API) + to reference this taxonomy + """ + ) + ), + ] url: Annotated[ HttpUrl, Field( - description="URL of the taxonomy, must be in JSON format and follows Open Food Facts " - "taxonomy format." + description=cd_( + """URL of the taxonomy. + + The target file must be in JSON format + and follows Open Food Facts JSON taxonomy format. + + This is a dict where each key correspond to a taxonomy entry id, + values are dict with following properties: + + * name: contains a dict giving the name (string) for this entry + in various languages (keys are language codes) + * synonyms: contains a dict giving a list of synonyms by language code + * parents: contains a list of direct parent ids (taxonomy is a directed acyclic graph) + + Other keys correspond to properties associated to this entry (eg. wikidata id). + """ + ) ), ] class FieldType(StrEnum): + """Supported field types in Search-a-Licious are: + + * keyword: string values that won't be interpreted (tokenized). + Good for things like tags, serial, property values, etc. 
+ * date: Date fields + * double, float, half_float, scaled_float: + different ways of storing floats with different capacities + * short, integer, long, unsigned_long: + integers (with different capacities: 16 / 32 / 64 bits) + * bool: boolean (true / false) values + * text: a text which is tokenized to enable full text search + * text_lang: like text, but with different values in different languages. + Tokenization will use analyzers specific to each language. + * taxonomy: a field akin to keyword but + with support for matching using taxonomy synonyms and translations + * disabled: a field that is neither stored nor searchable + (see [Elasticsearch help]) + * object: this field contains a dict with sub-fields. + """ + keyword = auto() date = auto() half_float = auto() @@ -136,63 +253,107 @@ class FieldType(StrEnum): object = auto() def is_numeric(self): + """Return whether this field type can be considered numeric""" return self in (FieldType.integer, FieldType.float, FieldType.double) +# add url to FieldType doc +if FieldType.__doc__: + FieldType.__doc__ += f"\n\n[Elasticsearch help]: {ES_DOCS_URL}/enabled.html" + + class FieldConfig(BaseModel): # name of the field (internal field), it's added here for convenience. # It's set by the `add_field_name_to_each_field` classmethod. - name: Annotated[str, Field(description="name of the field, must be unique")] = "" + name: Annotated[str, Field(description="name of the field (must be unique)")] = "" type: Annotated[ FieldType, - Field(description="type of the field, see `FieldType` for possible values"), + Field(description=f"Type of the field\n\n{cd_(FieldType.__doc__)}"), ] required: Annotated[ bool, - Field(description="if required=True, the field is required in the input data"), + Field( + description=cd_( + """if required=True, the field is required in the input data + + An entry that does not contain a value for this field will be rejected. + """ + ) + ), ] = False input_field: Annotated[ str | None, - Field(description="name of the input field to use when importing data"), + Field( + description=cd_( + """name of the input field to use when importing data + + By default, Search-a-licious uses the same name as the field name. + + This is useful to index the same field using different types or configurations. + """ + ) + ), ] = None - # split: Annotated[ bool, Field( - description="do we split the input field with `split_separator` ?\n\n" - "This is useful if you have some text fields that contains list of values, " - "(for example a comma separated list of values, like apple,banana,carrot).\n\n" - "You must set split_separator to the character that separates the values in the dataset." + description=cd_( + """do we split the input field with `split_separator`? + + This is useful if you have some text fields that contain a list of values + (for example a comma-separated list of values, like apple,banana,carrot). + + You must set split_separator to the character that separates the values in the dataset. + """ + ) ), ] = False full_text_search: Annotated[ bool, Field( - description="do we include perform full text search using this field. If " - "false, the field is only used during search when filters involving this " - "field are provided." + description=cd_( + """Whether this field is included in default full text search. + + If `false`, the field is only used during search + when filters involving this field are provided + (as opposed to full text search expressions without any explicit field).
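For example (with the Lucene-like query syntax used by Search-a-licious), a plain query
such as `chocolate` only looks at fields with `full_text_search=true`, while
`labels:organic` (where `labels` is a hypothetical field name) targets that field explicitly.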
+ """ + ) ), ] = False bucket_agg: Annotated[ bool, Field( - description="do we add an bucket aggregation to the elasticsearch query for this field. " - "It is used to return a 'faceted-view' with the number of results for each facet value. " - "Only valid for keyword or numeric field types." + description=cd_( + """do we add an bucket aggregation to the elasticsearch query for this field. + + It is used to return a 'faceted-view' with the number of results for each facet value, + or to generate bar charts. + + Only valid for keyword or numeric field types. + """ + ) ), ] = False taxonomy_name: Annotated[ str | None, Field( - description="the name of the taxonomy associated with this field. " - "It must only be provided for taxonomy field type." + description=cd_( + """the name of the taxonomy associated with this field. + + It must only be provided for taxonomy field type. + """ + ) ), ] = None add_taxonomy_synonyms: Annotated[ bool, Field( - description="if True, add all synonyms of the taxonomy values to the index. " - "The flag is ignored if the field type is not `taxonomy`." + description=cd_( + """if True, add all synonyms of the taxonomy values to the index. + The flag is ignored if the field type is not `taxonomy`. + """ + ) ), ] = True @@ -213,81 +374,138 @@ def get_input_field(self): return self.input_field or self.name def has_lang_subfield(self) -> bool: + """Return wether this field type is supposed to have different values + per languages""" return self.type in (FieldType.taxonomy, FieldType.text_lang) -class ESIndexConfig(BaseModel): - name: Annotated[str, Field(description="name of the index alias to use")] - id_field_name: Annotated[ +class BaseESIndexConfig(BaseModel): + """Base class for configuring ElasticSearch indexes""" + + name: Annotated[ str, Field( - description="name of the field to use for `_id`." - "it is mandatory to provide one.\n\n " - "If your dataset does not have an identifier field, " - "you should use a document preprocessor to compute one." + description=cd_( + """Name of the index alias to use. + + Search-a-licious will create an index using this name and an import date, + but alias will always point to the latest index. + + The alias must not already exists in your ElasticSearch instance. + """ + ) ), ] - last_modified_field_name: Annotated[ - str, + number_of_shards: Annotated[ + int, Field( - description="name of the field containing the date of last modification, " - "used for incremental updates using Redis queues. " - "The field value must be an int/float representing the timestamp.\n\n" + description=cd_( + f"""Number of shards to use for the index. + + Shards are useful to distribute the load on your cluster. + (see [index settings]({ES_DOCS_URL}/index-modules.html#_static_index_settings)) + """ + ) ), - ] - number_of_shards: Annotated[ - int, Field(description="number of shards to use for the index") ] = 4 number_of_replicas: Annotated[ - int, Field(description="number of replicas to use for the index") + int, + Field( + description=cd_( + f"""Number of replicas to use for the index. + + More replica means more resiliency but also more disk space and memory. + + (see [index settings]({ES_DOCS_URL}/index-modules.html#_static_index_settings)) + """ + ) + ), ] = 1 -class TaxonomyIndexConfig(BaseModel): - """We have an index storing multiple taxonomies +class ESIndexConfig(BaseESIndexConfig): + """This is the configuration for the main index containing the data. 
- It enables functions like auto-completion, or field suggestions - as well as enrichment of requests with synonyms + It's used to create the index in Elasticsearch and configure its mappings + (along with the *fields* config). """ - name: Annotated[ + id_field_name: Annotated[ str, - Field(description="name of the taxonomy index alias to use"), + Field( + description=cd_( + """Name of the field to use for `_id`. + It is mandatory to provide one. + + If your dataset does not have an identifier field, + you should use a document preprocessor to compute one (see `preprocessor`). + """ + ) + ), ] - number_of_shards: Annotated[ - int, Field(description="number of shards to use for the index") - ] = 4 - number_of_replicas: Annotated[ - int, Field(description="number of replicas to use for the index") - ] = 1 + last_modified_field_name: Annotated[ + str, + Field( + description=cd_( + """Name of the field containing the date of last modification + in your indexed objects. + + This is used for incremental updates using Redis queues. + + The field value must be an int/float representing the timestamp. + """ + ) + ), + ] + + +class TaxonomyIndexConfig(BaseESIndexConfig): + """This is the configuration of + the Elasticsearch index storing the taxonomies. + + All taxonomies are stored within the same index. + + It enables functions like auto-completion, or field suggestions, + as well as enrichment of requests with synonyms. + """ class TaxonomyConfig(BaseModel): """Configuration of taxonomies, - that is collections of entries with synonyms in multiple languages + that is collections of entries with synonyms in multiple languages. Field may be linked to taxonomies. + + It enables enriching search with synonyms, + providing suggestions, + or building informative facets. + """ sources: Annotated[ list[TaxonomySourceConfig], - Field(description="configurations of used taxonomies"), + Field(description="Configurations of taxonomies that this project will use."), ] exported_langs: Annotated[ list[str], Field( - description="a list of languages for which we want taxonomized fields " - "to be always exported during indexing. During indexing, we use the taxonomy " - "to translate every taxonomized field in a language-specific subfield. The list " - "of language depends on the value defined here and on the optional " - "`taxonomy_langs` field that can be defined in each document.", + description=cd_( + """a list of languages for which + we want taxonomized fields to always be exported during indexing. + + During indexing, we use the taxonomy to translate every taxonomized field + in a language-specific subfield. + + The list of languages depends on the value defined here and on the optional + `taxonomy_langs` field that can be defined in each document. + + Beware that providing many languages might inflate the index size. + """, + ) ), ] index: Annotated[ TaxonomyIndexConfig, - Field( - description="configuration of the taxonomy index. There is a single index for all taxonomies." - ), + Field(description=TaxonomyIndexConfig.__doc__), ] @@ -326,20 +544,39 @@ class ScriptConfig(BaseModel): # Or some type checking/transformation ? +INDEX_CONFIG_INDEX_DESCRIPTION = """ +Through these settings, you can tweak some of the index settings. +""" + + class IndexConfig(BaseModel): - """Inside the config file we can have several indexes defined. + """This object gives configuration for one index. - This object gives configuration for one index. + One index usually corresponds to one dataset.
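A minimal sketch of one index section, as it could appear under `indices` in the
YAML configuration file (the names and values are illustrative, not prescribed
by the schema; only the keys come from this model):

```yaml
index:
  name: example-dataset
  id_field_name: id
  last_modified_field_name: last_modified_t
fields:
  id:
    type: keyword
  name:
    type: text_lang
    full_text_search: true
supported_langs: ["en", "fr"]
document_fetcher: app.openfoodfacts.DocumentFetcher
taxonomy:
  sources: []
  exported_langs: ["en"]
  index:
    name: example-taxonomies
```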
""" - index: Annotated[ - ESIndexConfig, Field(description="configuration of the Elasticsearch index") - ] + index: Annotated[ESIndexConfig, Field(description=ESIndexConfig.__doc__)] fields: Annotated[ dict[str, FieldConfig], Field( - description="configuration of all fields in the index, keys are field " - "names and values contain the field configuration" + description=cd_( + """Configuration of all fields we need to store in the index. + + Keys are field names, + values contain the field configuration. + + This is a very important part of the configuration. + + Most of the ElasticSearch mapping will depends on it. + ElasticSearch will also use this configuration + to provide intended behaviour. + + (see also [Explain Configuration](./explain_configuration.md#fields)) + + If you change those settings you will have to re-index all the data. + (But you can do so in the background). + """ + ) ), ] split_separator: Annotated[ @@ -357,30 +594,36 @@ class IndexConfig(BaseModel): ] = "_" primary_color: Annotated[ str, - Field(description="Used for vega charts. Should be html code."), + Field(description="Used for vega charts. Use CSS color code."), ] = "#aaa" accent_color: Annotated[ str, - Field( - description="Used for vega. Should be html code." - 'and the language code, ex: product_name_it if lang_separator="_"' - ), + Field(description="Used for vega. Should be CSS color code."), ] = "#222" - taxonomy: Annotated[ - TaxonomyConfig, Field(description="configuration of the taxonomies used") - ] + taxonomy: Annotated[TaxonomyConfig, Field(description=TaxonomyConfig.__doc__)] supported_langs: Annotated[ list[str], Field( - description="A list of all supported languages, it is used to build index mapping" + description="A list of all supported languages, it is used to build index mapping", + examples=[["en", "fr", "it"]], ), ] document_fetcher: Annotated[ str, Field( - description="The full qualified reference to the document fetcher, i.e. the class " - "responsible from fetching the document using the document ID present in the Redis " - "Stream.", + description=cd_( + """The full qualified reference to the document fetcher, + i.e. the class responsible from fetching the document. + using the document ID present in the Redis Stream. + + It should inherit `app._import.BaseDocumentFetcher` + and specialize the `fetch_document` method. + + To keep things sleek, + you generally have few item fields in the event stream payload. + This class will fetch the full document using your application API. + """ + ), examples=["app.openfoodfacts.DocumentFetcher"], ), ] @@ -388,9 +631,18 @@ class IndexConfig(BaseModel): Annotated[ str, Field( - description="The full qualified reference to the preprocessor to use before " - "data import. This is used to adapt the data schema or to add search-a-licious " - "specific fields for example.", + description=cd_( + """The full qualified reference to the preprocessor + to use before data import. + + This class must inherit `app.indexing.BaseDocumentPreprocessor` + and specialize the `preprocess` method. + + This is used to adapt the data schema + or to add search-a-licious specific fields + for example. + """ + ), examples=["app.openfoodfacts.DocumentPreprocessor"], ), ] @@ -400,9 +652,16 @@ class IndexConfig(BaseModel): Annotated[ str, Field( - description="The full qualified reference to the elasticsearch result processor " - "to use after search query to Elasticsearch. 
This is used to add custom fields " - "for example.", + description=cd_( + """The fully qualified reference to the Elasticsearch result processor + to use after a search query to Elasticsearch. + + This class must inherit `app.postprocessing.BaseResultProcessor` + and specialize the `process_after` method. + + This can be used to add custom fields computed from index content. + """ + ), examples=["app.openfoodfacts.ResultProcessor"], ), ] @@ -412,23 +671,48 @@ class IndexConfig(BaseModel): Annotated[ dict[str, ScriptConfig], Field( - description="You can add scripts that can be used for sorting results", + description=cd_( + """You can add scripts that can be used for sorting results. + + Each key is a script name, with its configuration. + """ + ), ), ] | None ) = None match_phrase_boost: Annotated[ - float, Field(description="How much we boost exact matches on individual fields") + float, + Field( + description=cd_( + """How much we boost exact matches on individual fields + + This only makes sense when using "best match" order. + """ + ) + ), ] = 2.0 document_denylist: Annotated[ - set[str], Field(description="list of documents IDs to ignore") + set[str], + Field( + description=cd_( + """list of document IDs to ignore. + + Use this to skip some documents at indexing time. + """ + ) + ), ] = Field(default_factory=set) redis_stream_name: Annotated[ str | None, Field( - description="name of the Redis stream to read from when listening to document updates. " - "If not provided, document updates won't be listened to for this index." + description=cd_( + """Name of the Redis stream to read from when listening to document updates. + + If not provided, document updates won't be listened to for this index. + """ + ) ), ] = None @@ -473,6 +757,7 @@ def field_references_must_exist_and_be_valid(self): @field_validator("fields") @classmethod def add_field_name_to_each_field(cls, fields: dict[str, FieldConfig]): + """It's handy to have the name of the field in the field definition""" for field_name, field_item in fields.items(): field_item.name = field_name return fields @@ -503,17 +788,31 @@ def get_fields_with_bucket_agg(self): ] +CONFIG_DESCRIPTION_INDICES = """ +A Search-a-licious instance has only one configuration file, +but is capable of serving multiple datasets. + +It provides a section for each index you want to create (corresponding to a dataset). + +The key is the ID of the index that can be referenced at query time. +One index corresponds to a specific set of documents and can be queried independently. + +If you have multiple indexes, one of them must be designated as the default one, +see `default_index`. +""" + + class Config(BaseModel): - """This is the global config object that reflects - the yaml configuration file. + """Search-a-licious server configuration. + + The configuration is loaded from a YAML file + that must satisfy this schema. Validations will be performed while we load it. """ indices: dict[str, IndexConfig] = Field( - description="configuration of indices. " - "The key is the ID of the index that can be referenced at query time. " - "One index corresponds to a specific set of documents and can be queried independently."
+ description="configuration of indices.\n\n" + CONFIG_DESCRIPTION_INDICES ) default_index: Annotated[ str, @@ -560,16 +859,6 @@ def from_yaml(cls, path: Path) -> "Config": data = yaml.safe_load(f) return cls(**data) - @classmethod - def export_json_schema(cls): - """Export JSON schema.""" - (Path(__file__).parent.parent / "config_schema.json").write_text( - json.dumps( - cls.model_json_schema(schema_generator=ConfigGenerateJsonSchema), - indent=4, - ) - ) - # CONFIG is a global variable that contains the search-a-licious configuration # used. It is specified by the envvar CONFIG_PATH. diff --git a/config_schema.json b/config_schema.json deleted file mode 100644 index 925684dd..00000000 --- a/config_schema.json +++ /dev/null @@ -1,374 +0,0 @@ -{ - "$defs": { - "ESIndexConfig": { - "properties": { - "name": { - "description": "name of the index alias to use", - "title": "Name", - "type": "string" - }, - "id_field_name": { - "description": "name of the field to use for `_id`", - "title": "Id Field Name", - "type": "string" - }, - "last_modified_field_name": { - "description": "name of the field containing the date of last modification, used for incremental updates using Redis queues. The field value must be an int/float representing the timestamp.", - "title": "Last Modified Field Name", - "type": "string" - }, - "number_of_shards": { - "default": 4, - "description": "number of shards to use for the index", - "title": "Number Of Shards", - "type": "integer" - }, - "number_of_replicas": { - "default": 1, - "description": "number of replicas to use for the index", - "title": "Number Of Replicas", - "type": "integer" - } - }, - "required": [ - "name", - "id_field_name", - "last_modified_field_name" - ], - "title": "ESIndexConfig", - "type": "object" - }, - "FieldConfig": { - "properties": { - "name": { - "default": "", - "description": "name of the field, must be unique", - "title": "Name", - "type": "string" - }, - "type": { - "allOf": [ - { - "$ref": "#/$defs/FieldType" - } - ], - "description": "type of the field, see `FieldType` for possible values" - }, - "required": { - "default": false, - "description": "if required=True, the field is required in the input data", - "title": "Required", - "type": "boolean" - }, - "input_field": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "name of the input field to use when importing data", - "title": "Input Field" - }, - "split": { - "default": false, - "description": "do we split the input field with `split_separator`", - "title": "Split", - "type": "boolean" - }, - "full_text_search": { - "default": false, - "description": "do we include perform full text search using this field. If false, the field is only used during search when filters involving this field are provided.", - "title": "Full Text Search", - "type": "boolean" - }, - "bucket_agg": { - "default": false, - "description": "do we add an bucket aggregation to the elasticsearch query for this field. It is used to return a 'faceted-view' with the number of results for each facet value. Only valid for keyword or numeric field types.", - "title": "Bucket Agg", - "type": "boolean" - }, - "taxonomy_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "the name of the taxonomy associated with this field. 
It must only be provided for taxonomy field type.", - "title": "Taxonomy Name" - }, - "add_taxonomy_synonyms": { - "default": true, - "description": "if True, add all synonyms of the taxonomy values to the index. The flag is ignored if the field type is not `taxonomy`.", - "title": "Add Taxonomy Synonyms", - "type": "boolean" - } - }, - "required": [ - "type" - ], - "title": "FieldConfig", - "type": "object" - }, - "FieldType": { - "enum": [ - "keyword", - "date", - "half_float", - "scaled_float", - "float", - "double", - "integer", - "short", - "long", - "unsigned_long", - "bool", - "text", - "text_lang", - "taxonomy", - "disabled", - "object" - ], - "title": "FieldType", - "type": "string" - }, - "IndexConfig": { - "properties": { - "index": { - "allOf": [ - { - "$ref": "#/$defs/ESIndexConfig" - } - ], - "description": "configuration of the Elasticsearch index" - }, - "fields": { - "additionalProperties": { - "$ref": "#/$defs/FieldConfig" - }, - "description": "configuration of all fields in the index, keys are field names and values contain the field configuration", - "title": "Fields", - "type": "object" - }, - "split_separator": { - "default": ",", - "description": "separator to use when splitting values, for fields that have split=True", - "title": "Split Separator", - "type": "string" - }, - "lang_separator": { - "default": "_", - "description": "for `text_lang` FieldType, the separator between the name of the field and the language code, ex: product_name_it if lang_separator=\"_\"", - "title": "Lang Separator", - "type": "string" - }, - "taxonomy": { - "allOf": [ - { - "$ref": "#/$defs/TaxonomyConfig" - } - ], - "description": "configuration of the taxonomies used" - }, - "supported_langs": { - "description": "A list of all supported languages, it is used to build index mapping", - "items": { - "type": "string" - }, - "title": "Supported Langs", - "type": "array" - }, - "document_fetcher": { - "description": "The full qualified reference to the document fetcher, i.e. the class responsible from fetching the document using the document ID present in the Redis Stream.", - "examples": [ - "app.openfoodfacts.DocumentFetcher" - ], - "title": "Document Fetcher", - "type": "string" - }, - "preprocessor": { - "anyOf": [ - { - "description": "The full qualified reference to the preprocessor to use before data import. This is used to adapt the data schema or to add search-a-licious specific fields for example.", - "examples": [ - "app.openfoodfacts.DocumentPreprocessor" - ], - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Preprocessor" - }, - "result_processor": { - "anyOf": [ - { - "description": "The full qualified reference to the elasticsearch result processor to use after search query to Elasticsearch. 
This is used to add custom fields for example.", - "examples": [ - "app.openfoodfacts.ResultProcessor" - ], - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Result Processor" - }, - "match_phrase_boost": { - "default": 2.0, - "description": "How much we boost exact matches on individual fields", - "title": "Match Phrase Boost", - "type": "number" - }, - "document_denylist": { - "description": "list of documents IDs to ignore", - "items": { - "type": "string" - }, - "title": "Document Denylist", - "type": "array", - "uniqueItems": true - }, - "redis_stream_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "name of the Redis stream to read from when listening to document updates. If not provided, document updates won't be listened to for this index.", - "title": "Redis Stream Name" - } - }, - "required": [ - "index", - "fields", - "taxonomy", - "supported_langs", - "document_fetcher" - ], - "title": "IndexConfig", - "type": "object" - }, - "TaxonomyConfig": { - "properties": { - "sources": { - "description": "configurations of used taxonomies", - "items": { - "$ref": "#/$defs/TaxonomySourceConfig" - }, - "title": "Sources", - "type": "array" - }, - "exported_langs": { - "description": "a list of languages for which we want taxonomized fields to be always exported during indexing. During indexing, we use the taxonomy to translate every taxonomized field in a language-specific subfield. The list of language depends on the value defined here and on the optional `taxonomy_langs` field that can be defined in each document.", - "items": { - "type": "string" - }, - "title": "Exported Langs", - "type": "array" - }, - "index": { - "allOf": [ - { - "$ref": "#/$defs/TaxonomyIndexConfig" - } - ], - "description": "configuration of the taxonomy index. There is a single index for all taxonomies." - } - }, - "required": [ - "sources", - "exported_langs", - "index" - ], - "title": "TaxonomyConfig", - "type": "object" - }, - "TaxonomyIndexConfig": { - "properties": { - "name": { - "description": "name of the taxonomy index alias to use", - "title": "Name", - "type": "string" - }, - "number_of_shards": { - "default": 4, - "description": "number of shards to use for the index", - "title": "Number Of Shards", - "type": "integer" - }, - "number_of_replicas": { - "default": 1, - "description": "number of replicas to use for the index", - "title": "Number Of Replicas", - "type": "integer" - } - }, - "required": [ - "name" - ], - "title": "TaxonomyIndexConfig", - "type": "object" - }, - "TaxonomySourceConfig": { - "properties": { - "name": { - "description": "name of the taxonomy", - "title": "Name", - "type": "string" - }, - "url": { - "description": "URL of the taxonomy, must be in JSON format and follows Open Food Facts taxonomy format.", - "format": "uri", - "maxLength": 2083, - "minLength": 1, - "title": "Url", - "type": "string" - } - }, - "required": [ - "name", - "url" - ], - "title": "TaxonomySourceConfig", - "type": "object" - } - }, - "properties": { - "indices": { - "additionalProperties": { - "$ref": "#/$defs/IndexConfig" - }, - "description": "configuration of indices. The key is the ID of the index that can be referenced at query time. 
One index corresponds to a specific set of documents and can be queried independently.", - "title": "Indices", - "type": "object" - }, - "default_index": { - "description": "the default index to use when no index is specified in the query", - "title": "Default Index", - "type": "string" - } - }, - "required": [ - "indices", - "default_index" - ], - "title": "JSON schema for search-a-licious configuration file", - "type": "object", - "$schema": "https://json-schema.org/draft/2020-12/schema" -} \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index f5ee159f..4467b325 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -35,7 +35,7 @@ x-api-common: &api-common image: ghcr.io/openfoodfacts/search-a-licious/search_service_image:${TAG:-dev} restart: ${RESTART_POLICY:-always} environment: - - ELASTICSEARCH_URL=http://es01:9200 + - ELASTICSEARCH_URL=${ELASTICSEARCH_URL:-http://es01:9200} - SENTRY_DNS - LOG_LEVEL - REDIS_HOST diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index 4c9066ad..4468747d 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -35,7 +35,7 @@ "github_repo": "search-a-licious", "github_banner": True, "extra_nav_links": { - "Back to main doc": "/search-a-licious", + "🢀 Back to main doc": "/search-a-licious", }, } diff --git a/docs/users/explain-configuration.md b/docs/users/explain-configuration.md index 30c16beb..fe5c8a86 100644 --- a/docs/users/explain-configuration.md +++ b/docs/users/explain-configuration.md @@ -5,8 +5,72 @@ and all the rest works (at least for main scenarios). The configuration file is a YAML file. +## One configuration, multiple datasets + +A Search-a-licious instance has only one configuration file, +but is capable of serving multiple datasets. + It provides a section for each index you want to create (corresponding to a dataset). +If you have more than one dataset, one must be declared the default (see [default_index](../ref-config/searchalicious-config-schema.html#default_index)). + +## Main sections + +For each index, the main sections are: + +* index: some configuration of the Elasticsearch index +* fields: the fields you want to put in the index, their type and other configurations +* taxonomy: definitions of taxonomies that are used by this index +* redis_stream_name and document_fetcher: if you use continuous updates, you will need to define both +* preprocessor and result_processor: two fields that let you handle the specificities of your dataset +* scripts: to use sort by script (see [How to use scripts](./how-to-use-scripts.md)) + + +## Index configuration + +Search-a-licious is really built upon Elasticsearch. + +This section provides some important fields to control the way it is used. + +`id_field_name` is particularly important as it must contain a field that uniquely identifies each item. +If you don't have such a field, you might use `preprocessor` to compute one. +It is important to have such an id to be able to use [continuous updates](FIXME). + +`last_modified_field_name` is also important for continuous updates to decide +where to start the event stream processing. + +## Fields + +This is one of the most important sections. + +It specifies what will be stored in your index, +which fields will be searchable, and how. + +You have to plan in advance how you configure this; a minimal sketch is shown below.
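For illustration, a fields section could look like this (the field names are
hypothetical; only the keys such as `type`, `full_text_search`, `taxonomy_name`
and `bucket_agg` come from the configuration schema):

```yaml
fields:
  name:
    type: text_lang
    full_text_search: true
  categories:
    type: taxonomy
    taxonomy_name: categories
    bucket_agg: true
  price:
    type: float
    bucket_agg: true
```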
+ +Think well about: +* fields you want to search and how you want to search them +* which information you need to display in search results +* what you need to sort on +* which facets you want to display +* which charts you need to build + +Changing this section will probably involve a full re-indexing of all your items. + +Read more in the [reference documentation](../ref-config/searchalicious-config-schema.html#fields). + +## Document fetcher, pre-processors and post-processors + +It is not always straightforward to index an item. + +Search-a-licious offers a way for you to customize some critical operations using Python code. + +* preprocessor adapts your documents before they are indexed +* result_processor adapts each result returned by a search; keep it lightweight! +* document_fetcher is only used for continuous updates to fetch documents using an API + +Read more in the [reference documentation](../ref-config/searchalicious-config-schema.html). ## Scripts -### Split separator \ No newline at end of file +You can also add scripts for sorting documents. See [How to use scripts](./how-to-use-scripts.md). \ No newline at end of file diff --git a/docs/users/how-to-install.md b/docs/users/how-to-install.md index 9d25db03..e9b6fe48 100644 --- a/docs/users/how-to-install.md +++ b/docs/users/how-to-install.md @@ -8,5 +8,4 @@ All configuration are passed through environment variables to services through the docker-compose file. The only required change is to set the `CONFIG_PATH` variable to the path of your YAML configuration file. This file is used to configure the search-a-licious indexer and search services. - - +To learn more about settings, see the [Reference for Settings](./ref-settings.md). \ No newline at end of file diff --git a/docs/users/how-to-use-scripts.md b/docs/users/how-to-use-scripts.md index c3670c7d..c7b5c54f 100644 --- a/docs/users/how-to-use-scripts.md +++ b/docs/users/how-to-use-scripts.md @@ -59,7 +59,10 @@ Here: It's mostly a way to declare constants in the script.
(hopefully more convenient than declaring them in the script) -See [introduction to script in Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-scripting-using.html) +For more information on configuring scripts, see the [configuration reference](../ref-config/searchalicious-config-schema.html#indices_additionalProperties_scripts) + +For information on how to write scripts, +see the [introduction to scripting in the Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-scripting-using.html) ## Import the scripts in Elasticsearch diff --git a/docs/users/ref-config.md b/docs/users/ref-config.md new file mode 100644 index 00000000..3c7e76ee --- /dev/null +++ b/docs/users/ref-config.md @@ -0,0 +1,8 @@ +# Reference for Configuration file + +You can find the [raw JSON schema here](./searchalicious-config-schema.yml) + +[See configuration documentation on its own page](./searchalicious-config-schema.html) + \ No newline at end of file diff --git a/docs/users/ref-settings.md b/docs/users/ref-settings.md new file mode 100644 index 00000000..a4489189 --- /dev/null +++ b/docs/users/ref-settings.md @@ -0,0 +1,8 @@ +# Reference for Settings + +You can find the [raw JSON schema here](./searchalicious-settings-schema.yml) + +[See Settings documentation on its own page](./searchalicious-settings-schema.html) + \ No newline at end of file diff --git a/docs/users/searchalicious-config-schema.yml b/docs/users/searchalicious-config-schema.yml new file mode 100644 index 00000000..f060e0f6 --- /dev/null +++ b/docs/users/searchalicious-config-schema.yml @@ -0,0 +1,2 @@ +# keep empty - This file will be replaced by generated schema +# at documentation generation time \ No newline at end of file diff --git a/scripts/Dockerfile.schema b/scripts/Dockerfile.schema new file mode 100644 index 00000000..5e8fce33 --- /dev/null +++ b/scripts/Dockerfile.schema @@ -0,0 +1,14 @@ +FROM python:3-slim + +ARG USER_UID=1000 +ARG USER_GID=1000 +USER root +# add user with the right id +RUN addgroup --gid $USER_GID user && adduser --uid $USER_UID --ingroup user --no-create-home --disabled-password --quiet user +# create folders +RUN mkdir -p /docs/in /docs/out && chown user:user /docs +# install some packages we need +RUN pip3 install -U pip && pip3 install json-schema-for-humans +CMD ["generate-schema-doc", "/docs/in/", "/docs/out/"] +WORKDIR /docs +USER user diff --git a/scripts/build_mkdocs.sh b/scripts/build_mkdocs.sh index f6690fbf..e4444867 100755 --- a/scripts/build_mkdocs.sh +++ b/scripts/build_mkdocs.sh @@ -12,4 +12,4 @@ docker build --build-arg "USER_UID=$UID" --build-arg "USER_GID=$GID" --tag 'mkdo docker run --rm \ -e USER_ID=$UID -e GROUP_ID=$GID \ -v $(pwd):/app -w /app \ - mkdocs-builder build + mkdocs-builder build \ No newline at end of file diff --git a/scripts/build_schema.sh b/scripts/build_schema.sh new file mode 100755 index 00000000..87ba91a2 --- /dev/null +++ b/scripts/build_schema.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# Build config documentation in markdown +# Use it before using mkdocs + +# Parameter is the schema type: config / settings +SCHEMA=$1 + +[[ -z $SCHEMA ]] && echo "You must provide a schema type: config / settings" && exit 1 + +set -e + +# get the group id to use it in the docker build +GID=$(id -g) + +# ensure dest dir +mkdir -p build/ref-$SCHEMA + +# create yaml +make generate-$SCHEMA-schema +# create image +docker build --build-arg "USER_UID=$UID" --build-arg "USER_GID=$GID" --tag 'json-schema-for-humans' -f
scripts/Dockerfile.schema . + +# use image to generate documentation +docker run --rm --user user \ + -v $(pwd)/scripts/schema-config.json:/docs/schema-config.json \ + -v $(pwd)/data/searchalicious-$SCHEMA-schema.yml:/docs/in/searchalicious-$SCHEMA-schema.yml \ + -v $(pwd)/build/ref-$SCHEMA:/docs/out \ + json-schema-for-humans \ + generate-schema-doc --config-file /docs/schema-config.json /docs/in/ /docs/out/ + +# copy to ref-$SCHEMA folder +mv build/ref-$SCHEMA/* gh_pages/users/ref-$SCHEMA/ +# also copy the source schema +cp data/searchalicious-$SCHEMA-schema.yml gh_pages/users/ref-$SCHEMA/ diff --git a/scripts/generate_doc.sh b/scripts/generate_doc.sh index 43c4db39..e41f9939 100755 --- a/scripts/generate_doc.sh +++ b/scripts/generate_doc.sh @@ -11,7 +11,9 @@ mkdir -p gh_pages echo "Build documentation with MkDocs" scripts/build_mkdocs.sh -# TODO: generating python and documentation with sphinx +echo "Generate documentation for configuration file and settings" +scripts/build_schema.sh config +scripts/build_schema.sh settings echo "Generate OpenAPI documentation" make generate-openapi diff --git a/scripts/schema-config.json b/scripts/schema-config.json new file mode 100644 index 00000000..fad7e013 --- /dev/null +++ b/scripts/schema-config.json @@ -0,0 +1,5 @@ +{ + "collapse_long_descriptions": false, + "examples_as_yaml": true, + "expand_buttons": true +} \ No newline at end of file
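With the pieces above in place, the schema reference pages can be regenerated locally.
A possible invocation, using only the targets and scripts introduced in this change
(each `build_schema.sh` run calls `make generate-<schema>-schema` itself before
rendering the pages with json-schema-for-humans):

```sh
# render the config and settings references into gh_pages/users/ref-<schema>/
scripts/build_schema.sh config
scripts/build_schema.sh settings
```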