diff --git a/.gitignore b/.gitignore index 426058d5..6135fa83 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,4 @@ MANIFEST* # Unit test / coverage reports .pytest_cache/ +logs/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fb3dbe6..161ed11f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,118 @@ # Changelog +## [5.0.0] + +### Added + +- Added a [quick start guide](https://spraakbanken.gu.se/sparv/#/user-manual/quick-start) in the documentation. +- Added importers for more file formats: docx and odt. +- Added support for [language + varieties](https://spraakbanken.gu.se/sparv/#/developers-guide/writing-sparv-plugins?id=languages-and-varieties). +- Re-introduced analyses for [Old Swedish and Swedish from the + 1800's](https://spraakbanken.gu.se/sparv/#/developers-guide/writing-sparv-plugins?id=languages-and-varieties). +- Added a more flexible stats export which lets you choose which annotations to include in the frequency list. +- Added installer for stats export. +- Added Stanza support for English. +- Added better install and uninstall instructions for plugins. +- Added support for [XML + namespaces](https://spraakbanken.gu.se/sparv/#/user-manual/corpus-configuration?id=xml-namespaces). +- Added explicit `ref` annotations (indexing tokens within sentences) for Stanza, Malt and Stanford. +- Added a `--reset` flag to the `sparv setup` command for resetting the data directory setting. +- Added a separate installer for installing scrambled CWB files. +- A warning message is printed when Sparv discovers source files that don't match the file extension in the corpus + config. +- An error message is shown if unknown exporters are listed under `export.default`. +- Allow source annotations named "not". +- Added a source filename annotator. +- Show an error message if user specifies an invalid installation. +- Added a `--stats` flag to several commands, showing a summary after completion of time spent per annotator. +- Added `stanza.max_token_length` option. +- Added Hunpos-backoff annotation for Stanza msd and pos. +- Added `--force` flag to `run-rule` and `create-file` commands to force recreation of the listed targets. +- Added a new exporter which produces a YAML file with info about the Sparv version and annotation date. + This info is also added to the combined XML exports. +- Exit with an error message if a required executable is missing. +- Show a warning if an installed plugin is incompatible with Sparv. +- Introduced compression of annotation files in sparv-workdir. The type of compression can be configured (or disabled) + by using the `sparv.compression` variable. `gzip` is used by default. +- Add flags `--rerun-incomplete` and `--mark-complete` to the `sparv run` command for handling incomplete output files. +- Several exporters now show a warning if a token annotation isn't included in the list of export annotations. +- Added `get_size()` to the `Annotation` and `AnnotationAllSourceFiles` classes, to get the size (number of values) + for an annotation. +- Added support for [individual progress bars for + annotators](https://spraakbanken.gu.se/sparv/#/developers-guide/writing-sparv-plugins?id=progress-bar). +- Added `SourceAnnotationsAllSourceFiles` class. + +### Changed + +- Significantly improved the CLI startup time. +- Replaced the `--verbose` flag with `--simple` and made verbose the default mode. +- Everything needed by Sparv modules (including `utils`) is now available through the `sparv.api` package. 
+- Empty corpus config files are treated as missing config files. +- Moved CWB corpus installer from `korp` module into `cwb` module. + This lead to some name changes of variables used in the corpus config: + - `korp.remote_cwb_datadir` is now called `cwb.remote_data_dir` + - `korp.remote_cwb_registry` is now called `cwb.remote_registry_dir` + - `korp.remote_host` has been split into `korp.remote_host` (host for SQL files) and `cwb.remote_host` (host for CWB + files) + - install target `korp:install_corpus` has been renamed and split into `cwb:install_corpus` and + `cwb:install_corpus_scrambled` +- Renamed the following stats exports: + `stats_export:freq_list` is now called `stats_export:sbx_freq_list` + `stats_export:freq_list_simple` is now called `stats_export:sbx_freq_list_simple` + `stats_export:install_freq_list` is now called `stats_export:install_sbx_freq_list` + `stats_export:freq_list_fsv` is now called `stats_export:sbx_freq_list_fsv` +- Now incrementally compresses bz2 files in compressed XML export to avoid memory problems with large files. +- Corpus source files are now called "source files" instead of "documents". Consequently, the `--doc/-d` flag has been + renamed to `--file/-f`. +- `import.document_annotation` has been renamed to `import.text_annotation`, and all references to "document" as a text + unit have been changed to "text". +- Minimum Python version is now 3.6.2. +- Removed Python 2 dependency for hfst-SweNER. +- Tweaked compound analysis to make it less slow and added option to disable using source text as lexicon. +- `cwb` module now exports to regular export directory instead of CWB's own directories. +- Removed ability to use absolute path for exports. +- Renamed the installer `xml_export:install_original` to `xml_export:install`. The configuration variables + `xml_export.export_original_host` and `xml_export.export_original_path` have been changed to + `xml_export.export_host` and `xml_export.export_path` respectively. The configuration variables for the scrambled + installer has been changed from `xml_export.export_host` and `xml_export.export_path` to + `xml_export.export_scrambled_host` and `xml_export.export_scrambled_path` respectively. +- Removed `header_annotations` configuration variable from `export` (it is still available as + `xml_export.header_annotations`). +- All export files must now be written to subdirectories, and each subdirectory must use the exporter's module name as + prefix (or be equal to the module name). +- Empty attributes are no longer included in the csv export. +- When Sparv crashes due to unexpected errors, the traceback is now hidden from the user unless the `--log debug` + argument is used. +- If the `-j`/`--cores` option is used without an argument, all available CPU cores are used. +- Importers are now required to write a source structure file. +- CWB installation now also works locally. + +### Fixed + +- Fixed rule ambiguity problems (functions with an order higher than 1 were not accessible). +- Automatically download correct Hunpos model depending on the Hunpos version installed. +- Stanza can now handle tokens containing whitespaces. +- Fixed a bug which lead to computing the source file list multiple times. +- Fixed a few date related crashes in the `cwb` module. +- Fixed installation of compressed, scrambled XML export. +- Fixed bug in PunctuationTokenizer leading to orphaned tokens. +- Fixed crash when scrambling nested spans by only scrambling the outermost ones. 
+- Fixed crash in xml_import when no elements are imported. +- Fixed crash on empty sentences in Stanza. +- Better handling of empty XML elements in XML export. +- Faulty custom modules now result in a warning instead of a crash. +- Notify user when SweNER crashes. +- Fixed crash when config file can't be read due to file permissions. +- Fixed bug where `geo:contextual` would only work for sentences. +- Fixed crash on systems with encodings other than UTF-8. + ## [4.1.1] - 2021-09-20 ### Fixed - Workaround for bug in some versions of Python 3.8 and 3.9. -- Fixed bugs in segmenter module. +- Fixed bugs in `segmenter` module. ## [4.1.0] - 2021-04-14 diff --git a/README.md b/README.md index 0bda9e1d..4c7eedb7 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ If you have any questions, problems or suggestions please contact ")`. +An instance of this class represents a regular annotation tied to one source file. This class is used when an +annotation is needed as input for a function, e.g. `Annotation("")`. **Arguments:** - `name`: The name of the annotation. -- `doc`: The name of the document. +- `source_file`: The name of the source file. +- `is_input`: If set to `False` the annotation won't be added to the rule's input. Default: `True` **Properties:** @@ -35,7 +36,7 @@ needed as input for a function, e.g. `Annotation("")`. - `get_children(child: BaseAnnotation, orphan_alert=False, preserve_parent_annotation_order=False)`: Return two lists. The first one is a list with n (= total number of parents) elements where every element is a list of indices in the child annotation. The second one is a list of orphans, i.e. containing indices in the child annotation that have no - parent. Both parents and children are sorted according to their position in the source document, unless + parent. Both parents and children are sorted according to their position in the source file, unless preserve_parent_annotation_order is set to True, in which case the parents keep the order from the parent annotation. - `get_parents(parent: BaseAnnotation, orphan_alert: bool = False)`: Return a list with n (= total number of children) @@ -46,11 +47,12 @@ needed as input for a function, e.g. `Annotation("")`. False, allow_newlines: bool = False)`: Yield tuples of multiple attributes on the same annotation. - `read_spans(decimals=False, with_annotation_name=False)`: Yield the spans of the annotation. - `create_empty_attribute()`: Return a list filled with None of the same size as this annotation. +- `get_size()`: Get the number of values. -## AnnotationAllDocs -Regular annotation but the document must be specified for all actions. Use as input to an annotator function to require -the specificed annotation for every document in the corpus. +## AnnotationAllSourceFiles +Regular annotation but the source filename must be specified for all actions. Use as input to an annotator function to +require the specificed annotation for every source file in the corpus. **Arguments:** @@ -64,16 +66,17 @@ the specificed annotation for every document in the corpus. **Methods:** - `split()`: Split name into annotation name and attribute. -- `read(doc: str)`: Yield each line from the annotation. -- `read_spans(doc: str, decimals=False, with_annotation_name=False)`: Yield the spans of the annotation. -- `create_empty_attribute(doc: str)`: Return a list filled with None of the same size as this annotation. -- `exists(doc: str)`: Return True if annotation file exists. +- `read(source_file: str)`: Yield each line from the annotation. 
+- `read_spans(source_file: str, decimals=False, with_annotation_name=False)`: Yield the spans of the annotation. +- `create_empty_attribute(source_file: str)`: Return a list filled with None of the same size as this annotation. +- `exists(source_file: str)`: Return True if annotation file exists. +- `get_size(source_file: str)`: Get the number of values. ## AnnotationCommonData Like [`AnnotationData`](#annotationdata), an instance of this class represents an annotation with arbitrary data, but `AnnotationCommonData` is used for data that applies to the whole corpus (i.e. data that is not specific to one source -document). +file). **Arguments:** @@ -96,7 +99,7 @@ This class represents an annotation holding arbitrary data, i.e. data that is no **Arguments:** - `name`: The name of the annotation. -- `doc`: The name of the document. +- `source_file`: The name of the source file. **Properties:** @@ -108,13 +111,13 @@ This class represents an annotation holding arbitrary data, i.e. data that is no - `split()`: Split name into annotation name and attribute. - `exists()`: Return True if annotation file exists. -- `read(doc: Optional[str] = None)`: Read arbitrary string data from annotation file. +- `read(source_file: Optional[str] = None)`: Read arbitrary string data from annotation file. -## AnnotationDataAllDocs -Like [`AnnotationData`](#annotationdata), this class is used for annotations holding arbitrary data but the document -must be specified for all actions. Use as input to an annotator to require the specificed annotation for every document -in the corpus. +## AnnotationDataAllSourceFiles +Like [`AnnotationData`](#annotationdata), this class is used for annotations holding arbitrary data but the source file +must be specified for all actions. Use as input to an annotator to require the specificed annotation for every source +file in the corpus. **Arguments:** @@ -129,7 +132,7 @@ in the corpus. - `split()`: Split name into annotation name and attribute. - `exists()`: Return True if annotation file exists. -- `read(doc: Optional[str] = None)`: Read arbitrary string data from annotation file. +- `read(source_file: Optional[str] = None)`: Read arbitrary string data from annotation file. ## Binary @@ -138,7 +141,7 @@ Sparv data directory. This class is often used to define a prerequisite for an a **Arguments:** -- default argument: Path to binary executable. +- Path to binary executable. ## BinaryDir @@ -147,7 +150,7 @@ the `bin` path inside the Sparv data directory. **Arguments:** -- default argument: Path to directory containing executable binaries. +- Path to directory containing executable binaries. ## Config @@ -164,8 +167,8 @@ An instance of this class holds a configuration key name and its default value. An instance of this class holds the name (ID) of the corpus. -## Document -An instance of this class holds the name of a source document. +## SourceFilename +An instance of this class holds the name of a source file. ## Export @@ -173,12 +176,26 @@ An instance of this class represents an export file. This class is used to defin **Arguments:** -- `name`: The export directory and filename pattern (e.g. `"xml_pretty/[xml_export.filename]"`). -- `absolute_path`: Set to `True` if the path is absolute. Default: `False` +- The export directory and filename pattern (e.g. `"xml_export.pretty/[xml_export.filename]"`). The export directory + must contain the module name as a prefix, or be equal to the module name. ## ExportAnnotations -List of annotations to be included in the export. 
This list is defined in the corpus configuration. +List of annotations to be included in the export. This list is defined in the corpus configuration. Annotation files +for the current source file will automatically be added as dependencies when using this class, unless `is_input` is set +to `False`. + +**Arguments:** + +- `config_name`: The config variable pointing out what annotations to include. +- `is_input`: If set to `False` the annotations won't be added to the rule's input. Default: `True` + + +## ExportAnnotationsAllSourceFiles +List of annotations to be included in the export. This list is defined in the corpus configuration. Annotation files +for _all_ source files will automatically be added as dependencies when using this class, unless `is_input` is set to +`False`. With `is_input` set to `False`, there is no difference between using `ExportAnnotationsAllSourceFiles` and +`ExportAnnotations`. **Arguments:** @@ -192,23 +209,22 @@ function. **Arguments:** -- `val`: The export directory and filename pattern (e.g. `"xml_pretty/[xml_export.filename]"`). -- `all_docs`: Set to `True` to get the export for all source documents. Default: `False` -- `absolute_path`: Set to `True` if the path is absolute. Default: `False` +- `val`: The export directory and filename pattern (e.g. `"xml_export.pretty/[xml_export.filename]"`). +- `all_files`: Set to `True` to get the export for all source files. Default: `False` ## Headers -List of header annotation names for a given document. +List of header annotation names for a given source file. **Arguments:** -- default argument: The name of the document. +- The name of the source file. **Methods:** - `read()`: Read the headers file and return a list of header annotation names. - `write(header_annotations: List[str])`: Write headers file. -- `exists()`: Return True if headers file exists for this document. +- `exists()`: Return True if headers file exists for this source file. ## Language @@ -267,19 +283,19 @@ Regular annotation or attribute used as output (e.g. of an annotator function). - `name`: The name of the annotation. - `cls`: The annotation class of the output. - `description`: An optional description. -- `doc`: The name of the document. +- `source_file`: The name of the source file. **Methods:** - `split()`: Split name into annotation name and attribute. -- `write(values, append: bool = False, allow_newlines: bool = False, doc: Optional[str] = None)`: Write an annotation to - file. Existing annotation will be overwritten. 'values' should be a list of values. +- `write(values, append: bool = False, allow_newlines: bool = False, source_file: Optional[str] = None)`: Write an + annotation to file. Existing annotation will be overwritten. 'values' should be a list of values. - `exists()`: Return True if annotation file exists. -## OutputAllDocs -Similar to [`Output`](#output) this class represents a regular annotation or attribute used as output, but the document -must be specified for all actions. +## OutputAllSourceFiles +Similar to [`Output`](#output) this class represents a regular annotation or attribute used as output, but the source +file must be specified for all actions. **Arguments**: - `name`: The name of the annotation. @@ -289,9 +305,9 @@ must be specified for all actions. **Methods:** - `split()`: Split name into annotation name and attribute. -- `write(values, doc: str, append: bool = False, allow_newlines: bool = False)`: Write an annotation to file. Existing - annotation will be overwritten. 'values' should be a list of values. 
-- `exists(doc: str)`: Return True if annotation file exists. +- `write(values, source_file: str, append: bool = False, allow_newlines: bool = False)`: Write an annotation to file. + Existing annotation will be overwritten. 'values' should be a list of values. +- `exists(source_file: str)`: Return True if annotation file exists. ## OutputCommonData @@ -316,7 +332,7 @@ is used as output. - `name`: The name of the annotation. - `cls`: The annotation class of the output. - `description`: An optional description. -- `doc`: The name of the document. +- `source_file`: The name of the source file. **Methods:** @@ -325,9 +341,9 @@ is used as output. - `exists()`: Return True if annotation file exists. -## OutputDataAllDocs +## OutputDataAllSourceFiles Like [`OutputData`](#outputdata), this class is used for annotations holding arbitrary data and that is used as output, -but the document must be specified for all actions. +but the source file must be specified for all actions. **Arguments**: - `name`: The name of the annotation. @@ -337,8 +353,8 @@ but the document must be specified for all actions. **Methods:** - `split()`: Split name into annotation name and attribute. -- `write(value, doc: str, append: bool = False)`: Write arbitrary corpus level string data to annotation file. -- `exists(doc: str)`: Return True if annotation file exists. +- `write(value, source_file: str, append: bool = False)`: Write arbitrary corpus level string data to annotation file. +- `exists(source_file: str)`: Return True if annotation file exists. ## Source @@ -346,11 +362,11 @@ An instance of this class holds a path to the directory containing input files. **Arguments:** -- default argument: Path to directory containing input files. +- Path to directory containing input files. **Methods:** -- `get_path(doc: Document, extension: str)`: Get path to a specific source file. +- `get_path(source_file: SourceFilename, extension: str)`: Get path to a specific source file. ## SourceAnnotations @@ -359,20 +375,29 @@ List of source annotations to be included in the export. This list is defined in **Arguments:** - `config_name`: The config variable pointing out what source annotations to include. -- `is_input`: If set to `False` the annotations won't be added to the rule's input. Default: `True` + + +## SourceAnnotationsAllSourceFiles +List of source annotations to be included in the export. This list is defined in the corpus configuration. This +differs from `SourceAnnotations` in that the source annotations structure file (created by using `SourceStructure`) +of _every_ source file will be added as dependencies. + +**Arguments:** + +- `config_name`: The config variable pointing out what source annotations to include. ## SourceStructure -Every annotation available in a source document. +Every annotation name available in a source file. **Arguments:** -- default argument: The name of the document. +- The name of the source file. **Methods:** - `read()`: Read structure file. -- `write(structure)`: Sort the document's structural elements and write structure file. +- `write(structure)`: Sort the source file's annotation names and write to structure file. ## SourceStructureParser @@ -393,7 +418,7 @@ An instance of this class represents the corpus text. **Arguments:** -- `doc`: The name of the document. +- `source_file`: The name of the source file. **Methods:** @@ -402,7 +427,8 @@ An instance of this class represents the corpus text. ## Wildcard -An instance of this class holds wildcard information. 
It is typically used in the `wildcards` list passed as an argument to the [`@annotator` decorator](developers-guide/sparv-decorators.md#annotator), e.g.: +An instance of this class holds wildcard information. It is typically used in the `wildcards` list passed as an argument +to the [`@annotator` decorator](developers-guide/sparv-decorators.md#annotator), e.g.: ```python @annotator("Number {annotation} by relative position within {parent}", wildcards=[ Wildcard("annotation", Wildcard.ANNOTATION), diff --git a/docs/developers-guide/sparv-decorators.md b/docs/developers-guide/sparv-decorators.md index 1302d282..4ba9e12a 100644 --- a/docs/developers-guide/sparv-decorators.md +++ b/docs/developers-guide/sparv-decorators.md @@ -9,7 +9,7 @@ other arguments are optional and default to `None`. ## @annotator A function decorated with `@annotator` usually takes some input (e.g. models, one or more arbitrary annotations like -tokens, sentences, parts of speeches etc) and outputs one or more new annotations. +tokens, sentences, parts of speeches etc.) and outputs one or more new annotations. **Arguments:** @@ -49,9 +49,15 @@ def annotate(lang: Language = Language(), ## @importer A function decorated with `@importer` is used for importing corpus files in a certain file format. Its job is to read a corpus file, extract the corpus text and existing markup (if applicable), and write annotation files for the corpus text -and markup. The corpus text output is implicit for importers and thus not listed among the function arguments. Any -additional outputs may be listed in the `outputs` argument of the decorator. This is necessary in case any output is -needed as input in another part of the pipeline. +and markup. + +Importers do not use the `Output` class to specify its outputs. Instead, outputs may be listed using the `outputs` +argument of the decorator. Any output that is to be used as explicit input by another part of the pipeline needs to be +listed here, but additional unlisted outputs may also be created. + +Two outputs are implicit (and thus not listed in `outputs`) but required for every importer: the corpus text, saved by +using the `Text` class, and a list of the annotations created from existing markup, saved by using the +`SourceStructure` class. **Arguments:** @@ -66,10 +72,10 @@ needed as input in another part of the pipeline. **Example:** ```python @importer("TXT import", file_extension="txt", outputs=["text"]) -def parse(doc: Document = Document(), +def parse(source_file: SourceFilename = SourceFilename(), source_dir: Source = Source(), prefix: str = "", - encoding: str = util.UTF8, + encoding: str = util.constants.UTF8, normalize: str = "NFC") -> None: ... ``` @@ -97,11 +103,11 @@ files into one output file. 
Config("stats_export.cutoff", default=1, description="The minimum frequency a word must have in order to be included in the result") ]) def freq_list_simple(corpus: Corpus = Corpus(), - docs: AllDocuments = AllDocuments(), - word: AnnotationAllDocs = AnnotationAllDocs(""), - pos: AnnotationAllDocs = AnnotationAllDocs(""), - baseform: AnnotationAllDocs = AnnotationAllDocs(""), - out: Export = Export("frequency_list/stats_[metadata.id].csv"), + source_files: AllSourceFilenames = AllSourceFilenames(), + word: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + pos: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + baseform: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + out: Export = Export("stats_export.frequency_list/stats_[metadata.id].csv"), delimiter: str = Config("stats_export.delimiter"), cutoff: int = Config("stats_export.cutoff")): ... @@ -119,15 +125,15 @@ A function decorated with `@installer` is used to copy a corpus export to a remo **Example:** ```python -@installer("Copy compressed scrambled XML to remote host", config=[ - Config("xml_export.export_host", "", description="Remote host to copy scrambled XML export to"), - Config("xml_export.export_path", "", description="Path on remote host to copy scrambled XML export to") +@installer("Copy compressed XML to remote host", config=[ + Config("xml_export.export_host", "", description="Remote host to copy XML export to."), + Config("xml_export.export_path", "", description="Path on remote host to copy XML export to.") ]) -def install_scrambled(corpus: Corpus = Corpus(), - xmlfile: ExportInput = ExportInput("[metadata.id]_scrambled.xml"), - out: OutputCommonData = OutputCommonData("xml_export.install_export_scrambled_marker"), - export_path: str = Config("xml_export.export_path"), - host: str = Config("xml_export.export_host")): +def install(corpus: Corpus = Corpus(), + xmlfile: ExportInput = ExportInput("xml_export.combined/[metadata.id].xml.bz2"), + out: OutputCommonData = OutputCommonData("xml_export.install_export_pretty_marker"), + export_path: str = Config("xml_export.export_path"), + host: str = Config("xml_export.export_host")): ... ``` diff --git a/docs/developers-guide/utilities.md b/docs/developers-guide/utilities.md index a1147169..ce6e3c78 100644 --- a/docs/developers-guide/utilities.md +++ b/docs/developers-guide/utilities.md @@ -1,13 +1,14 @@ # Utilities -Sparv has a number of utility functions, classes and constants that are not specific to any particular module. They are -imported from `sparv.util`, e.g.: +Sparv has a number of utility functions, classes and constants that are not specific to any particular module. +Most of them are imported from `sparv.api.util` and its submodules, e.g.: ```python -from sparv.util import UTF8 +from sparv.api.util.system import call_binary ``` ## Constants +`sparv.api.util.constants` contains the following constants: - `DELIM = "|"` Delimiter char to put between ambiguous results @@ -32,7 +33,7 @@ from sparv.util import UTF8 ## Export Utils -Util functions used for preparing data for export. +`sparv.api.util.export` provides util functions used for preparing data for export. ### gather_annotations() Calculate the span hierarchy and the annotation_dict containing all annotation elements and attributes. Returns a @@ -43,7 +44,7 @@ Calculate the span hierarchy and the annotation_dict containing all annotation e - `annotations`: A list of annotations to include. - `export_names`: Dictionary that maps from annotation names to export names. 
- `header_annotations`: A list of header annotations. -- `doc`: The document name. +- `source_file`: The source filename. - `flatten`: Whether to return the spans as a flat list. Default: `True` - `split_overlaps`: Whether to split up overlapping spans. Default: `False` @@ -54,10 +55,10 @@ Get a list of annotations, token attributes and a dictionary with translations f **Arguments:** - `annotations`: List of elements:attributes (annotations) to include. -- `source_annotations`: List of elements:attributes from the original document to include. If not specified, everything +- `source_annotations`: List of elements:attributes from the source file to include. If not specified, everything will be included. -- `doc`: Name of the source document. -- `docs`: List of names of source documents (alternative to `doc`). +- `source_file`: Name of the source file. +- `source_files`: List of names of source files (alternative to `source_file`). - `token_name`: Name of the token annotation. - `remove_namespaces`: Remove all namespaces in export_names unless names are ambiguous. Default: `False` - `keep_struct_names`: For structural attributes (anything other than token), include the annotation base name @@ -71,10 +72,10 @@ Get a list of header annotations and a dictionary for renamed annotations. **Arguments:** -- `header_annotation_names`: List of header elements:attributes from the original document to include. If not specified, +- `header_annotation_names`: List of header elements:attributes from the source file to include. If not specified, everything will be included. -- `doc`: Name of the source document. -- `docs`: List of names of source documents (alternative to `doc`). +- `source_file`: Name of the source file. +- `source_files`: List of names of source files (alternative to `source_file`). ### scramble_spans() @@ -88,7 +89,7 @@ Reorder chunks according to `chunk_order` and open/close tags in the correct ord ## Install Utils -Util functions used for installing corpora onto remote servers. +`sparv.api.util.install` provides util functions used for installing corpora onto remote locations. ### install_directory() @@ -106,8 +107,8 @@ Rsync a file to a target host. **Arguments:** -- `host`: The remote host to install to. - `local_file`: Path to the local file to sync. +- `host`: The remote host to install to. - `remote_file`: The name of the resulting file on the remote host. @@ -132,7 +133,7 @@ Copy selected tables (including data) from local to remote MySQL database. ## System Utils -Util functions related to staring and stopping processes, creating directories etc. +`sparv.api.util.system` provides functions related to starting and stopping processes, creating directories etc. ### call_binary() @@ -215,9 +216,9 @@ deleted. ## Tagsets -Functions and objects related to tagset conversions. +`sparv.api.util.tagsets` is a subpackage with modules containing functions and objects related to tagset conversions. -### tagsets.join_tag() +### tagmappings.join_tag() Convert a complex SUC or SALDO tag record into a string. **Arguments:** @@ -226,13 +227,13 @@ Convert a complex SUC or SALDO tag record into a string. - `sep`: The separator to be used. Default: "." -### tagsets.mappings +### tagmappings.mappings Dictionary containing mappings (dictionaries) for of part-of-speech tag mappings between different tag sets. -### tagsets.pos_to_upos() -Map POS tags to Universal Depenendy POS tags. This only works if there is a conversion function in `util.pos_to_upos` -for the given language and tagset. 
+### pos_to_upos() +Map POS tags to Universal Depenendy POS tags. This only works if there is a conversion function in +`util.tagsets.pos_to_upos` for the given language and tagset. **Arguments:** @@ -241,7 +242,7 @@ for the given language and tagset. - `tagset`: The name of the tagset that `pos` belongs to. -### tagsets.split_tag() +### tagmappings.split_tag() Split a SUC or Saldo tag string ('X.Y.Z') into a tuple ('X', 'Y.Z') where 'X' is a part of speech and 'Y', 'Z' etc. are morphologic features (i.e. MSD tags). @@ -251,7 +252,7 @@ morphologic features (i.e. MSD tags). - `sep`: The separator to split on. Default: "." -### tagsets.suc_to_feats() +### suc_to_feats() Convert SUC MSD tags into UCoNNL feature list (universal morphological features). Returns a list of universal features. **Arguments:** @@ -261,12 +262,12 @@ Convert SUC MSD tags into UCoNNL feature list (universal morphological features) - `delim`: The delimiter separating the features in `msd`. Default: "." -### tagsets.tags +### tagmappings.tags Dictionary containing sets of part-of-speech tags. ## Miscellaneous Utils -Miscellaneous utils functions. +`sparv.api.util.misc` provides miscellaneous util functions. @@ -285,14 +286,6 @@ Take an iterable object and return a set in the format used by Corpus Workbench. - `encoding`: Encoding of `values`. Default: "UTF-8" -### get_logger() -Get a logger that is a child of `sparv.modules`. - -**Arguments:** - -- `name`: The name of the current module (usually `__name__`) - - ### indent_xml() Add pretty-print indentation to an XML tree. @@ -323,7 +316,7 @@ Class for reading basic pickled lexicon and looking up keys. **Arguments:** -- default argument: A `pathlib.Path` or `Model` object pointing to a pickled lexicon. +- `picklefile`: A `pathlib.Path` or `Model` object pointing to a pickled lexicon. - `verbose`: Logs status updates upon reading the lexicon if set to `True`. Default: `True` **Methods:** @@ -360,8 +353,27 @@ Turn a set string into a list. - `affix`: Character that `setstring` starts and ends with. that Default: "|" +### test_lexicon() +Test the validity of a lexicon. Takes a dictionary (lexicon) and a list of test words that are expected to occur as keys +in the lexicon. Prints the value for each test word. + +**Arguments:** + +- `lexicon`: A dictionary. +- `testwords`: An iterable containing strings that are expected to occur as keys in `lexicon`. + + +## Error Messages and Logging +The `SparvErrorMessage` exception and `get_logger` function are integral parts of the Sparv pipeline, and unlike other +utilities on this page, they are found directly under `sparv.api`. + + ### SparvErrorMessage -Exception (class) used to notify users of errors in a friendly way without displaying traceback. +Exception (class) used to notify users of errors in a friendly way without displaying traceback. Its usage is described +in the [Writing Sparv Plugins](developers-guide/writing-sparv-plugins#error-messages) section. + +> [!NOTE] +> Only the `message` argument should be used when raisning this exception in a Sparv module. **Arguments:** @@ -370,11 +382,10 @@ Exception (class) used to notify users of errors in a friendly way without displ - `function`: Name of the function where the error occurred (optional, not used in Sparv modules). Default: "" -### test_lexicon() -Test the validity of a lexicon. Takes a dictionary (lexicon) and a list of test words that are expected to occur as keys -in the lexicon. Prints the value for each test word. 
+### get_logger() +Get a logger that is a child of `sparv.modules`. Its usage is described in the +[Writing Sparv Plugins](developers-guide/writing-sparv-plugins#logging) section. **Arguments:** -- `lexicon`: A dictionary. -- `testwords`: An iterable containing strings that are expected to occur as keys in `lexicon`. +- `name`: The name of the current module (usually `__name__`) diff --git a/docs/developers-guide/writing-sparv-plugins.md b/docs/developers-guide/writing-sparv-plugins.md index 2cf76b4e..b642afdf 100644 --- a/docs/developers-guide/writing-sparv-plugins.md +++ b/docs/developers-guide/writing-sparv-plugins.md @@ -4,6 +4,11 @@ shipped with the main Sparv package none of these modules are hard-coded into th easily be extended with plugins. A plugin is a Sparv module that is not part of the main Sparv package. Writing a plugin is the recommended way of adding a new module to Sparv. +> [!NOTE] When writing a plugin please always prefix your Python package with a namespace followed by an underscore to +> mark which organisation or developer the plugin belongs to. This is necessary to avoid clashes in package names and +> obligatory plugin namespaces will be enforced in the future. In the example below we used the prefix "sbx_" (for +> Språkbanken Text). + When writing your first plugin we recommend that you take a look at the [Sparv plugin template](https://github.com/spraakbanken/sparv-plugin-template). The template contains an example of a small annotation module that converts tokens to uppercase. We will use this template in the examples below. @@ -12,8 +17,8 @@ module that converts tokens to uppercase. We will use this template in the examp ## Plugin Structure This is what a typical structure of a plugin may look like: ``` -sparv-uppercase/ -├── uppercase +sparv-sbx-uppercase/ +├── sbx_uppercase │ ├── uppercase.py │ └── __init__.py ├── LICENSE @@ -21,20 +26,12 @@ sparv-uppercase/ └── setup.py ``` -In the above example the `uppercase` directory is a Sparv module containing the [module code](#module-code) in +In the above example the `sbx_uppercase` directory is a Sparv module containing the [module code](#module-code) in `uppercase.py` and the mandatory [init file](#init-file) `__init__.py`. The [setup file](#setup-file) `setup.py` in the -root directory is needed in order to install the plugin. A plugin does not have to be stored in any particular place. As -long as the Sparv Pipeline is installed on your machine, you should be able to inject your plugin into the Sparv -Pipeline code using pipx (from the directory containing your plugin): -```bash -pipx inject sparv-pipeline ./sparv-uppercase -``` - -After the injection the plugin functionality should be available, and the plugged-in module should be treated just like -any other module within the Sparv Pipeline. +root directory is needed in order to install the plugin. -The readme and license files are not strictly recommended are not strictly necessary for the plugin to work but we -strongly recommend that you include these if you want to publish your plugin. +The readme and license files are not strictly necessary for the plugin to work but we strongly recommend that you +include these if you want to publish your plugin. 
## Setup File @@ -44,20 +41,22 @@ a setup file (taken from the [Sparv plugin template](https://github.com/spraakba import setuptools setuptools.setup( - name="uppercase", + name="sparv-sbx-uppercase", version="0.1", description="Uppercase converter (example plug-in for Sparv)", license="MIT", - packages=["uppercase"], + packages=["sbx_uppercase"], python_requires=">=3.6", - install_requires=["sparv-pipeline>=4"], - entry_points={"sparv.plugin": ["uppercase = uppercase"]} + install_requires=["sparv-pipeline>=4,<5"], + entry_points={"sparv.plugin": ["sbx_uppercase = sbx_uppercase"]} ) ``` Make sure to include the name of your module (i.e. the directory containing the Sparv code) in `packages`. You also need to make sure that there is a `sparv.plugin` entry point in `entry_points` that points to your module. +We strongly encourage you to also include the fields `author` and `author_email`. + For more information about Python setup scripts check the [distutils documentation](https://docs.python.org/3/distutils/setupscript.html). @@ -75,7 +74,7 @@ Example of an `__init__.py` file: ```python """Example for a Sparv annotator that converts tokens to uppercase.""" -# from sparv import Config +# from sparv.api import Config from . import uppercase @@ -94,19 +93,19 @@ describing dependencies to other entities (e.g. annotations or models) handled o code for or uppercase example (taken from the [Sparv plugin template](https://github.com/spraakbanken/sparv-plugin-template): ```python -from sparv import Annotation, Output, annotator +from sparv.api import Annotation, Output, annotator @annotator("Convert every word to uppercase.") def uppercase(word: Annotation = Annotation(""), - out: Output = Output(":uppercase.upper")): + out: Output = Output(":sbx_uppercase.upper")): """Convert to uppercase.""" out.write([val.upper() for val in word.read()]) ``` In this script we import two classes from Sparv (`Annotation` and `Output`) and the `annotator` decorator. Please note -that nothing should be imported from the Sparv code unless it is directly available from the sparv package (i.e. `from -sparv import ...`), or the `sparv.util` sub-package. Any other sub-packages (like `sparv.core`) are for internal use -only, and are subject to change without notice. +that nothing should be imported from the Sparv code unless it is directly available from the sparv.api package (i.e. +`from sparv.api import ...`). Any other sub-packages (like `sparv.core`) are for internal use only, and are subject +to change without notice. Our `uppercase` function is decorated with `@annotator` which tells Sparv that this function can be used to produce one or more annotations. The first argument in the decorator is its description which is used for displaying help texts in @@ -117,7 +116,7 @@ hints to the Sparv classes `Annotation` and `Output` which indicate what depende config variables) must be satisfied before the function can do its job, and what it will produce. In this example Sparv will make sure that a word annotation exists before it will attempt to call the `uppercase` function, because it knows that `word` is an input since it is of type `Annotation`. It also knows that the function produces the output annotation -`:uppercase.upper`, so if any other module would request this annotation as input, it will run `uppercase` +`:sbx_uppercase.upper`, so if any other module would request this annotation as input, it will run `uppercase` prior to calling that module. 
A function decorated with a Sparv decorator should never be actively called by you or by another decorated function. @@ -139,8 +138,8 @@ Logging from Sparv modules is done with [Python's logging library](https://docs. Please use the provided `get_logger` wrapper when declaring your logger which takes care of importing the logging library and sets the correct module name in the log output: ```python -import sparv.util as util -logger = util.get_logger(__name__) +from sparv.api import get_logger +logger = get_logger(__name__) logger.error("An error was encountered!") ``` @@ -151,6 +150,36 @@ with the flag `--log [LOGLEVEL]`. Most commands support this flag. The user can file by using the `--log-to-file [LOGLEVEL]` flag. The log file will receive the current date and timestamp as filename and can be found inside `logs/` in the corpus directory. +### Progress bar +It is possible to add a progress bar for individual annotators by using the custom `progress()` logging method. To +initialize the progress bar, call the `logger.progress()` method, either without an argument, or while supplying the +total for the bar: `logger.progress(total=50)`. A progress bar initialized without a total will have to be provided with +a total before it can be used. It is also possible to change the total later. + +After the total has been set, call `progress()` again to update the progress. If not argument is supplied, the progress +is advanced by 1. To advance by another amount, use the keyword argument `advance=`. To set the progress to a specific +number, simply call the method with that number as the argument. See below for examples: + +```python +from sparv.api import get_logger +logger = get_logger(__name__) + +# Initialize progress bar with no known total +logger.progress() + +# Initialize bar with known total +logger.progress(total=50) + +# Advance progress by 1 +logger.progress() + +# Advance progress by 2 +logger.progress(advance=2) + +# Set progress to 5 +logger.progress(5) +``` + ## Error Messages When raising critical errors that should be displayed to the user (e.g. to tell the user that he/she did something @@ -158,17 +187,78 @@ wrong) you should use the [SparvErrorMessage class](developers-guide/utilities#S exception (and thus stop the current Sparv process) and notify the user of errors in a friendly way without displaying the usual Python traceback. ```python +from sparv.api import SparvErrorMessage + @annotator("Convert every word to uppercase") def uppercase(word: Annotation = Annotation(""), - out: Output = Output(":uppercase.upper"), - important_config_variable: str = Config("uppercase.some_setting")): + out: Output = Output(":sbx_uppercase.upper"), + important_config_variable: str = Config("sbx_uppercase.some_setting")): """Convert to uppercase.""" # Make sure important_config_variable is set by the user if not important_config_variable: - raise util.SparvErrorMessage("Please make sure to set the config variable 'uppercase.some_setting'!") + raise SparvErrorMessage("Please make sure to set the config variable 'sbx_uppercase.some_setting'!") ... ``` +## Languages and varieties +It is possible to restrict the use of an annotator, exporter, installer or modelbuilder to one or more specific +language(s). This is done by passing a list of ISO 639-3 language codes to the optional `language` parameter in the +decorator: +```python +@annotator("Convert every word to uppercase", language=["swe", "eng"]) +def ... 
+``` + +Sparv functions are only available for use if one of their languages match the language in the [corpus config +file](user-manual/corpus-configuration.md). If no language codes are provided in the decorator, the function is +available for any corpus. + +Sparv also supports language varieties which is useful when you want to write Sparv functions for a specific variety of +a language. For instance, Sparv has some built-in annotators that are restricted to corpora with historical Swedish from +the 1800's. They are marked with the language code `swe-1800`, where `swe` is the ISO 639-3 code for Swedish and `1800` +is an arbitrary string for this specific language variety. Sparv functions marked with `swe-1800` are available for +corpora that are configured as follows: +```yaml +metadata: + language: "swe" + variety: "1800" +``` +Note that all functions marked with `swe` will also be available for these corpora. + + +## Installing and Uninstalling Plugins + +A Sparv plugin can be installed from the [Python Package Index (PyPI)](https://pypi.org/), a remote public repository, +or from a local directory stored anywhere on your machine. As long as the Sparv Pipeline is installed on your machine, +you should be able to inject your plugin into the Sparv Pipeline code using pipx: +``` +pipx inject sparv-pipeline [pointer-to-sparv-plugin] +``` + +So if you are trying to install the `sparv-sbx-uppercase` plugin and it exists on PyPI you can install it like this: +``` +pipx inject sparv-pipeline sparv-sbx-uppercase +``` + +For installing it from a public repository from GitHub the install command looks something like this: +``` +pipx inject sparv-pipeline https://github.com/spraakbanken/sparv-plugin-template/archive/main.zip +``` + +For installation from a local directory run this (from the directory containing your plugin): +``` +pipx inject sparv-pipeline ./sparv-sbx-uppercase +``` + +After the injection the plugin functionality should be available, and the plugged-in module should be treated just like +any other module within the Sparv Pipeline. + +You can uninstall the plugin by running: +``` +pipx runpip sparv-pipeline uninstall [name-of-sparv-plugin] +``` +In this example `[name-of-sparv-plugin]` is `sparv-sbx-uppercase`. + ## Advanced Features This section contains documentation for more advanced features which may be used but are not necessary for writing @@ -191,7 +281,7 @@ def annotate( ... -@annotator("Create foo annotation for when bar is not available", order=2) +@annotator("Create foo annotation when bar is not available", order=2) def annotate_backoff( out: Output = Output("mymodule.foo")): ... @@ -210,7 +300,7 @@ A preload function is simply a function that takes a subset of the arguments fro is passed on to the annotator. Here is an example: ```python -from sparv import Annotation, Model, Output, annotator +from sparv.api import Annotation, Model, Output, annotator def preloader(model): diff --git a/docs/docsify/_coverpage.md b/docs/docsify/_coverpage.md index c6f73f50..8ba32595 100644 --- a/docs/docsify/_coverpage.md +++ b/docs/docsify/_coverpage.md @@ -4,7 +4,7 @@ > Språkbanken's text analysis tool -

-        version 4.1.1
+        version 5.0.0
diff --git a/docs/docsify/set_version.sh b/docs/docsify/set_version.sh
 sed -i "s/version .\+ <\/p>/version $SPARV_VERSION <\/p>/" _coverpage.md
+if [[ $SPARV_VERSION =~ .*\.dev.* ]]; then
+    sed -i "s/# Sparv Pipeline Documentation.*/# Sparv Pipeline Documentation (development version)/" _coverpage.md
+else
+    sed -i "s/# Sparv Pipeline Documentation.*/# Sparv Pipeline Documentation/" _coverpage.md
+fi
diff --git a/docs/docsify/sparv-pipeline.md b/docs/docsify/sparv-pipeline.md
index ef12f8b2..3cdf34ec 100644
--- a/docs/docsify/sparv-pipeline.md
+++ b/docs/docsify/sparv-pipeline.md
@@ -4,6 +4,23 @@
 The Sparv Pipeline is a text analysis tool run from the command line. Sparv is developed by [Språkbanken
 Text](https://spraakbanken.gu.se/). The [source
 code](https://github.com/spraakbanken/sparv-pipeline) is available under the
-[MIT license](https://opensource.org/licenses/MIT).
+[MIT license](https://opensource.org/licenses/MIT). If you have any questions, problems
+or suggestions please contact <sb-sparv@svenska.gu.se>.
-If you have any questions, problems or suggestions please contact <sb-sparv@svenska.gu.se>.
+This documentation is also available as PDF. You can download the [user manual](https://github.com/spraakbanken/sparv-pipeline/releases/latest/download/user-manual.pdf) and the [developer's guide](https://github.com/spraakbanken/sparv-pipeline/releases/latest/download/developers-guide.pdf) from the [latest Sparv release on GitHub](https://github.com/spraakbanken/sparv-pipeline/releases/latest).
+
+> [!TIP]
+> Did you know that you can get notified about new Sparv releases by subscribing to our GitHub repository? Here's how:
+> 1. Log in to GitHub
+> 2. Go to https://github.com/spraakbanken/sparv-pipeline
+> 3. Click "Watch" in the upper right corner
+> 4. Select "Custom"
+> 5. Check the "Releases" box
+> 6. Click "Apply"
+>
+> ![](_media/watch-releases.png)
+>
+>
+> Depending on your [notification settings](https://github.com/settings/notifications) you will now receive notifications about new Sparv releases on GitHub's website, via email or on your phone.
diff --git a/docs/docsify/sync_doc.sh b/docs/docsify/sync_doc.sh index fef5df7e..4052f16a 100755 --- a/docs/docsify/sync_doc.sh +++ b/docs/docsify/sync_doc.sh @@ -9,7 +9,4 @@ source config.sh ./set_version.sh # Sync files -rsync -av ./ $user@$host:$path --exclude '_media' --exclude 'developers-guide' --exclude 'user-manual' --exclude '*.sh' --exclude '.gitignore' -rsync -av --delete ../user-manual $user@$host:$path -rsync -av --delete ../developers-guide $user@$host:$path -rsync -av --delete ../images/ $user@$host:$path/_media +rsync -rcLv --delete ./* $user@$host:${path:?} --exclude '*.sh' --exclude '.gitignore' diff --git a/docs/images/watch-releases.png b/docs/images/watch-releases.png new file mode 100644 index 00000000..7d8be444 Binary files /dev/null and b/docs/images/watch-releases.png differ diff --git a/docs/md2pdf/filter.py b/docs/md2pdf/filter.py index e7d437e6..c9fa21bd 100644 --- a/docs/md2pdf/filter.py +++ b/docs/md2pdf/filter.py @@ -21,6 +21,9 @@ def fix_document(key, value, _format, _meta): if first_string == "[!NOTE]": value[0]["c"][0] = Strong([Str("Note:")]) return BlockQuote(value) + elif first_string == "[!INFO]": + value[0]["c"][0] = Strong([Str("Info:")]) + return BlockQuote(value) elif first_string == "[!TIP]": value[0]["c"][0] = Strong([Str("Tip:")]) return BlockQuote(value) diff --git a/docs/md2pdf/make_pdf.sh b/docs/md2pdf/make_pdf.sh index a900a80e..2b3f6053 100755 --- a/docs/md2pdf/make_pdf.sh +++ b/docs/md2pdf/make_pdf.sh @@ -5,6 +5,7 @@ # Requires markdown and latex USER_MANUAL_FILES=" +../user-manual/quick-start.md ../user-manual/installation-and-setup.md ../user-manual/running-sparv.md ../user-manual/requirements-for-source-files.md @@ -36,7 +37,7 @@ function make_document { title: Sparv Pipeline $SPARV_VERSION - $3 author: | | Språkbanken Text - | Institutionen för svenska språket + | Institutionen för svenska, flerspråkighet och språkteknologi | Göteborgs universitet | | diff --git a/docs/md2pdf/settings_template.tex b/docs/md2pdf/settings_template.tex index 51736fb0..a3430f0f 100644 --- a/docs/md2pdf/settings_template.tex +++ b/docs/md2pdf/settings_template.tex @@ -5,12 +5,15 @@ \usepackage{fourier} % Use the Adobe Utopia font for the document \usepackage[english]{babel} % English language/hyphenation +\usepackage[scaled]{helvet} % Use Helvetica font +\renewcommand\familydefault{\sfdefault} + \usepackage{graphicx} % For including graphics \usepackage[dvipsnames]{xcolor} % Display colors \usepackage{pmboxdraw} % Display funny tree structure chars \usepackage{sectsty} % Allows customizing section commands -\allsectionsfont{\normalfont\scshape} % Make all sections centered, the default font and small caps +\allsectionsfont{\normalfont\bf} % Make all sections centered, the default font and bold \usepackage[a4paper, margin=3cm]{geometry} % Smaller margins % \linespread{1.1} % Line spacing diff --git a/docs/user-manual/corpus-configuration.md b/docs/user-manual/corpus-configuration.md index 9c56f9db..65e912da 100644 --- a/docs/user-manual/corpus-configuration.md +++ b/docs/user-manual/corpus-configuration.md @@ -6,12 +6,12 @@ how to process it. The [corpus config wizard](#corpus-config-wizard) can help yo examples of config files you can download the [example corpora](https://github.com/spraakbanken/sparv-pipeline/releases/latest/download/example_corpora.zip). -A minimal config file contains a corpus ID and a list of (automatic) annotations you want to be included in the output. 
+A minimal config file contains a list of (automatic) annotations you want to be included in the output. Here is an example of a small config file: ```yaml metadata: - # Corpus ID (Machine name, only lower case ASCII letters (a-z) and "-" allowed. No whitespace characters.) - id: mini-swe + # Language of the source files + language: swe export: # Automatic annotations to be included in the export annotations: @@ -53,11 +53,10 @@ options usually have default values which are defined by the module itself. When running Sparv your corpus config will be read and combined with Sparv's default config file (`config_default.yaml` in the [Sparv data directory](user-manual/installation-and-setup.md#setting-up-sparv)) and the default values defined by different Sparv modules. You can view the resulting configuration by running `sparv config`. Using the `config` command -you can also inspect specific config variables, e.g. `sparv config metadata` or `sparv config metadata.id`. All +you can also inspect specific config variables, e.g. `sparv config metadata` or `sparv config metadata.language`. All default values can be overridden in your own corpus config. There are a few config options that must be set (either through the default config or the corpus config): - - `metadata.id` - `metadata.language` (default: `swe`) - `import.importer` (default: `xml_import:parse`) - `export.annotations` @@ -65,28 +64,47 @@ There are a few config options that must be set (either through the default conf - `classes.sentence` (default: `segment.sentence`) +## Metadata Options +The `metadata` section of your corpus config contains metadata about your corpus that may be used by any Sparv module. + +- `metadata.id` defines the machine name of the corpus. It is required by some exporter modules. This string may contain + ascii letters, digits and dashes. + +- `metadata.name` is an optional human readable name of the corpus. This option is split into two fields, `eng` and + `swe` for defining a name in English and in Swedish. + +- `metadata.language` defines the language of the source files in the corpus. This should be an ISO 639-3 code. If not + specified it defaults to `swe`. Run `sparv languages` to list the supported languages along with their language codes. + +- `metadata.variety` is an optional field containing the language variety of the source files (if applicable). Run + `sparv languages` to list the supported varieties for each language. + +- `metadata.description` is an optional description for the corpus. It may consist of multiple lines. This option is + split into two fields, `eng` and `swe` for defining a name in English and in Swedish. + + ## Import Options -The `import` section of your corpus config is used to give Sparv some information about your input documents (i.e. your +The `import` section of your corpus config is used to give Sparv some information about your input files (i.e. your corpus). -- `import.source_dir` defines the location of your input documents and it defaults to `source`. Sparv will check the - source directory recursively for valid input documents to process. +- `import.source_dir` defines the location of your input files and it defaults to `source`. Sparv will check the + source directory recursively for valid input files to process. -- `import.importer` is used to tell Sparv which importer to use when processing your source documents. The setting you - want to choose depends on the format of your input documents. 
If your corpus is in XML you should choose - `xml_import:parse` (this is the default setting). If your corpus documents are in plain text, you should choose +- `import.importer` is used to tell Sparv which importer to use when processing your source files. The setting you + want to choose depends on the format of your input files. If your corpus is in XML you should choose + `xml_import:parse` (this is the default setting). If your corpus files are in plain text, you should choose `text_import:parse` instead. -- `import.document_annotation` specifies the annotation representing _one text document_, and any automatic text-level +- `import.text_annotation` specifies the annotation representing _one text_, and any automatic text-level annotations will be attached to this annotation. For XML source files this refers to one of the XML elements. For plain text source files a default `text` root annotation will be created automatically, and you won't have to change this setting. > [!NOTE] > This setting automatically sets the `text` [class](#annotation-classes). If you want to use an automatic - > annotation as the document annotation, you should not use this setting, and instead set the `text` class directly. + > annotation as the text annotation, you should not use this setting, and instead set the `text` class directly. -- `import.encoding` specifies the encoding of the source documents. It defaults to UTF-8. +- `import.encoding` specifies the encoding of the source files. It defaults to UTF-8. - `import.normalize` lets you normalize unicode symbols in the input using any of the following forms: 'NFC', 'NFKC', 'NFD', and 'NFKD'. It defaults to `NFC`. @@ -106,11 +124,11 @@ would like to keep in your output data (this only applies if your input data is everything will be kept in the output. If you do not want any source annotations to be included in your output you can set this option to `[]`. This will cause errors in the XML exports though because the root element must be listed as a source annotation. If you do list anything here, make sure that you include the root element (i.e. the -element that encloses all other included elements and text content) for each of your input documents. If you don't, +element that encloses all other included elements and text content) for each of your input files. If you don't, the resulting output XML will be invalid and Sparv won't be able to produce XML files. If you only want to produce other output formats than XML, you don't need to worry about this. -It is possible to rename elements and attributes present in your input data. Let's say your documents contain elements +It is possible to rename elements and attributes present in your input data. Let's say your files contain elements like this `
<article name="...">
` and you would like them to look like this in the output `` (so you want to rename the element "article" and the attribute "name" to "text" and "title" respectively). For this you can use the following syntax: @@ -178,7 +196,7 @@ instead of the more compact: Hej ``` -`export.scramble_on` is a setting used by all the export formats that support scrambling. It controls what annotation +`export.scramble_on` is a setting used by all the export formats that support scrambling. It controls which annotation your corpus will be scrambled on. Typical settings are `export.scramble_on: ` or `export.scramble_on: `. For example, setting this to `` would lead to all paragraphs being randomly shuffled in the export, while the sentences and tokens within the paragraphs keep their original order. @@ -238,7 +256,7 @@ xml_import: - header/title/main-title as text:title - header/title/sub-title as text:subtitle - header/date as text:date -export: +xml_export: header_annotations: - not header - not another-header @@ -255,8 +273,24 @@ The output will look like this: ``` If you do want to keep the headers in the output (without them being analysed as corpus text), just list them without -the `not` prefix in `export.header_annotations`. If you don't specify anything at all in -`export.header_annotations` all your headers will be kept. +the `not` prefix in `xml_export.header_annotations`. If you don't specify anything at all in +`xml_export.header_annotations` all your headers will be kept. + + +## XML Namespaces +If the source data is in XML and contains namespaces Sparv will try to keep these intact in the XML output. +There are, however, two limitations: +1. Namespace declarations are always placed in the root element in the output, regardless of where they are in the + source data. +2. URIs and prefixes are assumed to be unique. A URI will automatically be associated with the first prefix that is + declared for that URI in the source file. + +When referring to elements or attributes containing namespaces in the corpus config file a special syntax is used. A +reference consists of the namespace prefix followed by `+`, followed by the tag or attribute name. E.g. the reference +for this element `` would be `sparv+myelement`. + +Namespaces may be removed upon import by setting `xml_import.remove_namespaces` to `true` in the corpus config. This may +however result in collisions in attributes containing namespaces in the source data. ## Annotation Classes @@ -300,6 +334,22 @@ export: - :malt.dephead_ref ``` +Re-defining annotation classes may also be necessary when your corpus data contains annotations (such as sentences or +tokens) that should be used as input to annotators. For example, if you have done manual sentence segmentation and +enclosed each sentence in an `` element, you can skip Sparv's automatic sentence segmentation by setting the sentence +class to this element: +```yaml +classes: + sentence: s + +xml_import: + elements: + - s +``` +> [!ATTENTION] Please note that you need to tell Sparv that `s` is an annotation imported from your corpus data. This is +> done by listing `s` under `xml_import.elements` as is done in the above example. + + ## Annotation Presets Annotation presets are collections of annotations which can be used instead of listing the contained annotations. 
For example, instead of listing all the SALDO annotations in your list of automatic annotations like this: @@ -485,7 +535,7 @@ mycorpus/ Sparv will automatically detect scripts placed here as long as your functions are registered in your config (see Step 2). Your annotator function must use one of the Sparv decorators (usually `@annotator`). Here is a code example for a simple annotator that converts all tokens to upper case: ```python -from sparv import Annotation, Output, annotator +from sparv.api import Annotation, Output, annotator @annotator("Convert every word to uppercase.") def uppercase(word: Annotation = Annotation(""), @@ -497,7 +547,7 @@ def uppercase(word: Annotation = Annotation(""), **Step 2**: Now register your custom annotator in your corpus config in the `custom_annotations` section so Sparv can find it. The name of your annotator is composed of: - the prefix `custom.` -- followed by the file name of the Python file without extension (`convert` in our example) +- followed by the filename of the Python file without extension (`convert` in our example) - followed by a colon - and finally the annotator name (`uppercase`) diff --git a/docs/user-manual/installation-and-setup.md b/docs/user-manual/installation-and-setup.md index 789ab9ac..0770210b 100644 --- a/docs/user-manual/installation-and-setup.md +++ b/docs/user-manual/installation-and-setup.md @@ -4,7 +4,7 @@ additional software that you may need to install in order to run all the analyse ## Prerequisites In order to install Sparv you will need a Unix-like environment (e.g. Linux, macOS or [Windows Subsystem for -Linux](https://docs.microsoft.com/en-us/windows/wsl/about)) with [Python 3.6.1](http://python.org/) or newer. +Linux](https://docs.microsoft.com/en-us/windows/wsl/about)) with [Python 3.6.2](http://python.org/) or newer. > [!NOTE] > Most of Sparv's features should work in a Windows environment as well, but since we don't do any testing on Windows @@ -17,13 +17,13 @@ We recommend using pipx, which will install Sparv in an isolated environment whi from anywhere. Begin by [installing pipx](https://pipxproject.github.io/pipx/installation/) if you haven't already: -```bash +``` python3 -m pip install --user pipx python3 -m pipx ensurepath ``` Once pipx is installed, run the following command to install the Sparv Pipeline: -```bash +``` pipx install sparv-pipeline ``` @@ -78,24 +78,37 @@ the Sparv Pipeline. In order to use it within the Sparv Pipeline it is enough to | | | |:---|:----------| |**Purpose** |Swedish named-entity recognition. Recommended for standard Swedish annotations. -|**Download** |[hfst-SweNER](http://www.ling.helsinki.fi/users/janiemi/finclarin/ner/hfst-swener-0.9.3.tgz) +|**Download** |[hfst-SweNER](http://urn.fi/urn%3Anbn%3Afi%3Alb-2021101202) |**Version compatible with Sparv** |0.9.3 -|**Dependencies** |[Python 2](https://www.python.org/download/releases/2.0/#download) - -The current version of hfst-SweNER expects to be run in a Python 2 environment while the Sparv Pipeline is written in -Python 3. Before installing hfst-SweNER you need make sure that it will be run with the correct version of Python by -replacing `python` with `python2` in all the Python scripts in the `hfst-swener-0.9.3/scripts` directory. The first line -in every script will then look like this: -```python -#! /usr/bin/env python2 + +> [!NOTE] +> hfst-SweNER requires a Unix-like environment. + +The current version of hfst-SweNER is written for Python 2 while Sparv uses Python 3, so before installing it needs to +be patched. 
After extracting the archive, go to the `hfst-swener-0.9.3/scripts` directory and create the file +`swener.patch` with the following contents: + ``` -On Unix systems this can be done by running the following command from within the `hfst-swener-0.9.3/scripts` -directory: -```bash -sed -i 's:#! \/usr/bin/env python:#! /usr/bin/env python2:g' *.py +--- convert-namex-tags.py ++++ convert-namex-tags.py +@@ -1 +1 @@ +-#! /usr/bin/env python ++#! /usr/bin/env python3 +@@ -34 +34 @@ +- elif isinstance(files, basestring): ++ elif isinstance(files, str): +@@ -73 +73 @@ +- return [s[start:start+partlen] for start in xrange(0, len(s), partlen)] ++ return [s[start:start+partlen] for start in range(0, len(s), partlen)] +@@ -132,3 +131,0 @@ +- sys.stdin = codecs.getreader('utf-8')(sys.stdin) +- sys.stdout = codecs.getwriter('utf-8')(sys.stdout) +- sys.stderr = codecs.getwriter('utf-8')(sys.stderr) ``` -After applying these changes please follow the installation instructions provided by hfst-SweNER. +Then simply run the command `patch < swener.patch`, which will make the necessary changes. + +After applying the patch, please follow the installation instructions provided by hfst-SweNER. ### Hunpos | | | @@ -109,11 +122,9 @@ Installation is done by unpacking and then adding the executables to your path ( Alternatively you can place the binaries inside your [Sparv data directory](#setting-up-sparv) under `bin`. If you are running a 64-bit OS, you might also have to install 32-bit compatibility libraries if Hunpos won't run: -```bash -sudo apt install ia32-libs ``` -On Arch Linux, activate the `multilib` repository and install `lib32-gcc-libs`. If that doesn't work, you might have to -compile Hunpos from source. +sudo apt install lib32z1 +``` On newer macOS you probably have to compile Hunpos from source. [This GitHub repo](https://github.com/mivoq/hunpos) has instructions that should work. @@ -137,19 +148,12 @@ Download and unpack the zip-file from the [MaltParser webpage](http://www.maltpa ### Corpus Workbench | | | |:---|:----------| -|**Purpose** |Creating corpus workbench binary files. You will only need it if you want to be able to search corpora with this tool. -|**Download** |[Corpus Workbench on SourceForge](http://cwb.sourceforge.net/beta.php) +|**Purpose** |Creating Corpus Workbench binary files. Only needed if you want to be able to search corpora with this tool. +|**Download** |[Corpus Workbench on SourceForge](https://cwb.sourceforge.io/download.php) |**License** |[GPL-3.0](https://www.gnu.org/licenses/gpl-3.0.html) -|**Version compatible with Sparv** |beta 3.4.21 (probably works with newer versions) - -Refer to the INSTALL text file for instructions on how to build and install on your system. CWB needs two directories -for storing the corpora, one for the data, and one for the corpus registry. You will have to create these directories, -and then set the environment variables `CWB_DATADIR` and `CORPUS_REGISTRY` and point them to the directories -you created. For example: -```bash -export CWB_DATADIR=~/cwb/data; -export CORPUS_REGISTRY=~/cwb/registry; -``` +|**Version compatible with Sparv** |beta 3.4.21 (most likely works with newer versions) + +Refer to the INSTALL text file for instructions on how to build and install on your system. ### Software for Analysing Other Languages than Swedish Sparv can use different third-party tools for analyzing corpora in other languages than Swedish. 
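Whichever of the tools in this chapter you install, Sparv can only use them if their executables are reachable, either on your `PATH` or under `bin` in the Sparv data directory. A quick way to check the `PATH` part is a small sketch like the one below. It is not part of Sparv, and the binary names are just examples taken from this guide; adjust the list to the tools you actually installed.

```python
import shutil

# Example executables mentioned in this guide; adjust to the tools you installed.
BINARIES = ["hunpos-tag", "tree-tagger", "cwb-encode"]

for name in BINARIES:
    location = shutil.which(name)
    print(f"{name}: {location if location else 'not found on PATH'}")
```

Note that `shutil.which` only inspects your `PATH`; binaries placed under `bin` in the data directory are picked up by Sparv even though this check will not find them.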
@@ -200,9 +204,9 @@ After downloading the software you need to have the `tree-tagger` binary in your |:---|:----------| |**Purpose** |Various analyses for English |**Download** |[Stanford CoreNLP webpage](https://stanfordnlp.github.io/CoreNLP/history.html) -|**Version compatible with Sparv** |4.0.0 (may work with newer versions) |**License** |[GPL-2.0](https://www.gnu.org/licenses/old-licenses/gpl-2.0.html) -|**Dependencies** |[Java](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) +|**Version compatible with Sparv** |4.0.0 (may work with newer versions) +|**Dependencies** |[Java](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) Please download, unzip and place contents inside the [Sparv data directory](#setting-up-sparv) under `bin/stanford_parser`. @@ -211,12 +215,13 @@ Please download, unzip and place contents inside the [Sparv data directory](#set |:---|:----------| |**Purpose** |Tokenisation, POS-tagging, lemmatisation and named entity recognition for [some languages](#software-for-analysing-other-languages-than-swedish) |**Download** |[FreeLing on GitHub](https://github.com/TALP-UPC/FreeLing/releases/tag/4.2) -|**Version compatible with Sparv** |4.2 |**License** |[AGPL-3.0](https://www.gnu.org/licenses/agpl-3.0.en.html) +|**Version compatible with Sparv** |4.2 Please install the software (including the additional language data) according to the instructions provided by FreeLing. -You will also need to install the [sparv-freeling plugin](https://github.com/spraakbanken/sparv-freeling). Please follow -the installation instructions for the sparv-freeling module on [GitHub](https://github.com/spraakbanken/sparv-freeling) +Note that you will need to uncompress the source and language files in the same folder before compiling. +You will also need to install the [sparv-sbx-freeling plugin](https://github.com/spraakbanken/sparv-sbx-freeling). Please follow +the installation instructions for the sparv-sbx-freeling module on [GitHub](https://github.com/spraakbanken/sparv-sbx-freeling) in order to set up the plugin correctly. ## Plugins -The only available plugin for Sparv available so far is [the sparv-freeling -plugin](https://github.com/spraakbanken/sparv-freeling). Please refer to its GitHub page for installation instructions. -## Uninstalling Sparv +If you have the Sparv Pipeline installed on your machine, you can install plugins by injecting them into the Sparv +Pipeline code using pipx: +``` +pipx inject sparv-pipeline [pointer-to-sparv-plugin] +``` -To uninstall Sparv completely, manually delete the [Sparv data directory](#setting-up-sparv), and then run one of the -following commands, depending on whether you installed Sparv using pipx or pip. +The `pointer-to-sparv-plugin` can be a package available on the [Python Package Index (PyPI)](https://pypi.org/), a +remote public repository, or a local directory on your machine. -```bash -pipx uninstall sparv-pipeline -``` +For now there are two plugins available for Sparv: +[sparv-sbx-freeling](https://github.com/spraakbanken/sparv-sbx-freeling) and +[sparv-sbx-metadata](https://github.com/spraakbanken/sparv-sbx-metadata). Please refer to their GitHub page for more +information. -```bash -pip uninstall sparv-pipeline +Plugins can be uninstalled by running: +``` +pipx runpip sparv-pipeline uninstall [name-of-sparv-plugin] ``` + +## Uninstalling Sparv + +To uninstall Sparv completely, follow these steps: + +1. 
Run `sparv setup --reset` to unset [Sparv's data directory](#setting-up-sparv). The directory itself will not be + removed, but its location (if available) will be printed. +2. Manually delete the data directory. +3. Run one of the following commands, depending on whether you installed Sparv using pipx or pip: + + ``` + pipx uninstall sparv-pipeline + ``` + + ``` + pip uninstall sparv-pipeline + ``` diff --git a/docs/user-manual/quick-start.md b/docs/user-manual/quick-start.md new file mode 100644 index 00000000..f6ce151e --- /dev/null +++ b/docs/user-manual/quick-start.md @@ -0,0 +1,158 @@ +# Quick Start + +This quick start guide will get you started with Sparv in just a few minutes, and will guide you through +annotating your first corpus. For a more comprehensive [installation](user-manual/installation-and-setup.md) and +user guide, please refer to the full documentation. + +> [!INFO] +> Sparv is a command line application, and just like the +> [2004 Steven Spielberg movie](https://www.imdb.com/title/tt0362227/), this quick start guide takes place in a +> [terminal](https://en.wikipedia.org/wiki/Terminal_emulator). +> +> This guide should work both in a Unix-like environment and the Windows command line. + +## Installation + +Begin by making sure that you have [Python 3.6.2](http://python.org/) or newer installed by running the following +in your terminal: +``` +python3 --version +``` + +> [!NOTE] +> On some systems, the command may be called `python` instead of `python3`. + +Continue by [installing pipx](https://pipxproject.github.io/pipx/installation/) if you haven't already: +``` +python3 -m pip install --user pipx +python3 -m pipx ensurepath +``` + +Once pipx is installed, run the following command to install the Sparv Pipeline: +``` +pipx install sparv-pipeline +``` + +To verify that the installation was successful, try running Sparv which should print Sparv's command line help: +``` +sparv +``` + +Finish the installation by running the [Sparv setup](user-manual/installation-and-setup.md#sparv-data-directory) +command, to select a location for Sparv to save its models and configuration: +``` +sparv setup +``` + +## Creating a Corpus + +Now that Sparv is installed and working, let's try it out on a small corpus. + +Each corpus needs its own directory, so begin by creating one called `mycorpus`: +``` +mkdir mycorpus +cd mycorpus +``` + +In this directory, create another directory called `source`, where we will put the corpus source files (the files +containing the text we want to annotate): +``` +mkdir source +``` + +Next, use your favourite plain text editor (i.e. not Word) to create a source file in XML format, and put it in the +`source` directory. Make sure to save it in UTF-8 encoding. + +`document.xml` +```xml + + Ord, ord, ord. Här kommer några fler ord. + +``` + +> [!NOTE] +> The `source` directory may contain as many files as you want, but let's start with just this one. + +## Creating the Config File + +For Sparv to know what to do with your corpus, you first need to create a +[configuration file](user-manual/corpus-configuration.md). This can be accomplished +either by running the [corpus config wizard](user-manual/corpus-configuration.md#corpus-config-wizard), or by writing it +by hand. Using the wizard is usually easier, but for now, let's get our hands dirty and write it by hand! + +Use your text editor to create a file called `config.yaml` directly under your corpus directory. Remember to save it +in UTF-8 encoding. 
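Once you have filled in the configuration shown further down, you can optionally verify that the file was saved as UTF-8 and parses as YAML. The snippet below is only a convenience sketch, not a required step, and it assumes PyYAML is available in the Python environment you run it from:

```python
import yaml  # requires PyYAML in your environment

# Parse the config file; an error here usually means a typo or the wrong encoding.
with open("config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

print(config["metadata"]["language"])  # prints "swe" for the example in this guide
```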
+The directory structure should now look like this: + +``` +mycorpus/ +├── config.yaml +└── source/ + └── document.xml +``` + +Add the following to the configuration file and save it: + +```yaml +metadata: + language: swe +import: + importer: xml_import:parse +export: + annotations: + - + - +``` + +The configuration file consists of different sections, each containing configuration variables and their values. First, +we have told Sparv the language of our corpus (Swedish). Second, in the `import` section, we have specified which of +Sparv's importer modules to use (we want the one for XML). Finally, in the `export` section, we have listed what +automatic annotations we want Sparv to add. For this simple corpus we only ask for sentence segmentation and +tokenisation. + +## Running Sparv + +If you have followed the above steps, everything should now be ready. Make sure that you are in the `mycorpus` folder, +and then run Sparv by typing: +``` +sparv run +``` + +After a short while, Sparv will tell you where the resulting files are saved. Let's have a look at one of them: + +`export/xml_export.pretty/document_export.xml` +```xml + + + + Ord + , + ord + , + ord + . + + + Här + kommer + några + fler + ord + . + + +``` + +## What's Next? + +Try adding some more annotations to your corpus by extending the annotations list in the corpus configuration. To find +out what annotations are available, use the `sparv modules` command. You can also try out the corpus configuration +wizard by running `sparv wizard`. + +It is also possible to annotate texts in other languages, e.g., English. Just change the line `language: swe` to +`language: eng` in the file `config.yaml`. Run `sparv languages` to see what languages are available in Sparv. + +> [!NOTE] +> Some annotations may require +> [additional software to be installed](user-manual/installation-and-setup.md#installing-additional-third-party-software) +> before you can use them. diff --git a/docs/user-manual/requirements-for-source-files.md b/docs/user-manual/requirements-for-source-files.md index ccfcfb47..4563cecb 100644 --- a/docs/user-manual/requirements-for-source-files.md +++ b/docs/user-manual/requirements-for-source-files.md @@ -8,12 +8,9 @@ requirements: 2. If your corpus is in XML format, make sure your **XML is valid** and that the text to be analysed is actual text (not attribute values). -3. Your source documents must all use the same file format, same file extension and (if applicable) the same markup. +3. Your source files must all use the same file format, same file extension and (if applicable) the same markup. -4. If your corpus is in XML format, make sure you don't have any elements or attributes called "not" as this is a - reserved keyword in the Sparv Pipeline. - -5. If your source documents are very large or if your corpus consists of a large number of tiny documents, Sparv +4. If your source files are very large or if your corpus consists of a large number of tiny files, Sparv may become quite slow. Very large files may also lead to memory problems. Try keeping the maximum file size per - document around 5-10 MB, and in the case of many tiny files, combining them into larger files if possible. - If your machine has a lot of memory, processing larger documents may work just fine. + file around 5-10 MB, and in the case of many tiny files, combining them into larger files if possible. + If your machine has a lot of memory, processing larger files may work just fine. 
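As a complement to the size recommendation above, the sketch below lists source files that may be worth splitting. It is not part of Sparv, and the 10 MB threshold is simply the upper end of the recommended range:

```python
from pathlib import Path

MAX_BYTES = 10 * 1024 * 1024  # upper end of the recommended 5-10 MB range

def report_large_source_files(source_dir: str = "source") -> None:
    """Print any source file that exceeds the recommended maximum size."""
    for path in sorted(Path(source_dir).rglob("*")):
        if path.is_file() and path.stat().st_size > MAX_BYTES:
            size_mb = path.stat().st_size / (1024 * 1024)
            print(f"{path}: {size_mb:.1f} MB - consider splitting this file")

if __name__ == "__main__":
    report_large_source_files()
```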
diff --git a/docs/user-manual/running-sparv.md b/docs/user-manual/running-sparv.md index cd31a976..d5b8e886 100644 --- a/docs/user-manual/running-sparv.md +++ b/docs/user-manual/running-sparv.md @@ -1,6 +1,6 @@ # Running Sparv Sparv is run from the command line. Typically, you will want to run Sparv from within a corpus directory containing some -text documents (the corpus) and a [corpus config file](user-manual/corpus-configuration.md). A typical corpus directory +text files (the corpus) and a [corpus config file](user-manual/corpus-configuration.md). A typical corpus directory structure could look like this: ``` @@ -24,7 +24,7 @@ Annotating a corpus: Inspecting corpus details: config Display the corpus config - files List available corpus documents (input for Sparv) + files List available corpus source files (input for Sparv) Show annotation info: modules List available modules and annotations @@ -82,7 +82,7 @@ classes](user-manual/corpus-configuration.md#annotation-classes). **`sparv config`:** This command lets you inspect the configuration for your corpus. You can read more about this in the [section about corpus configuration](user-manual/corpus-configuration.md). -**`sparv files`:** By using this command you can list all available source documents belonging to your corpus. +**`sparv files`:** By using this command you can list all available source files belonging to your corpus. ## Setting Up the Sparv Pipeline **`sparv setup`** and **`sparv build-models`:** These commands are explained in the section [Setting Up @@ -93,20 +93,20 @@ Sparv](user-manual/installation-and-setup.md#setting-up-sparv). the specified file. Multiple arguments can be supplied. Example running the Stanza annotations (part-of-speech tagging and dependency parsing) for all input files: -```bash +``` sparv run-rule stanza:annotate ``` Example creating the part-of-speech annotation for the input file `document1`: -```bash +``` sparv create-file annotations/dokument1/segment.token/stanza.pos ``` **`sparv run-module`:** Run an annotator module independently (mostly for debugging). You must supply the module and the function you want to run and all the mandatory arguments. E.g. to run the hunpos msd tagging module on the input file called `document1` you could use the following command: -```bash -sparv run-module hunpos msdtag --out segment.token:hunpos.msd --word segment.token:misc.word --sentence segment.sentence --binary hunpos-tag --model hunpos/suc3_suc-tags_default-setting_utf8.model --morphtable hunpos/saldo_suc-tags.morphtable --patterns hunpos/suc.patterns --doc dokument1 +``` +sparv run-module hunpos msdtag --out segment.token:hunpos.msd --word segment.token:misc.word --sentence segment.sentence --binary hunpos-tag --model hunpos/suc3_suc-tags_default-setting_utf8.model --morphtable hunpos/saldo_suc-tags.morphtable --patterns hunpos/suc.patterns --encoding UTF-8 --source_file dokument1 ``` **`sparv preload`:** This command preloads annotators and their models and/or related binaries to speed up @@ -135,13 +135,13 @@ plan on using when running Sparv (e.g. `sparv run -j 4`), or the preloader might speeding things up. Example of starting the preloader with four parallel processes: -```bash +``` sparv preload --socket my_socket.sock --processes 4 ``` Once the preloader is up and running, use another terminal to annotate your corpus. To make Sparv use the preloader when annotating, use the `--socket` argument and point it to the same socket file created by the preloader. 
For example: -```bash +``` sparv run --socket my_socket.sock ``` @@ -151,6 +151,6 @@ would rather have Sparv wait for the preloader, use the `--force-preloader` flag To shut down the preloader, either press Ctrl-C in the preloader terminal, or use the command `sparv preload stop` while pointing it to the relevant socket. For example: -```bash +``` sparv preload stop --socket my_socket.sock ``` diff --git a/pytest.ini b/pytest.ini index 8a9cfdf1..ab6652dc 100644 --- a/pytest.ini +++ b/pytest.ini @@ -5,6 +5,7 @@ testpaths = markers = slow: tests that potentially take a long time to complete swe: tests for Swedish corpora + swehist: tests for corpora with historical Swedish treetagger: tests for Treetagger corpora freeling: tests for FreeLing corpora stanford: tests for Stanford Parser corpora diff --git a/setup.py b/setup.py index d445f6ba..5a52e047 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def get_version(rel_path): """Get version number from package.""" here = os.path.abspath(os.path.dirname(__file__)) - with open(os.path.join(here, rel_path)) as f: + with open(os.path.join(here, rel_path), encoding="utf-8") as f: for line in f: if line.startswith("__version__"): delim = '"' if '"' in line else "'" @@ -36,23 +36,26 @@ def get_readme(readme_path): license="MIT", packages=setuptools.find_namespace_packages(include=["sparv", "sparv.*"]), zip_safe=False, - python_requires=">=3.6.1", + python_requires=">=3.6.2", install_requires=[ "appdirs==1.4.4", "iso-639==0.4.5", - "nltk==3.5", - "python-dateutil==2.8.1", - "PyYAML==5.4.1", - "questionary==1.9.0", - "rich==10.0.0", - "snakemake==6.0.5", - "stanza==1.2", - "typing-inspect==0.6.0" + "docx2python==1.27.1", + "nltk==3.6.7", + "protobuf~=3.19.0", # Used by Stanza; see https://github.com/spraakbanken/sparv-pipeline/issues/161 + "python-dateutil==2.8.2", + "PyYAML==6.0", + "questionary==1.10.0", + "rich==11.0.0", + "snakemake==6.3.0", + "stanza==1.3.0", + "torch>=1.9.1", # Used by Stanza; see https://github.com/spraakbanken/sparv-pipeline/issues/82 + "typing-inspect==0.7.1" ], extras_require={ "dev": [ - "pandocfilters==1.4.3", - "pytest==5.4.3", + "pandocfilters==1.5.0", + "pytest==6.2.5", "pytest-sugar==0.9.4" ] }, diff --git a/sparv/__init__.py b/sparv/__init__.py index 52147126..edd0e0eb 100644 --- a/sparv/__init__.py +++ b/sparv/__init__.py @@ -1,48 +1,3 @@ """Main Sparv package.""" -from sparv.core.registry import annotator, exporter, importer, installer, modelbuilder, wizard -from sparv.util.classes import (AllDocuments, Annotation, AnnotationAllDocs, AnnotationCommonData, AnnotationData, - AnnotationDataAllDocs, Binary, BinaryDir, Config, Corpus, Document, Export, - ExportAnnotations, ExportInput, Headers, Language, Model, ModelOutput, Output, - OutputAllDocs, OutputCommonData, OutputData, OutputDataAllDocs, Source, - SourceAnnotations, SourceStructure, SourceStructureParser, Text, Wildcard) -__version__ = "4.1.1" - -# Only expose classes and functions that are meant to be used in modules -__all__ = [ - "annotator", - "exporter", - "importer", - "installer", - "modelbuilder", - "wizard", - "AllDocuments", - "Annotation", - "AnnotationAllDocs", - "AnnotationCommonData", - "AnnotationData", - "AnnotationDataAllDocs", - "Binary", - "BinaryDir", - "Config", - "Corpus", - "Document", - "Export", - "ExportAnnotations", - "ExportInput", - "Headers", - "Language", - "Model", - "ModelOutput", - "Output", - "OutputAllDocs", - "OutputCommonData", - "OutputData", - "OutputDataAllDocs", - "Source", - "SourceAnnotations", - "SourceStructure", 
- "SourceStructureParser", - "Text", - "Wildcard" -] +__version__ = "5.0.0" diff --git a/sparv/__main__.py b/sparv/__main__.py index bf98cde1..38918395 100644 --- a/sparv/__main__.py +++ b/sparv/__main__.py @@ -4,16 +4,11 @@ import sys from pathlib import Path -import snakemake -from snakemake.logging import logger - from sparv import __version__ -from sparv.core import log_handler, paths, setup -from sparv.core.paths import sparv_path # Check Python version -if sys.version_info < (3, 6, 1): - raise Exception("Python 3.6.1 or higher is required.") +if sys.version_info < (3, 6, 2): + raise Exception("Python 3.6.2 or higher is required.") class CustomArgumentParser(argparse.ArgumentParser): @@ -73,7 +68,7 @@ def main(): "", "Inspecting corpus details:", " config Display the corpus config", - " files List available corpus documents (input for Sparv)", + " files List available corpus source files (input for Sparv)", "", "Show annotation info:", " modules List available modules and annotations", @@ -117,7 +112,7 @@ def main(): # Inspect config_parser = subparsers.add_parser("config", description="Display the corpus configuration.") config_parser.add_argument("options", nargs="*", default=[], help="Specific options(s) in config to display") - subparsers.add_parser("files", description="List available corpus documents that can be annotated by Sparv.") + subparsers.add_parser("files", description="List available corpus source files that can be annotated by Sparv.") # Annotation info modules_parser = subparsers.add_parser("modules", description="List available modules and annotations.") @@ -136,6 +131,7 @@ def main(): setup_parser = subparsers.add_parser("setup", description="Set up the Sparv data directory. Run without arguments " "for interactive setup.") setup_parser.add_argument("-d", "--dir", help="Directory to use as Sparv data directory") + setup_parser.add_argument("--reset", action="store_true", help="Reset data directory setting.") models_parser = subparsers.add_parser("build-models", description=("Download and build the Sparv models. This is optional, as " @@ -158,10 +154,12 @@ def main(): runrule_parser.add_argument("-l", "--list", action="store_true", help="List available rules") runrule_parser.add_argument("-w", "--wildcards", nargs="*", metavar="WILDCARD", help="Supply values for wildcards using the format 'name=value'") + runrule_parser.add_argument("--force", action="store_true", help="Force recreation of target") createfile_parser = subparsers.add_parser("create-file", description=("Create specified file(s). 
" "The full path must be supplied and wildcards must be replaced.")) createfile_parser.add_argument("targets", nargs="*", default=["list"], help="File(s) to create") createfile_parser.add_argument("-l", "--list", action="store_true", help="List available files that can be created") + createfile_parser.add_argument("--force", action="store_true", help="Force recreation of target") preloader_parser = subparsers.add_parser("preload", description="Preload annotators and models") preloader_parser.add_argument("preload_command", nargs="?", default="start", choices=["start", "stop"]) @@ -171,14 +169,13 @@ def main(): # Add common arguments for subparser in [run_parser, runrule_parser]: - subparser.add_argument("-d", "--doc", nargs="+", default=[], help="Only annotate specified input document(s)") + subparser.add_argument("-f", "--file", nargs="+", default=[], help="Only annotate specified input file(s)") for subparser in [run_parser, runrule_parser, createfile_parser, models_parser, install_parser]: subparser.add_argument("-n", "--dry-run", action="store_true", help="Print summary of tasks without running them") - subparser.add_argument("-j", "--cores", type=int, metavar="N", help="Use at most N cores in parallel", + subparser.add_argument("-j", "--cores", type=int, nargs="?", const=0, metavar="N", + help="Use at most N cores in parallel; if N is omitted, use all available CPU cores", default=1) - subparser.add_argument("-v", "--verbose", action="store_true", - help="Show more info about currently running tasks") subparser.add_argument("--log", metavar="LOGLEVEL", const="info", help="Set the log level (default: 'warning' if --log is not specified, " "'info' if LOGLEVEL is not specified)", @@ -187,13 +184,17 @@ def main(): help="Set log level for logging to file (default: 'warning' if --log-to-file is not " "specified, 'info' if LOGLEVEL is not specified)", nargs="?", choices=["debug", "info", "warning", "error", "critical"]) + subparser.add_argument("--stats", action="store_true", help="Show summary of time spent per annotator") subparser.add_argument("--debug", action="store_true", help="Show debug messages") subparser.add_argument("--socket", help="Path to socket file created by the 'preload' command") subparser.add_argument("--force-preloader", action="store_true", help="Try to wait for preloader when it's busy") + subparser.add_argument("--simple", action="store_true", help="Show less details while running") # Add extra arguments to 'run' that we want to come last run_parser.add_argument("--unlock", action="store_true", help="Unlock the working directory") + run_parser.add_argument("--mark-complete", nargs="+", metavar="FILE", help="Mark output files as complete") + run_parser.add_argument("--rerun-incomplete", action="store_true", help="Rerun incomplete output files") # Backward compatibility if len(sys.argv) > 1 and sys.argv[1] == "make": @@ -209,6 +210,10 @@ def main(): run.main(unknown_args, log_level=args.log) sys.exit() else: + import snakemake + from snakemake.logging import logger + from snakemake.utils import available_cpu_count + from sparv.core import log_handler, paths, setup args = parser.parse_args() if args.command not in ("setup",): @@ -230,7 +235,10 @@ def main(): sys.exit(1) if args.command == "setup": - setup.run(args.dir) + if args.reset: + setup.reset() + else: + setup.run(args.dir) sys.exit(0) elif args.command == "wizard": from sparv.core.wizard import Wizard @@ -239,17 +247,30 @@ def main(): sys.exit(0) # Check that a corpus config file is available in the working dir 
+ try: + config_exists = Path(args.dir or Path.cwd(), paths.config_file).is_file() + except PermissionError as e: + print(f"{e.strerror}: {e.filename!r}") + sys.exit(1) + if args.command not in ("build-models", "languages"): - if not Path(args.dir or Path.cwd(), paths.config_file).is_file(): + if not config_exists: print(f"No config file ({paths.config_file}) found in working directory.") sys.exit(1) + # For the 'build-models' command there needs to be a config file or a language parameter + elif args.command == "build-models": + if not config_exists and not args.language: + print("Models are built for a specific language. Please provide one with the --language param or run this " + f"from a directory that has a config file ({paths.config_file}).") + sys.exit(1) snakemake_args = {"workdir": args.dir} config = {"run_by_sparv": True} simple_target = False log_level = "" log_file_level = "" - verbose = False + simple_mode = False + stats = False pass_through = False dry_run = False @@ -278,15 +299,20 @@ def main(): snakemake_args["targets"] = ["preload_list"] elif args.command in ("run", "run-rule", "create-file", "install", "build-models"): + try: + cores = args.cores or available_cpu_count() + except NotImplementedError: + cores = 1 snakemake_args.update({ "dryrun": args.dry_run, - "cores": args.cores, + "cores": cores, "resources": {"threads": args.cores} }) # Never show progress bar for list commands or dry run if args.list or args.dry_run: simple_target = True + stats = args.stats dry_run = args.dry_run # Command: run @@ -295,6 +321,12 @@ def main(): snakemake_args["unlock"] = args.unlock simple_target = True pass_through = True + if args.mark_complete: + snakemake_args["cleanup_metadata"] = args.mark_complete + simple_target = True + pass_through = True + elif args.rerun_incomplete: + snakemake_args["force_incomplete"] = True if args.list: snakemake_args["targets"] = ["list_exports"] elif args.output: @@ -309,12 +341,17 @@ def main(): if args.list or snakemake_args["targets"] == ["list"]: snakemake_args["targets"] = ["list_targets"] simple_target = True + elif args.force: + # Rename all-files-rule to the related regular rule + snakemake_args["forcerun"] = [t.replace(":", "::") for t in args.targets] # Command: create-file elif args.command == "create-file": snakemake_args["targets"] = args.targets if args.list or snakemake_args["targets"] == ["list"]: snakemake_args["targets"] = ["list_files"] simple_target = True + elif args.force: + snakemake_args["forcerun"] = args.targets # Command: install elif args.command == "install": if args.list: @@ -335,9 +372,9 @@ def main(): log_level = args.log or "warning" log_file_level = args.log_to_file or "warning" - verbose = args.verbose + simple_mode = args.simple config.update({"debug": args.debug, - "doc": vars(args).get("doc", []), + "file": vars(args).get("file", []), "log_level": log_level, "log_file_level": log_file_level, "socket": args.socket, @@ -356,13 +393,13 @@ def main(): # Disable Snakemake's default log handler and use our own logger.log_handler = [] progress = log_handler.LogHandler(progressbar=not simple_target, log_level=log_level, log_file_level=log_file_level, - verbose=verbose, pass_through=pass_through, dry_run=dry_run) + simple=simple_mode, stats=stats, pass_through=pass_through, dry_run=dry_run) snakemake_args["log_handler"] = [progress.log_handler] config["log_server"] = progress.log_server # Run Snakemake - success = snakemake.snakemake(sparv_path / "core" / "Snakefile", config=config, **snakemake_args) + success = 
snakemake.snakemake(paths.sparv_path / "core" / "Snakefile", config=config, **snakemake_args) progress.stop() progress.cleanup() diff --git a/sparv/api/__init__.py b/sparv/api/__init__.py new file mode 100644 index 00000000..1328f04e --- /dev/null +++ b/sparv/api/__init__.py @@ -0,0 +1,12 @@ +"""Classes and methods for use by plugin modules.""" + +import sparv.core.io # Needed to avoid a circular import problem when importing the classes below +from sparv.core.misc import SparvErrorMessage, get_logger +from sparv.core.registry import annotator, exporter, importer, installer, modelbuilder, wizard + +from .classes import (AllSourceFilenames, Annotation, AnnotationAllSourceFiles, AnnotationCommonData, AnnotationData, + AnnotationDataAllSourceFiles, Binary, BinaryDir, Config, Corpus, Export, ExportAnnotations, + ExportAnnotationsAllSourceFiles, ExportInput, Headers, Language, Model, ModelOutput, Namespaces, + Output, OutputAllSourceFiles, OutputCommonData, OutputData, OutputDataAllSourceFiles, Source, + SourceAnnotations, SourceAnnotationsAllSourceFiles, SourceFilename, SourceStructure, + SourceStructureParser, Text, Wildcard) diff --git a/sparv/util/classes.py b/sparv/api/classes.py similarity index 69% rename from sparv/util/classes.py rename to sparv/api/classes.py index 2b1bb836..0e6bd5e2 100644 --- a/sparv/util/classes.py +++ b/sparv/api/classes.py @@ -1,20 +1,20 @@ """Classes used as default input for annotator functions.""" import gzip -import logging import os import pathlib import pickle import urllib.request import zipfile from abc import ABC, abstractmethod -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import sparv.core from sparv.core import io +from sparv.core.misc import get_logger from sparv.core.paths import models_dir -log = logging.getLogger(__name__) +logger = get_logger(__name__) class Base(ABC): @@ -59,12 +59,14 @@ class BaseAnnotation(Base): """An annotation or attribute used as input.""" data = False - all_docs = False + all_files = False common = False + is_input = True - def __init__(self, name: str = "", doc: Optional[str] = None): + def __init__(self, name: str = "", source_file: Optional[str] = None, is_input: bool = True): super().__init__(name) - self.doc = doc + self.source_file = source_file + self.is_input = is_input def expand_variables(self, rule_name: str = "") -> List[str]: """Update name by replacing references with annotation names, and [config] references with config values. 
@@ -94,26 +96,33 @@ def attribute_name(self) -> Optional[str]: return self.split()[1] or None def __eq__(self, other): - return type(self) == type(other) and self.name == other.name and self.doc == other.doc + return type(self) == type(other) and self.name == other.name and self.source_file == other.source_file def __hash__(self): - return hash(repr(self) + repr(self.doc)) + return hash(repr(self) + repr(self.source_file)) class Annotation(BaseAnnotation): - """Regular Annotation tied to one document.""" + """Regular Annotation tied to one source file.""" - def __init__(self, name: str = "", doc: Optional[str] = None): - super().__init__(name, doc=doc) - self.size = None + def __init__(self, name: str = "", source_file: Optional[str] = None, is_input: bool = True): + super().__init__(name, source_file=source_file) + self._size = None + self.is_input = is_input def exists(self) -> bool: """Return True if annotation file exists.""" - return io.annotation_exists(self.doc, self) + return io.annotation_exists(self.source_file, self) + + def get_size(self): + """Get number of values.""" + if self._size is None: + self._size = io.get_annotation_size(self.source_file, self) + return self._size def read(self, allow_newlines: bool = False): """Yield each line from the annotation.""" - return io.read_annotation(self.doc, self, allow_newlines=allow_newlines) + return io.read_annotation(self.source_file, self, allow_newlines=allow_newlines) def get_children(self, child: BaseAnnotation, orphan_alert=False, preserve_parent_annotation_order=False): """Return two lists. @@ -121,7 +130,7 @@ def get_children(self, child: BaseAnnotation, orphan_alert=False, preserve_paren The first one is a list with n (= total number of parents) elements where every element is a list of indices in the child annotation. The second one is a list of orphans, i.e. containing indices in the child annotation that have no parent. - Both parents and children are sorted according to their position in the source document, unless + Both parents and children are sorted according to their position in the source file, unless preserve_parent_annotation_order is set to True, in which case the parents keep the order from the parent annotation. """ @@ -148,8 +157,8 @@ def get_children(self, child: BaseAnnotation, orphan_alert=False, preserve_paren break if parent_span is None or parent_span[0] > child_span[0]: if orphan_alert: - log.warning("Child '%s' missing parent; closest parent is %s", - child_i, parent_i or previous_parent_i) + logger.warning("Child '%s' missing parent; closest parent is %s", + child_i, parent_i or previous_parent_i) orphans.append(child_i) else: parent_children[-1][1].append(child_i) @@ -191,8 +200,8 @@ def get_parents(self, parent: BaseAnnotation, orphan_alert: bool = False): break if parent_span is None or parent_span[0] > child_span[0]: if orphan_alert: - log.warning("Child '%s' missing parent; closest parent is %s", - child_i, parent_i or previous_parent_i) + logger.warning("Child '%s' missing parent; closest parent is %s", + child_i, parent_i or previous_parent_i) child_parents.append((child_i, None)) else: child_parents.append((child_i, parent_i)) @@ -207,8 +216,8 @@ def read_parents_and_children(self, parent: BaseAnnotation, child: BaseAnnotatio Reorder them according to span position, but keep original index information. 
""" - parent_spans = sorted(enumerate(io.read_annotation_spans(self.doc, parent, decimals=True)), key=lambda x: x[1]) - child_spans = sorted(enumerate(io.read_annotation_spans(self.doc, child, decimals=True)), key=lambda x: x[1]) + parent_spans = sorted(enumerate(io.read_annotation_spans(self.source_file, parent, decimals=True)), key=lambda x: x[1]) + child_spans = sorted(enumerate(io.read_annotation_spans(self.source_file, child, decimals=True)), key=lambda x: x[1]) # Only use sub-positions if both parent and child have them if parent_spans and child_spans: @@ -221,19 +230,17 @@ def read_parents_and_children(self, parent: BaseAnnotation, child: BaseAnnotatio def read_attributes(self, annotations: Union[List[BaseAnnotation], Tuple[BaseAnnotation, ...]], with_annotation_name: bool = False, allow_newlines: bool = False): """Yield tuples of multiple attributes on the same annotation.""" - return io.read_annotation_attributes(self.doc, annotations, with_annotation_name=with_annotation_name, + return io.read_annotation_attributes(self.source_file, annotations, with_annotation_name=with_annotation_name, allow_newlines=allow_newlines) def read_spans(self, decimals=False, with_annotation_name=False): """Yield the spans of the annotation.""" - return io.read_annotation_spans(self.doc, self, decimals=decimals, + return io.read_annotation_spans(self.source_file, self, decimals=decimals, with_annotation_name=with_annotation_name) def create_empty_attribute(self): """Return a list filled with None of the same size as this annotation.""" - if self.size is None: - self.size = len(list(self.read_spans())) - return [None] * self.size + return [None] * self.get_size() class AnnotationData(BaseAnnotation): @@ -241,72 +248,76 @@ class AnnotationData(BaseAnnotation): data = True - def __init__(self, name: str = "", doc: Optional[str] = None): - super().__init__(name, doc=doc) + def __init__(self, name: str = "", source_file: Optional[str] = None): + super().__init__(name, source_file=source_file) - def read(self, doc: Optional[str] = None): + def read(self, source_file: Optional[str] = None): """Read arbitrary string data from annotation file.""" - return io.read_data(self.doc or doc, self) + return io.read_data(self.source_file or source_file, self) def exists(self): """Return True if annotation file exists.""" - return io.data_exists(self.doc, self) + return io.data_exists(self.source_file, self) -class AnnotationAllDocs(BaseAnnotation): - """Regular annotation but document must be specified for all actions. +class AnnotationAllSourceFiles(BaseAnnotation): + """Regular annotation but source file must be specified for all actions. - Use as input to an annotator to require the specificed annotation for every document in the corpus. + Use as input to an annotator to require the specificed annotation for every source file in the corpus. 
""" - all_docs = True + all_files = True def __init__(self, name: str = ""): super().__init__(name) - self.size = None + self._size = {} - def read(self, doc: str): + def read(self, source_file: str): """Yield each line from the annotation.""" - return io.read_annotation(doc, self) + return io.read_annotation(source_file, self) - def read_spans(self, doc: str, decimals=False, with_annotation_name=False): + def read_spans(self, source_file: str, decimals=False, with_annotation_name=False): """Yield the spans of the annotation.""" - return io.read_annotation_spans(doc, self, decimals=decimals, with_annotation_name=with_annotation_name) + return io.read_annotation_spans(source_file, self, decimals=decimals, with_annotation_name=with_annotation_name) @staticmethod - def read_attributes(doc: str, annotations: Union[List[BaseAnnotation], Tuple[BaseAnnotation, ...]], + def read_attributes(source_file: str, annotations: Union[List[BaseAnnotation], Tuple[BaseAnnotation, ...]], with_annotation_name: bool = False, allow_newlines: bool = False): """Yield tuples of multiple attributes on the same annotation.""" - return io.read_annotation_attributes(doc, annotations, with_annotation_name=with_annotation_name, + return io.read_annotation_attributes(source_file, annotations, with_annotation_name=with_annotation_name, allow_newlines=allow_newlines) - def create_empty_attribute(self, doc: str): + def get_size(self, source_file: str): + """Get number of values.""" + if self._size.get(source_file) is None: + self._size[source_file] = io.get_annotation_size(source_file, self) + return self._size[source_file] + + def create_empty_attribute(self, source_file: str): """Return a list filled with None of the same size as this annotation.""" - if self.size is None: - self.size = len(list(self.read_spans(doc))) - return [None] * self.size + return [None] * self.get_size(source_file) - def exists(self, doc: str): + def exists(self, source_file: str): """Return True if annotation file exists.""" - return io.annotation_exists(doc, self) + return io.annotation_exists(source_file, self) -class AnnotationDataAllDocs(BaseAnnotation): - """Data annotation but document must be specified for all actions.""" +class AnnotationDataAllSourceFiles(BaseAnnotation): + """Data annotation but source file must be specified for all actions.""" - all_docs = True + all_files = True data = True def __init__(self, name: str = ""): super().__init__(name) - def read(self, doc: str): + def read(self, source_file: str): """Read arbitrary string data from annotation file.""" - return io.read_data(doc, self) + return io.read_data(source_file, self) - def exists(self, doc: str): + def exists(self, source_file: str): """Return True if annotation file exists.""" - return io.data_exists(doc, self) + return io.data_exists(source_file, self) class AnnotationCommonData(BaseAnnotation): @@ -327,12 +338,12 @@ class BaseOutput(BaseAnnotation): """Base class for all Output classes.""" data = False - all_docs = False + all_files = False common = False def __init__(self, name: str = "", cls: Optional[str] = None, description: Optional[str] = None, - doc: Optional[str] = None): - super().__init__(name, doc) + source_file: Optional[str] = None): + super().__init__(name, source_file) self.cls = cls self.description = description @@ -341,39 +352,39 @@ class Output(BaseOutput): """Regular annotation or attribute used as output.""" def __init__(self, name: str = "", cls: Optional[str] = None, description: Optional[str] = None, - doc: Optional[str] = None): - 
super().__init__(name, cls, description=description, doc=doc) + source_file: Optional[str] = None): + super().__init__(name, cls, description=description, source_file=source_file) - def write(self, values, append: bool = False, allow_newlines: bool = False, doc: Optional[str] = None): + def write(self, values, append: bool = False, allow_newlines: bool = False, source_file: Optional[str] = None): """Write an annotation to file. Existing annotation will be overwritten. 'values' should be a list of values. """ - io.write_annotation(self.doc or doc, self, values, append, allow_newlines) + io.write_annotation(self.source_file or source_file, self, values, append, allow_newlines) def exists(self): """Return True if annotation file exists.""" - return io.annotation_exists(self.doc, self) + return io.annotation_exists(self.source_file, self) -class OutputAllDocs(BaseOutput): - """Regular annotation or attribute used as output, but document must be specified for all actions.""" +class OutputAllSourceFiles(BaseOutput): + """Regular annotation or attribute used as output, but source file must be specified for all actions.""" - all_docs = True + all_files = True def __init__(self, name: str = "", cls: Optional[str] = None, description: Optional[str] = None): super().__init__(name, cls, description=description) - def write(self, values, doc: str, append: bool = False, allow_newlines: bool = False): + def write(self, values, source_file: str, append: bool = False, allow_newlines: bool = False): """Write an annotation to file. Existing annotation will be overwritten. 'values' should be a list of values. """ - io.write_annotation(doc, self, values, append, allow_newlines) + io.write_annotation(source_file, self, values, append, allow_newlines) - def exists(self, doc: str): + def exists(self, source_file: str): """Return True if annotation file exists.""" - return io.annotation_exists(doc, self) + return io.annotation_exists(source_file, self) class OutputData(BaseOutput): @@ -382,38 +393,38 @@ class OutputData(BaseOutput): data = True def __init__(self, name: str = "", cls: Optional[str] = None, description: Optional[str] = None, - doc: Optional[str] = None): - super().__init__(name, cls, description=description, doc=doc) + source_file: Optional[str] = None): + super().__init__(name, cls, description=description, source_file=source_file) def write(self, value, append: bool = False): """Write arbitrary string data to annotation file.""" - io.write_data(self.doc, self, value, append) + io.write_data(self.source_file, self, value, append) def exists(self): """Return True if annotation file exists.""" - return io.data_exists(self.doc, self) + return io.data_exists(self.source_file, self) -class OutputDataAllDocs(BaseOutput): - """Data annotation used as output, but document must be specified for all actions.""" +class OutputDataAllSourceFiles(BaseOutput): + """Data annotation used as output, but source file must be specified for all actions.""" - all_docs = True + all_files = True data = True def __init__(self, name: str = "", cls: Optional[str] = None, description: Optional[str] = None): super().__init__(name, cls, description=description) - def read(self, doc: str): + def read(self, source_file: str): """Read arbitrary string data from annotation file.""" - return io.read_data(doc, self) + return io.read_data(source_file, self) - def write(self, value, doc: str, append: bool = False): + def write(self, value, source_file: str, append: bool = False): """Write arbitrary string data to annotation file.""" - 
io.write_data(doc, self, value, append) + io.write_data(source_file, self, value, append) - def exists(self, doc: str): + def exists(self, source_file: str): """Return True if annotation file exists.""" - return io.data_exists(doc, self) + return io.data_exists(source_file, self) class OutputCommonData(BaseOutput): @@ -433,40 +444,40 @@ def write(self, value, append: bool = False): class Text: """Corpus text.""" - def __init__(self, doc: Optional[str] = None): - self.doc = doc + def __init__(self, source_file: Optional[str] = None): + self.source_file = source_file def read(self) -> str: """Get corpus text.""" - return io.read_data(self.doc, io.TEXT_FILE) + return io.read_data(self.source_file, io.TEXT_FILE) def write(self, text): """Write text to the designated file of a corpus. text is a unicode string. """ - io.write_data(self.doc, io.TEXT_FILE, text) + io.write_data(self.source_file, io.TEXT_FILE, text) def __repr__(self): return "" class SourceStructure(BaseAnnotation): - """Every annotation available in a source document.""" + """Every annotation available in a source file.""" data = True - def __init__(self, doc): - super().__init__(io.STRUCTURE_FILE, doc) + def __init__(self, source_file): + super().__init__(io.STRUCTURE_FILE, source_file) def read(self): """Read structure file.""" - return io.read_data(self.doc, self) + return io.read_data(self.source_file, self) def write(self, structure): - """Sort the document's structural elements and write structure file.""" + """Sort the source file's structural elements and write structure file.""" structure.sort() - io.write_data(self.doc, self, "\n".join(structure)) + io.write_data(self.source_file, self, "\n".join(structure)) class Headers(BaseAnnotation): @@ -474,32 +485,53 @@ class Headers(BaseAnnotation): data = True - def __init__(self, doc): - super().__init__(io.HEADERS_FILE, doc) + def __init__(self, source_file): + super().__init__(io.HEADERS_FILE, source_file) def read(self) -> List[str]: """Read headers file.""" - return io.read_data(self.doc, self).splitlines() + return io.read_data(self.source_file, self).splitlines() def write(self, header_annotations: List[str]): """Write headers file.""" - io.write_data(self.doc, self, "\n".join(header_annotations)) + io.write_data(self.source_file, self, "\n".join(header_annotations)) def exists(self): """Return True if headers file exists.""" - return io.data_exists(self.doc, self) + return io.data_exists(self.source_file, self) + +class Namespaces(BaseAnnotation): + """Namespace mapping (URI to prefix) for a source file.""" -class Document(str): - """Name of a source document.""" + data = True + + def __init__(self, source_file): + super().__init__(io.NAMESPACE_FILE, source_file) + + def read(self): + """Read namespace file and parse it into a dict.""" + try: + lines = io.read_data(self.source_file, self).split("\n") + return dict(l.split(" ") for l in lines) + except FileNotFoundError: + return {} + + def write(self, namespaces: Dict[str, str]): + """Write namespace file.""" + io.write_data(self.source_file, self, "\n".join([f"{k} {v}" for k, v in namespaces.items()])) + + +class SourceFilename(str): + """Name of a source file.""" class Corpus(str): """Name of the corpus.""" -class AllDocuments(List[str]): - """List with names of all source documents.""" +class AllSourceFilenames(List[str]): + """List with names of all source files.""" class Config(str): @@ -554,18 +586,18 @@ def write(self, data): """Write arbitrary string data to models directory.""" file_path = self.path 
os.makedirs(file_path.parent, exist_ok=True) - with open(file_path, "w") as f: + with open(file_path, "w", encoding="utf-8") as f: f.write(data) # Update file modification time even if nothing was written os.utime(file_path, None) - log.info("Wrote %d bytes: %s", len(data), self.name) + logger.info("Wrote %d bytes: %s", len(data), self.name) def read(self): """Read arbitrary string data from file in models directory.""" file_path = self.path - with open(file_path) as f: + with open(file_path, encoding="utf-8") as f: data = f.read() - log.debug("Read %d bytes: %s", len(data), self.name) + logger.debug("Read %d bytes: %s", len(data), self.name) return data def write_pickle(self, data, protocol=-1): @@ -576,24 +608,25 @@ def write_pickle(self, data, protocol=-1): pickle.dump(data, f, protocol=protocol) # Update file modification time even if nothing was written os.utime(file_path, None) - log.info("Wrote %d bytes: %s", len(data), self.name) + logger.info("Wrote %d bytes: %s", len(data), self.name) def read_pickle(self): """Read pickled data from file in models directory.""" file_path = self.path with open(file_path, "rb") as f: data = pickle.load(f) - log.debug("Read %d bytes: %s", len(data), self.name) + logger.debug("Read %d bytes: %s", len(data), self.name) return data def download(self, url: str): """Download file from url and save to modeldir.""" os.makedirs(self.path.parent, exist_ok=True) + logger.debug("Downloading from: %s", url) try: urllib.request.urlretrieve(url, self.path) - log.info("Successfully downloaded %s", self.name) + logger.info("Successfully downloaded %s", self.name) except Exception as e: - log.error("Download from %s failed", url) + logger.error("Download of %s from %s failed", self.name, url) raise e def unzip(self): @@ -601,7 +634,7 @@ def unzip(self): out_dir = self.path.parent with zipfile.ZipFile(self.path) as z: z.extractall(out_dir) - log.info("Successfully unzipped %s", self.name) + logger.info("Successfully unzipped %s", self.name) def ungzip(self, out: str): """Unzip gzip file inside modeldir.""" @@ -609,7 +642,7 @@ def ungzip(self, out: str): data = z.read() with open(out, "wb") as f: f.write(data) - log.info("Successfully unzipped %s", out) + logger.info("Successfully unzipped %s", out) def remove(self, raise_errors: bool = False): """Remove model file from disk.""" @@ -642,27 +675,21 @@ class Source: def __init__(self, source_dir: str = ""): self.source_dir = source_dir - def get_path(self, doc: Document, extension: str): - """Get the path of a document.""" + def get_path(self, source_file: SourceFilename, extension: str): + """Get the path of a source file.""" if not extension.startswith("."): extension = "." 
+ extension - if ":" in doc: - doc_name, _, doc_chunk = doc.partition(":") - source_file = pathlib.Path(self.source_dir, doc_name, doc_chunk + extension) + if ":" in source_file: + file_name, _, file_chunk = source_file.partition(":") + source_file = pathlib.Path(self.source_dir, file_name, file_chunk + extension) else: - source_file = pathlib.Path(self.source_dir, doc + extension) + source_file = pathlib.Path(self.source_dir, source_file + extension) return source_file class Export(str): """Export directory and filename pattern.""" - def __new__(cls, name: str, *args, **kwargs): - return super().__new__(cls, name) - - def __init__(self, name: str, absolute_path: bool = False): - self.absolute_path = absolute_path - class ExportInput(str): """Export directory and filename pattern, used as input.""" @@ -670,9 +697,8 @@ class ExportInput(str): def __new__(_cls, val: str, *args, **kwargs): return super().__new__(_cls, val) - def __init__(self, val: str, all_docs: bool = False, absolute_path: bool = False): - self.all_docs = all_docs - self.absolute_path = absolute_path + def __init__(self, val: str, all_files: bool = False): + self.all_files = all_files class ExportAnnotations(List[Tuple[Annotation, Optional[str]]]): @@ -687,7 +713,7 @@ def __init__(self, config_name: str, items=(), is_input: bool = True): self.is_input = is_input -class ExportAnnotationsAllDocs(List[Tuple[AnnotationAllDocs, Optional[str]]]): +class ExportAnnotationsAllSourceFiles(List[Tuple[AnnotationAllSourceFiles, Optional[str]]]): """List of annotations to include in export.""" # If is_input = False the annotations won't be added to the rule's input. @@ -702,13 +728,17 @@ def __init__(self, config_name: str, items=(), is_input: bool = True): class SourceAnnotations(List[Tuple[Annotation, Optional[str]]]): """List of source annotations to include in export.""" - # If is_input = False the annotations won't be added to the rule's input. - is_input = True + def __init__(self, config_name: str, items=()): + list.__init__(self, items) + self.config_name = config_name - def __init__(self, config_name: str, items=(), is_input: bool = True): + +class SourceAnnotationsAllSourceFiles(List[Tuple[AnnotationAllSourceFiles, Optional[str]]]): + """List of source annotations to include in export.""" + + def __init__(self, config_name: str, items=()): list.__init__(self, items) self.config_name = config_name - self.is_input = is_input class Language(str): @@ -734,7 +764,8 @@ def __init__(self, source_dir: pathlib.Path): def setup(self): """Return a list of wizard dictionaries with questions needed for setting up the class. - Answers to the questions will automatically be saved to self.answers.""" + Answers to the questions will automatically be saved to self.answers. + """ return {} @abstractmethod @@ -749,5 +780,6 @@ def get_annotations(self, corpus_config: dict) -> List[str]: def get_plain_annotations(self, corpus_config: dict) -> List[str]: """Return a list of plain annotations without attributes. - Each value has the format 'annotation'.""" + Each value has the format 'annotation'. + """ return [e for e in self.get_annotations(corpus_config) if ":" not in e] diff --git a/sparv/api/util/__init__.py b/sparv/api/util/__init__.py new file mode 100644 index 00000000..63122115 --- /dev/null +++ b/sparv/api/util/__init__.py @@ -0,0 +1,8 @@ +"""Utility functions and constants used by plugin modules.""" + +from . import constants +from . import export +from . import install +from . import misc +from . import mysql_wrapper +from . 
import system diff --git a/sparv/util/constants.py b/sparv/api/util/constants.py similarity index 86% rename from sparv/util/constants.py rename to sparv/api/util/constants.py index 3d3d36f7..aa0f7d96 100644 --- a/sparv/util/constants.py +++ b/sparv/api/util/constants.py @@ -11,6 +11,8 @@ # Namespace to be used in case annotation names collide and sparv_namespace is not set in config SPARV_DEFAULT_NAMESPACE = "sparv" +# Char used in annotations to separate a prefix from its tag name in XML namespaces +XML_NAMESPACE_SEP = "+" # Encodings: UTF8 = "UTF-8" diff --git a/sparv/util/export.py b/sparv/api/util/export.py similarity index 71% rename from sparv/util/export.py rename to sparv/api/util/export.py index b7dde516..42fdf817 100644 --- a/sparv/util/export.py +++ b/sparv/api/util/export.py @@ -1,25 +1,25 @@ """Util functions for corpus export.""" -import logging +import re import xml.etree.ElementTree as etree from collections import OrderedDict, defaultdict from copy import deepcopy from itertools import combinations from typing import Any, List, Optional, Tuple, Union -from sparv import util +from sparv.api import (Annotation, AnnotationAllSourceFiles, ExportAnnotations, ExportAnnotationsAllSourceFiles, + Headers, Namespaces, SourceStructure, SparvErrorMessage, get_logger, util) from sparv.core import io -from sparv.util import SPARV_DEFAULT_NAMESPACE, misc -from sparv.util.classes import (Annotation, AnnotationAllDocs, ExportAnnotations, ExportAnnotationsAllDocs, Headers, - SourceStructure) -log = logging.getLogger(__name__) +from .constants import SPARV_DEFAULT_NAMESPACE, XML_NAMESPACE_SEP + +logger = get_logger(__name__) def gather_annotations(annotations: List[Annotation], export_names, header_annotations=None, - doc: Optional[str] = None, + source_file: Optional[str] = None, flatten: bool = True, split_overlaps: bool = False): """Calculate the span hierarchy and the annotation_dict containing all annotation elements and attributes. @@ -28,7 +28,7 @@ def gather_annotations(annotations: List[Annotation], annotations: List of annotations to include export_names: Dictionary that maps from annotation names to export names header_annotations: List of header annotations - doc: The document name + source_file: The source filename flatten: Whether to return the spans as a flat list split_overlaps: Whether to split up overlapping spans """ @@ -82,16 +82,27 @@ def __lt__(self, other_span): 2. end position (larger indices first) 3. 
the calculated element hierarchy """ - def get_sort_key(span, sub_positions=False): + def get_sort_key(span, sub_positions=False, empty_span=False): """Return a sort key for span which makes span comparison possible.""" hierarchy_index = elem_hierarchy.index(span.name) if span.name in elem_hierarchy else -1 - if sub_positions: - return (span.start, span.start_sub), (-span.end, -span.end_sub), hierarchy_index + if empty_span: + if sub_positions: + return (span.start, span.start_sub), hierarchy_index, (span.end, span.end_sub) + else: + return span.start, hierarchy_index, span.end else: - return span.start, -span.end, hierarchy_index - + if sub_positions: + return (span.start, span.start_sub), (-span.end, -span.end_sub), hierarchy_index + else: + return span.start, -span.end, hierarchy_index + + # Sort empty spans according to hierarchy or put them first + if (self.start, self.start_sub) == (self.end, self.end_sub) or ( + other_span.start, other_span.start_sub) == (other_span.end, other_span.end_sub): + sort_key1 = get_sort_key(self, empty_span=True) + sort_key2 = get_sort_key(other_span, empty_span=True) # Both spans have sub positions - if self.start_sub and other_span.start_sub: + elif self.start_sub and other_span.start_sub: sort_key1 = get_sort_key(self, sub_positions=True) sort_key2 = get_sort_key(other_span, sub_positions=True) # At least one of the spans does not have sub positions @@ -99,9 +110,7 @@ def get_sort_key(span, sub_positions=False): sort_key1 = get_sort_key(self) sort_key2 = get_sort_key(other_span) - if sort_key1 < sort_key2: - return True - return False + return sort_key1 < sort_key2 if header_annotations is None: header_annotations = [] @@ -120,19 +129,32 @@ def get_sort_key(span, sub_positions=False): if attr and not annotation_dict[base_name].get(attr): annotation_dict[base_name][attr] = list(annotation.read()) elif is_header: - annotation_dict[base_name][util.HEADER_CONTENTS] = list( - Annotation(f"{base_name}:{util.HEADER_CONTENTS}", doc=doc).read(allow_newlines=True)) + try: + annotation_dict[base_name][util.constants.HEADER_CONTENTS] = list( + Annotation(f"{base_name}:{util.constants.HEADER_CONTENTS}", source_file=source_file).read( + allow_newlines=True)) + except FileNotFoundError: + raise SparvErrorMessage(f"Could not find data for XML header '{base_name}'. 
" + "Was this element listed in 'xml_import.header_elements'?") # Calculate hierarchy (if needed) and sort the span objects - elem_hierarchy = calculate_element_hierarchy(doc, spans_list) + elem_hierarchy = calculate_element_hierarchy(source_file, spans_list) sorted_spans = sorted(spans_list) # Add position information to sorted_spans spans_dict = defaultdict(list) for span in sorted_spans: + # Treat empty spans differently if span.start == span.end: - spans_dict[span.start].append(("open", span)) - spans_dict[span.end].append(("close", span)) + insert_index = len(spans_dict[span.start]) + if span.name in elem_hierarchy: + for i, (instruction, s) in enumerate(spans_dict[span.start]): + if instruction == "close": + if s.name in elem_hierarchy and elem_hierarchy.index(s.name) < elem_hierarchy.index(span.name): + insert_index = i + break + spans_dict[span.start].insert(insert_index, ("open", span)) + spans_dict[span.end].insert(insert_index + 1, ("close", span)) else: # Append opening spans; prepend closing spans spans_dict[span.start].append(("open", span)) @@ -196,7 +218,7 @@ def _handle_overlaps(spans_dict): subposition_shift += 1 -def calculate_element_hierarchy(doc, spans_list): +def calculate_element_hierarchy(source_file, spans_list): """Calculate the hierarchy for spans with identical start and end positions. If two spans A and B have identical start and end positions, go through all occurrences of A and B @@ -204,9 +226,20 @@ def calculate_element_hierarchy(doc, spans_list): """ # Find elements with identical spans span_duplicates = defaultdict(set) + start_positions = defaultdict(set) + end_positions = defaultdict(set) + empty_span_starts = set() for span in spans_list: span_duplicates[(span.start, span.end)].add(span.name) - span_duplicates = [v for k, v in span_duplicates.items() if len(v) > 1] + start_positions[span.start].add(span.name) + end_positions[span.end].add(span.name) + if span.start == span.end: + empty_span_starts.add(span.start) + span_duplicates = [v for v in span_duplicates.values() if len(v) > 1] + # Add empty spans and spans with identical start positions + for span_start in empty_span_starts: + span_duplicates.append(start_positions[span_start]) + span_duplicates.append(end_positions[span_start]) # Flatten structure unclear_spans = set([elem for elem_set in span_duplicates for elem in elem_set]) @@ -216,17 +249,18 @@ def calculate_element_hierarchy(doc, spans_list): # Order each pair into [parent, children] ordered_pairs = set() for a, b in relation_pairs: - a_annot = Annotation(a, doc=doc) - b_annot = Annotation(b, doc=doc) + a_annot = Annotation(a, source_file=source_file) + b_annot = Annotation(b, source_file=source_file) a_parent = len([i for i in (b_annot.get_parents(a_annot)) if i is not None]) b_parent = len([i for i in (a_annot.get_parents(b_annot)) if i is not None]) if a_parent > b_parent: ordered_pairs.add((a, b)) - else: + elif a_parent < b_parent: ordered_pairs.add((b, a)) hierarchy = [] - error_msg = "Something went wrong while sorting annotation elements. Could there be circular relations?" + error_msg = ("Something went wrong while sorting annotation elements. Could there be circular relations? 
" + "The following elements could not be sorted: ") # Loop until all unclear_spans are processed while unclear_spans: size = len(unclear_spans) @@ -240,25 +274,26 @@ def calculate_element_hierarchy(doc, spans_list): if pair[0] == span: ordered_pairs.remove(pair) # Check that unclear_spans is getting smaller, otherwise there might be circularity - assert len(unclear_spans) < size, error_msg + assert len(unclear_spans) < size, error_msg + " ".join(unclear_spans) return hierarchy -def get_available_source_annotations(doc: Optional[str] = None, docs: Optional[List[str]] = None) -> List[str]: - """Get a list of available annotations generated from the source, either for a single document or multiple.""" - assert doc or docs, "Either 'doc' or 'docs' must be provided" +def get_available_source_annotations(source_file: Optional[str] = None, + source_files: Optional[List[str]] = None) -> List[str]: + """Get a list of available annotations generated from the source, either for a single source file or multiple.""" + assert source_file or source_files, "Either 'source_file' or 'source_files' must be provided" available_source_annotations = set() - if docs: - for d in docs: + if source_files: + for d in source_files: available_source_annotations.update(SourceStructure(d).read().split()) else: - available_source_annotations.update(SourceStructure(doc).read().split()) + available_source_annotations.update(SourceStructure(source_file).read().split()) return sorted(available_source_annotations) -def get_source_annotations(source_annotation_names: Optional[List[str]], doc: Optional[str] = None, - docs: Optional[List[str]] = None): +def get_source_annotations(source_annotation_names: Optional[List[str]], source_file: Optional[str] = None, + source_files: Optional[List[str]] = None): """Given a list of source annotation names (and possible export names), return a list of annotation objects. If no names are provided all available source annotations will be returnd. 
@@ -268,33 +303,34 @@ def get_source_annotations(source_annotation_names: Optional[List[str]], doc: Op return [] # Get list of available source annotation names - available_source_annotations = get_available_source_annotations(doc, docs) + available_source_annotations = get_available_source_annotations(source_file, source_files) # Parse source_annotation_names - annotation_names = misc.parse_annotation_list(source_annotation_names, available_source_annotations) + annotation_names = util.misc.parse_annotation_list(source_annotation_names, available_source_annotations) # Make sure source_annotations doesn't include annotations not in source - source_annotations = [(Annotation(a[0], doc) if doc else AnnotationAllDocs(a[0]), a[1]) for a in + source_annotations = [(Annotation(a[0], source_file) if source_file else AnnotationAllSourceFiles(a[0]), a[1]) for a in annotation_names if a[0] in available_source_annotations] return source_annotations -def get_annotation_names(annotations: Union[ExportAnnotations, ExportAnnotationsAllDocs], +def get_annotation_names(annotations: Union[ExportAnnotations, ExportAnnotationsAllSourceFiles], source_annotations=None, - doc: Optional[str] = None, docs: Optional[List[str]] = None, + source_file: Optional[str] = None, source_files: Optional[List[str]] = None, token_name: Optional[str] = None, remove_namespaces=False, keep_struct_names=False, sparv_namespace: Optional[str] = None, - source_namespace: Optional[str] = None): + source_namespace: Optional[str] = None, + xml_mode: Optional[bool] = False): """Get a list of annotations, token attributes and a dictionary for renamed annotations. Args: annotations: List of elements:attributes (annotations) to include. - source_annotations: List of elements:attributes from the original document to include. If not specified, + source_annotations: List of elements:attributes from the source file to include. If not specified, everything will be included. - doc: Name of the source document. - docs: List of names of source documents (alternative to `doc`). + source_file: Name of the source file. + source_files: List of names of source files (alternative to `source_file`). token_name: Name of the token annotation. remove_namespaces: Remove all namespaces in export_names unless names are ambiguous. keep_struct_names: For structural attributes (anything other than token), include the annotation base name @@ -307,7 +343,7 @@ def get_annotation_names(annotations: Union[ExportAnnotations, ExportAnnotations export names. 
""" # Get source annotations - source_annotations = get_source_annotations(source_annotations, doc, docs) + source_annotations = get_source_annotations(source_annotations, source_file, source_files) # Combine all annotations all_annotations = _remove_duplicates(annotations + source_annotations) @@ -319,33 +355,41 @@ def get_annotation_names(annotations: Union[ExportAnnotations, ExportAnnotations else: token_attributes = [] + # Get XML namespaces + xml_namespaces = Namespaces(source_file).read() + export_names = _create_export_names(all_annotations, token_name, remove_namespaces, keep_struct_names, - source_annotations, sparv_namespace, source_namespace) + source_annotations, sparv_namespace, source_namespace, xml_namespaces, + xml_mode=xml_mode) return [i[0] for i in all_annotations], token_attributes, export_names def get_header_names(header_annotation_names: Optional[List[str]], - doc: Optional[str] = None, - docs: Optional[List[str]] = None): + source_file: Optional[str] = None, + source_files: Optional[List[str]] = None): """Get a list of header annotations and a dictionary for renamed annotations.""" # Get source_header_names from headers file if it exists source_header_names = [] - if docs: - for d in docs: - h = Headers(d) + if source_files: + for f in source_files: + h = Headers(f) if h.exists(): source_header_names.extend(h.read()) source_header_names = list(set(source_header_names)) - elif Headers(doc).exists(): - source_header_names = Headers(doc).read() + elif Headers(source_file).exists(): + source_header_names = Headers(source_file).read() # Parse header_annotation_names and convert to annotations - annotation_names = misc.parse_annotation_list(header_annotation_names, source_header_names) - header_annotations = [(Annotation(a[0], doc) if doc else AnnotationAllDocs(a[0]), a[1]) for a in + annotation_names = util.misc.parse_annotation_list(header_annotation_names, source_header_names) + header_annotations = [(Annotation(a[0], source_file) if source_file else AnnotationAllSourceFiles(a[0]), a[1]) for a in annotation_names] - export_names = _create_export_names(header_annotations, None, False, keep_struct_names=False) + # Get XML namespaces + xml_namespaces = Namespaces(source_file).read() + + export_names = _create_export_names(header_annotations, None, False, keep_struct_names=False, + xml_namespaces=xml_namespaces, xml_mode=True) return [a[0] for a in header_annotations], export_names @@ -359,13 +403,15 @@ def _remove_duplicates(annotation_tuples): return list(new_annotations.items()) -def _create_export_names(annotations: List[Tuple[Union[Annotation, AnnotationAllDocs], Any]], +def _create_export_names(annotations: List[Tuple[Union[Annotation, AnnotationAllSourceFiles], Any]], token_name: Optional[str], remove_namespaces: bool, keep_struct_names: bool, source_annotations: list = [], sparv_namespace: Optional[str] = None, - source_namespace: Optional[str] = None): + source_namespace: Optional[str] = None, + xml_namespaces: Optional[dict] = None, + xml_mode: Optional[bool] = False): """Create dictionary for renamed annotations.""" if remove_namespaces: def shorten(annotation): @@ -375,10 +421,19 @@ def shorten(annotation): segment.token -> token segment.token:saldo.baseform -> segment.token:baseform """ + def remove_before_dot(name): + # Always remove "custom." + if name.startswith("custom."): + name = name[7:] + # Remove everything before first "." + if "." 
in name: + name = name.split(".", 1)[1] + return name + if annotation.attribute_name: - short = io.join_annotation(annotation.annotation_name, annotation.attribute_name.split(".")[-1]) + short = io.join_annotation(annotation.annotation_name, remove_before_dot(annotation.attribute_name)) else: - short = io.join_annotation(annotation.annotation_name.split(".")[-1], None) + short = io.join_annotation(remove_before_dot(annotation.annotation_name), None) return short # Create short names dictionary and count @@ -386,7 +441,7 @@ def shorten(annotation): short_names = {} for annotation, new_name in annotations: name = annotation.name - # Don't remove namespaces from elements and attributes contained in the original documents + # Don't remove namespaces from elements and attributes contained in the source files if (annotation, new_name) in source_annotations: short_name = name else: @@ -436,11 +491,32 @@ def shorten(annotation): source_namespace) export_names = _check_name_collision(export_names, source_annotations) + # Take care of XML namespaces + export_names = {k: _get_xml_tagname(v, xml_namespaces, xml_mode) for k, v in export_names.items()} + return export_names +def _get_xml_tagname(tag, xml_namespaces, xml_mode=False): + """Take care of namespaces by looking up URIs for prefixes (if xml_mode=True) or by converting to dot notation.""" + sep = re.escape(XML_NAMESPACE_SEP) + m = re.match(fr"(.*){sep}(.+)", tag) + if m: + if xml_mode: + # Replace prefix+tag with {uri}tag + uri = xml_namespaces.get(m.group(1), "") + if not uri: + raise SparvErrorMessage(f"You are trying to export the annotation '{tag}' but no URI was found for the " + f"namespace prefix '{m.group(1)}'!") + return re.sub(fr"(.*){sep}(.+)", fr"{{{uri}}}\2", tag) + elif m.group(1): + # Replace "prefix+tag" with "prefix.tag", skip this for default namespaces + return re.sub(fr"(.*){sep}(.+)", fr"\1.\2", tag) + return tag + + def _add_global_namespaces(export_names: dict, - annotations: List[Tuple[Union[Annotation, AnnotationAllDocs], Any]], + annotations: List[Tuple[Union[Annotation, AnnotationAllSourceFiles], Any]], source_annotations: list, sparv_namespace: Optional[str] = None, source_namespace: Optional[str] = None): @@ -467,7 +543,8 @@ def _check_name_collision(export_names, source_annotations): # Get annotations with identical export attribute names reverse_index = defaultdict(set) for k, v in export_names.items(): - reverse_index[v].add(k) + if ":" in k: + reverse_index[v].add(k) possible_collisions = {k: [Annotation(v) for v in values] for k, values in reverse_index.items() if len(values) > 1} # Only keep the ones with matching element names for attr, values in possible_collisions.items(): @@ -482,15 +559,15 @@ def _check_name_collision(export_names, source_annotations): source_annot = annots[0] if annots[0].name in source_names else annots[1] new_name = SPARV_DEFAULT_NAMESPACE + "." 
+ export_names[sparv_annot.name] export_names[sparv_annot.name] = new_name - log.info("Changing name of automatic annotation '{}' to '{}' due to collision with '{}'.".format( - sparv_annot.name, new_name, source_annot.name)) + logger.info("Changing name of automatic annotation '{}' to '{}' due to collision with '{}'.".format( + sparv_annot.name, new_name, source_annot.name)) # Warn the user if we cannot resolve collisions automatically else: annots_string = "\n".join([f"{a.name} ({'source' if a.name in source_names else 'sparv'} annotation)" for a in annots]) - log.warning("The following annotations are exported with the same name ({}) and might overwrite " - "each other: \n\n{}\n\nIf you want to keep all of these annotations you can change their " - "export names.".format(attr, annots_string)) + logger.warning("The following annotations are exported with the same name ({}) and might overwrite " + "each other: \n\n{}\n\nIf you want to keep all of these annotations you can change " + "their export names.".format(attr, annots_string)) return export_names ################################################################################ @@ -521,7 +598,7 @@ def _reorder_spans(span_positions, chunk_name: str, chunk_order): for _pos, instruction, span in span_positions: if instruction == "open": - if span.name == chunk_name: + if span.name == chunk_name and current_s_index is None: # Check current_s_index to avoid nested chunks current_s_index = int(chunk_order[span.index]) for temp_instruction, temp_span in temp_stack: @@ -552,7 +629,8 @@ def _reorder_spans(span_positions, chunk_name: str, chunk_order): if current_s_index is not None: # Encountered child to chunk new_s_order[current_s_index].append((instruction, span)) - if span.name == chunk_name: + # If chunk, check index to make sure it's the right chunk and not a nested one + if span.name == chunk_name and int(chunk_order[span.index]) == current_s_index: last_s_index = current_s_index current_s_index = None else: @@ -565,23 +643,32 @@ def _reorder_spans(span_positions, chunk_name: str, chunk_order): def _fix_parents(new_s_order, chunk_name): """Go through new_span_positions, remove duplicate opened parents and close parents.""" open_parents = [] - for s_index, chunk in sorted(new_s_order.items()): + new_s_order_indices = sorted(new_s_order.keys()) + for i, s_index in enumerate(new_s_order_indices): + chunk = new_s_order[s_index] is_parent = True + current_chunk_index = None for instruction, span in chunk: if instruction == "open": - if span.name == chunk_name: + if span.name == chunk_name and current_chunk_index is None: is_parent = False + current_chunk_index = span.index elif is_parent: open_parents.append((instruction, span)) - else: - if span.name == chunk_name: + else: # "close" + # If chunk, check index to make sure it's the right chunk and not a nested one + if span.name == chunk_name and span.index == current_chunk_index: is_parent = True + current_chunk_index = None elif is_parent: if open_parents[-1][1] == span: open_parents.pop() # Check next chunk: close parents in current chunk that are not part of next chunk and # remove already opened parents from next chunk - next_chunk = new_s_order.get(s_index + 1, []) + if i < len(new_s_order_indices) - 1: + next_chunk = new_s_order[new_s_order_indices[i + 1]] + else: + next_chunk = [] for p in reversed(open_parents): if p in next_chunk: next_chunk.remove(p) diff --git a/sparv/util/install.py b/sparv/api/util/install.py similarity index 74% rename from sparv/util/install.py rename to 
sparv/api/util/install.py index 4445d901..2d542091 100644 --- a/sparv/util/install.py +++ b/sparv/api/util/install.py @@ -1,16 +1,16 @@ """Util functions for installations on remote servers.""" -import logging import os import subprocess from glob import glob -from sparv.util import system +from sparv.api import get_logger +from sparv.api.util import system -log = logging.getLogger(__name__) +logger = get_logger(__name__) -def install_file(host, local_file, remote_file): +def install_file(local_file, host=None, remote_file=None): """Rsync a file to a target host.""" system.rsync(local_file, host, remote_file) @@ -40,11 +40,11 @@ def install_mysql(host, db_name, sqlfile): for sqlf in sqlfiles: file_count += 1 if not os.path.exists(sqlf): - log.error("Missing SQL file: %s", sqlf) + logger.error("Missing SQL file: %s", sqlf) elif os.path.getsize(sqlf) < 10: - log.info("Skipping empty file: %s (%d/%d)", sqlf, file_count, file_total) + logger.info("Skipping empty file: %s (%d/%d)", sqlf, file_count, file_total) else: - log.info("Installing MySQL database: %s, source: %s (%d/%d)", db_name, sqlf, file_count, file_total) + logger.info("Installing MySQL database: %s, source: %s (%d/%d)", db_name, sqlf, file_count, file_total) subprocess.check_call('cat %s | ssh %s "mysql %s"' % (sqlf, host, db_name), shell=True) @@ -52,6 +52,6 @@ def install_mysql_dump(host, db_name, tables): """Copy selected tables (including data) from local to remote MySQL database.""" if isinstance(tables, str): tables = tables.split() - log.info("Copying MySQL database: %s, tables: %s", db_name, ", ".join(tables)) + logger.info("Copying MySQL database: %s, tables: %s", db_name, ", ".join(tables)) subprocess.check_call('mysqldump %s %s | ssh %s "mysql %s"' % (db_name, " ".join(tables), host, db_name), shell=True) diff --git a/sparv/util/misc.py b/sparv/api/util/misc.py similarity index 81% rename from sparv/util/misc.py rename to sparv/api/util/misc.py index 541c395a..ba175e50 100644 --- a/sparv/util/misc.py +++ b/sparv/api/util/misc.py @@ -1,42 +1,15 @@ """Misc util functions.""" -import logging import pathlib import re import unicodedata -from collections import defaultdict, OrderedDict -from typing import List, Optional, Union, Tuple +from collections import OrderedDict, defaultdict +from typing import List, Optional, Tuple, Union -from .classes import Annotation, Model +from sparv.api import get_logger +from sparv.api.classes import Annotation, Model -_log = logging.getLogger(__name__) - - -class SparvErrorMessage(Exception): - """Exception used to notify users of errors in a friendly way without displaying traceback.""" - - start_marker = "<<>>" - end_marker = "<<>>" - - def __init__(self, message, module="", function=""): - """Raise an error and notify user of the problem in a friendly way. - - Args: - message: Error message. - module: Name of module where error occurred (optional, not used in Sparv modules) - function: Name of function where error occurred (optional, not used in Sparv modules) - """ - self.message = message - # Alter message before calling base class - super().__init__("{}{}\n{}\n{}{}".format(SparvErrorMessage.start_marker, module, function, message, - SparvErrorMessage.end_marker)) - - -def get_logger(name): - """Get a logger that is a child of 'sparv.modules'.""" - if not name.startswith("sparv.modules"): - name = "sparv.modules." 
+ name - return logging.getLogger(name) +logger = get_logger(__name__) def parse_annotation_list(annotation_names: Optional[List[str]], all_annotations: Optional[List[str]] = None, @@ -71,7 +44,7 @@ def parse_annotation_list(annotation_names: Optional[List[str]], all_annotations result: OrderedDict = OrderedDict() for a in annotation_names: # Check if this annotation should be omitted - if a.startswith("not "): + if a.startswith("not ") and " as " not in a: omit_annotations.add(a[4:]) elif a == "...": include_rest = True @@ -95,7 +68,7 @@ def parse_annotation_list(annotation_names: Optional[List[str]], all_annotations # Add all_annotations to result if required if include_rest and all_annotations: - for a in set(all_annotations).difference(omit_annotations): + for a in [a for a in all_annotations if not a in omit_annotations]: if a not in result: result[a] = None plain_name, _ = Annotation(a).split() @@ -104,7 +77,7 @@ def parse_annotation_list(annotation_names: Optional[List[str]], all_annotations # Add annotations names without attributes to result if required if add_plain_annotations: - for a in possible_plain_annotations.difference(plain_annotations): + for a in sorted(possible_plain_annotations.difference(plain_annotations)): if a not in result: result[a] = None @@ -198,9 +171,9 @@ def test_lexicon(lexicon: dict, testwords): Takes a dictionary ('lexicon') and a list of test words that are expected to occur as keys in 'lexicon'. Prints the value for each test word. """ - _log.info("Testing annotations...") + logger.info("Testing annotations...") for key in testwords: - _log.info(" %s = %s", key, lexicon.get(key)) + logger.info(" %s = %s", key, lexicon.get(key)) class PickledLexicon: @@ -211,11 +184,11 @@ def __init__(self, picklefile: Union[pathlib.Path, Model], verbose=True): import pickle picklefile_path: pathlib.Path = picklefile.path if isinstance(picklefile, Model) else picklefile if verbose: - _log.info("Reading lexicon: %s", picklefile) + logger.info("Reading lexicon: %s", picklefile) with open(picklefile_path, "rb") as F: self.lexicon = pickle.load(F) if verbose: - _log.info("OK, read %d words", len(self.lexicon)) + logger.info("OK, read %d words", len(self.lexicon)) def lookup(self, key, default=set()): """Lookup a key in the lexicon.""" diff --git a/sparv/util/mysql_wrapper.py b/sparv/api/util/mysql_wrapper.py similarity index 97% rename from sparv/util/mysql_wrapper.py rename to sparv/api/util/mysql_wrapper.py index f8b12a0e..cd5549b6 100644 --- a/sparv/util/mysql_wrapper.py +++ b/sparv/api/util/mysql_wrapper.py @@ -1,11 +1,11 @@ """Util function for creating mysql files.""" -import logging import os +from sparv.api import get_logger from . 
import system -log = logging.getLogger(__name__) +logger = get_logger(__name__) # Max size of SQL statement MAX_ALLOWED_PACKET = 900000 @@ -42,9 +42,9 @@ def execute(self, sql, *args): # Execute SQL statement out, err = system.call_binary(self.binaries, self.arguments, sql % args, encoding=self.encoding) if out: - log.info("MySQL: %s", out) + logger.info("MySQL: %s", out) if err: - log.error("MySQL: %s", err) + logger.error("MySQL: %s", err) # return out def create_table(self, table, drop, columns, primary=None, indexes=None, constraints=None, **kwargs): diff --git a/sparv/util/system.py b/sparv/api/util/system.py similarity index 79% rename from sparv/util/system.py rename to sparv/api/util/system.py index 9fb35519..8b754eea 100644 --- a/sparv/util/system.py +++ b/sparv/api/util/system.py @@ -1,16 +1,15 @@ """System utility functions.""" import errno -import logging import os import shutil import subprocess -import sys from typing import Optional, Union import sparv.core.paths as paths +from sparv.api import get_logger, SparvErrorMessage -log = logging.getLogger(__name__) +logger = get_logger(__name__) def kill_process(process): @@ -27,7 +26,7 @@ def kill_process(process): def clear_directory(path): """Create a new empty dir. - Remove it's contents if it already exists. + Remove its contents if it already exists. """ shutil.rmtree(path, ignore_errors=True) os.makedirs(path, exist_ok=True) @@ -65,11 +64,11 @@ def call_binary(name, arguments=(), stdin="", raw_command=None, search_paths=(), If return_command is set, then the process is returned. """ - from subprocess import Popen, PIPE + from subprocess import PIPE, Popen assert isinstance(arguments, (list, tuple)) assert isinstance(stdin, (str, list, tuple)) - binary = find_binary(name, search_paths) + binary = find_binary(name, search_paths, raise_error=True) if raw_command: use_shell = True command = raw_command % binary @@ -81,7 +80,7 @@ def call_binary(name, arguments=(), stdin="", raw_command=None, search_paths=(), stdin = "\n".join(stdin) if encoding is not None and isinstance(stdin, str): stdin = stdin.encode(encoding) - log.info("CALL: %s", " ".join(str(c) for c in command) if not raw_command else command) + logger.info("CALL: %s", " ".join(str(c) for c in command) if not raw_command else command) command = Popen(command, shell=use_shell, stdin=PIPE, stdout=PIPE, stderr=(None if verbose else PIPE), @@ -92,9 +91,9 @@ def call_binary(name, arguments=(), stdin="", raw_command=None, search_paths=(), stdout, stderr = command.communicate(stdin) if not allow_error and command.returncode: if stdout: - log.info(stdout.decode()) + logger.info(stdout.decode()) if stderr: - log.warning(stderr.decode()) + logger.warning(stderr.decode()) raise OSError("%s returned error code %d" % (binary, command.returncode)) if encoding: stdout = stdout.decode(encoding) @@ -141,26 +140,34 @@ def find_binary(name: Union[str, list], search_paths=(), executable: bool = True return path_to_bin if raise_error: - raise LookupError("Couldn't find binary: %s\nSearched in: %s\nFor binary names: %s" % - (name[0], ", ".join(search_paths), ", ".join(binary))) + err_msg = f"Couldn't find binary: {name[0]}\nSearched in: {', '.join(search_paths)}\n" + if len(name) > 1: + err_msg += f"For binary names: {', '.join(name)}" + raise SparvErrorMessage(err_msg) else: return None -def rsync(local, host, remote=None): +def rsync(local, host=None, remote=None): """Transfer files and/or directories using rsync. When syncing directories, extraneous files in destination dirs are deleted. 
""" + assert host or remote, "Either 'host' or 'remote' must be set." if remote is None: remote = local + remote_dir = os.path.dirname(remote) + if os.path.isdir(local): - remote_dir = os.path.dirname(remote) - log.info("Copying directory: %s => %s", local, remote) - args = ["--recursive", "--delete", "%s/" % local] + logger.info(f"Copying directory: {local} => {host + ':' if host else ''}{remote}") + args = ["--recursive", "--delete", f"{local}/"] else: - remote_dir = os.path.dirname(remote) - log.info("Copying file: %s => %s", local, remote) + logger.info(f"Copying file: {local} => {host + ':' if host else ''}{remote}") args = [local] - subprocess.check_call(["ssh", host, "mkdir -p '%s'" % remote_dir]) - subprocess.check_call(["rsync"] + args + ["%s:%s" % (host, remote)]) + + if host: + subprocess.check_call(["ssh", host, f"mkdir -p '{remote_dir}'"]) + subprocess.check_call(["rsync"] + args + [f"{host}:{remote}"]) + else: + subprocess.check_call(["mkdir", "-p", f"'{remote_dir}'"]) + subprocess.check_call(["rsync"] + args + [remote]) diff --git a/sparv/util/tagsets/__init__.py b/sparv/api/util/tagsets/__init__.py similarity index 55% rename from sparv/util/tagsets/__init__.py rename to sparv/api/util/tagsets/__init__.py index 7f978cd3..b81d643f 100644 --- a/sparv/util/tagsets/__init__.py +++ b/sparv/api/util/tagsets/__init__.py @@ -1,3 +1,2 @@ -from .tagmappings import join_tag, mappings, split_tag, tags from .pos_to_upos import pos_to_upos from .suc_to_feats import suc_to_feats diff --git a/sparv/util/tagsets/pos_to_upos.py b/sparv/api/util/tagsets/pos_to_upos.py similarity index 100% rename from sparv/util/tagsets/pos_to_upos.py rename to sparv/api/util/tagsets/pos_to_upos.py diff --git a/sparv/util/tagsets/suc_to_feats.py b/sparv/api/util/tagsets/suc_to_feats.py similarity index 100% rename from sparv/util/tagsets/suc_to_feats.py rename to sparv/api/util/tagsets/suc_to_feats.py diff --git a/sparv/api/util/tagsets/tagmappings.py b/sparv/api/util/tagsets/tagmappings.py new file mode 100644 index 00000000..7c737c52 --- /dev/null +++ b/sparv/api/util/tagsets/tagmappings.py @@ -0,0 +1,1420 @@ +"""This module contains translations between Saldo, SUC, Parole and Granska-ish tagsets. 
+ +The Parole and SUC tags are described here: + http://spraakbanken.gu.se/parole/tags.phtml + +* Constants: + +TAGSEP = ".": a non-space separator between parts of POS/MSD attributes + +* Functions: + +split_tag: splits a SUC or Saldo tag into a pair (pos/part-of-speech, msd/morphology) +join_tag: joins a SUC or Saldo {pos:.., msd:..} record into a tag + +* Tagsets: + +simple_tags: the pos part of SUC tags +suc_tags: all SUC tags +parole_tags: all Parole tags +granska_tags: all Granska-ish tags +saldo_tags: all Saldo tags + +* Dictionaries with descriptions: + +suc_descriptions: 1-1 mapping between SUC tags and a Swedish description + +* Dictionaries for tag conversion: + +suc_to_simple: manu-1 mapping between SUC (msd) and SUC (pos) + +suc_to_parole: 1-1 mapping between SUC and Parole +parole_to_suc: 1-1 mapping between Parole and SUC + +granska_to_parole: many-1 mapping between Granska-ish and Parole +granska_to_suc: many-1 mapping between Granska-ish and SUC +parole_to_granska: 1-many mapping between Parole and Granska-ish +suc_to_granska: 1-many mapping between SUC and Granska-ish + +saldo_to_suc: 1-many mapping between Saldo and SUC +saldo_to_granska: 1-many mapping between Saldo and Granska-ish +saldo_to_parole: 1-many mapping between Saldo and Parole +saldo_to_saldo: 1-many identity mapping of Saldo tags +""" + +TAGSEP = "." + + +def split_tag(tag, sep=TAGSEP): + """Split a tag "X.Y.Z" into a tuple ("X", "Y.Z").""" + pos_msd = tag.split(sep, 1) + if len(pos_msd) == 1: + return pos_msd[0], "" + else: + return pos_msd + + +def join_tag(tag, sep=TAGSEP): + """Join a complex tag into a string. + + The tag can be a dict {"pos": pos, "msd": msd} or a tuple (pos, msd). + """ + if isinstance(tag, dict): + pos, msd = tag["pos"], tag["msd"] + else: + pos, msd = tag + return pos + sep + msd if msd else pos + + +suc_descriptions = { + "AB": "adverb", + "AB.AN": "adverb förkortning", + "AB.KOM": "adverb komparativ", + "AB.POS": "adverb positiv", + "AB.SMS": "adverb sammansättningsform", + "AB.SUV": "adverb superlativ", + "MAD": "meningsskiljande interpunktion", + "MID": "interpunktion", + "PAD": "interpunktion", + "DT.AN": "determinerare förkortning", + "DT.MAS.SIN.DEF": "determinerare maskulinum singularis bestämd", + "DT.MAS.SIN.IND": "determinerare maskulinum singularis obestämd", + "DT.NEU.SIN.DEF": "determinerare neutrum singularis bestämd", + "DT.NEU.SIN.IND": "determinerare neutrum singularis obestämd", + "DT.NEU.SIN.IND+DEF": "determinerare neutrum singularis obestämd/bestämd", + "DT.UTR.SIN.DEF": "determinerare utrum singularis bestämd", + "DT.UTR.SIN.IND": "determinerare utrum singularis obestämd", + "DT.UTR.SIN.IND+DEF": "determinerare utrum singularis obestämd/bestämd", + "DT.UTR+NEU.PLU.DEF": "determinerare utrum/neutrum pluralis bestämd", + "DT.UTR+NEU.PLU.IND": "determinerare utrum/neutrum pluralis obestämd", + "DT.UTR+NEU.PLU.IND+DEF": "determinerare utrum/neutrum pluralis obestämd/bestämd", + "DT.UTR+NEU.SIN.DEF": "determinerare utrum/neutrum singularis bestämd", + "DT.UTR+NEU.SIN.IND": "determinerare utrum/neutrum singularis obestämd", + "DT.UTR+NEU.SIN+PLU.IND": "determinerare utrum/neutrum singularis/pluralis obestämd", + "HA": "frågande/relativt adverb", + "HD.NEU.SIN.IND": "frågande/relativ determinerare neutrum singularis obestämd", + "HD.UTR.SIN.IND": "frågande/relativ determinerare utrum singularis obestämd", + "HD.UTR+NEU.PLU.IND": "frågande/relativ determinerare utrum/neutrum pluralis obestämd", + "HP.-.-.-": "frågande/relativt pronomen", + "HP.NEU.SIN.IND": 
"frågande/relativt pronomen neutrum singularis obestämd", + "HP.NEU.SIN.IND.SMS": "frågande/relativt pronomen neutrum singularis obestämd sammansättningsform", + "HP.UTR.SIN.IND": "frågande/relativt pronomen utrum singularis obestämd", + "HP.UTR+NEU.PLU.IND": "frågande/relativt pronomen utrum/neutrum pluralis obestämd", + "HS.DEF": "frågande/relativt possesivt pronomen bestämd", + "IE": "infinitivmärke", + "IN": "interjektion", + "JJ.AN": "adjektiv förkortning", + "JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.GEN": "adjektiv komparativ utrum/neutrum singularis/pluralis obestämd/bestämd genitiv", + "JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.NOM": "adjektiv komparativ utrum/neutrum singularis/pluralis obestämd/bestämd nominativ", + "JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.SMS": "adjektiv komparativ utrum/neutrum singularis/pluralis obestämd/bestämd sammansättningsform", + "JJ.POS.MAS.SIN.DEF.GEN": "adjektiv positiv maskulinum singularis bestämd genitiv", + "JJ.POS.MAS.SIN.DEF.NOM": "adjektiv positiv maskulinum singularis bestämd nominativ", + "JJ.POS.NEU.SIN.IND.GEN": "adjektiv positiv neutrum singularis obestämd genitiv", + "JJ.POS.NEU.SIN.IND.NOM": "adjektiv positiv neutrum singularis obestämd nominativ", + "JJ.POS.NEU.SIN.IND+DEF.NOM": "adjektiv positiv neutrum singularis obestämd/bestämd nominativ", + "JJ.POS.UTR.-.-.SMS": "adjektiv positiv utrum sammansättningsform", + "JJ.POS.UTR.SIN.IND.GEN": "adjektiv positiv utrum singularis obestämd genitiv", + "JJ.POS.UTR.SIN.IND.NOM": "adjektiv positiv utrum singularis obestämd nominativ", + "JJ.POS.UTR.SIN.IND+DEF.NOM": "adjektiv positiv utrum singularis obestämd/bestämd nominativ", + "JJ.POS.UTR+NEU.-.-.SMS": "adjektiv positiv utrum/neutrum sammansättningsform", + "JJ.POS.UTR+NEU.PLU.IND.NOM": "adjektiv positiv utrum/neutrum pluralis obestämd nominativ", + "JJ.POS.UTR+NEU.PLU.IND+DEF.GEN": "adjektiv positiv utrum/neutrum pluralis obestämd/bestämd genitiv", + "JJ.POS.UTR+NEU.PLU.IND+DEF.NOM": "adjektiv positiv utrum/neutrum pluralis obestämd/bestämd nominativ", + "JJ.POS.UTR+NEU.SIN.DEF.GEN": "adjektiv positiv utrum/neutrum singularis bestämd genitiv", + "JJ.POS.UTR+NEU.SIN.DEF.NOM": "adjektiv positiv utrum/neutrum singularis bestämd nominativ", + "JJ.POS.UTR+NEU.SIN+PLU.IND.NOM": "adjektiv positiv utrum/neutrum singularis/pluralis obestämd nominativ", + "JJ.POS.UTR+NEU.SIN+PLU.IND+DEF.NOM": "adjektiv positiv utrum/neutrum singularis/pluralis obestämd/bestämd nominativ", + "JJ.SUV.MAS.SIN.DEF.GEN": "adjektiv superlativ maskulinum singularis bestämd genitiv", + "JJ.SUV.MAS.SIN.DEF.NOM": "adjektiv superlativ maskulinum singularis bestämd nominativ", + "JJ.SUV.UTR+NEU.PLU.DEF.NOM": "adjektiv superlativ utrum/neutrum pluralis bestämd nominativ", + "JJ.SUV.UTR+NEU.PLU.IND.NOM": "adjektiv superlativ utrum/neutrum pluralis obestämd nominativ", + "JJ.SUV.UTR+NEU.SIN+PLU.DEF.NOM": "adjektiv superlativ utrum/neutrum singularis/pluralis bestämd nominativ", + "JJ.SUV.UTR+NEU.SIN+PLU.IND.NOM": "adjektiv superlativ utrum/neutrum singularis/pluralis obestämd nominativ", + "KN": "konjunktion", + "KN.AN": "konjunktion förkortning", + "NN.-.-.-.-": "substantiv", + "NN.-.-.-.SMS": "substantiv sammansättningsform", + "NN.AN": "substantiv förkortning", + "NN.NEU.-.-.-": "substantiv neutrum", + "NN.NEU.-.-.SMS": "substantiv neutrum sammansättningsform", + "NN.NEU.PLU.DEF.GEN": "substantiv neutrum pluralis bestämd genitiv", + "NN.NEU.PLU.DEF.NOM": "substantiv neutrum pluralis bestämd nominativ", + "NN.NEU.PLU.IND.GEN": "substantiv neutrum pluralis obestämd genitiv", + "NN.NEU.PLU.IND.NOM": "substantiv 
neutrum pluralis obestämd nominativ", + "NN.NEU.SIN.DEF.GEN": "substantiv neutrum singularis bestämd genitiv", + "NN.NEU.SIN.DEF.NOM": "substantiv neutrum singularis bestämd nominativ", + "NN.NEU.SIN.IND.GEN": "substantiv neutrum singularis obestämd genitiv", + "NN.NEU.SIN.IND.NOM": "substantiv neutrum singularis obestämd nominativ", + "NN.UTR.-.-.-": "substantiv utrum", + "NN.UTR.-.-.SMS": "substantiv utrum sammansättningsform", + "NN.UTR.PLU.DEF.GEN": "substantiv utrum pluralis bestämd genitiv", + "NN.UTR.PLU.DEF.NOM": "substantiv utrum pluralis bestämd nominativ", + "NN.UTR.PLU.IND.GEN": "substantiv utrum pluralis obestämd genitiv", + "NN.UTR.PLU.IND.NOM": "substantiv utrum pluralis obestämd nominativ", + "NN.UTR.SIN.DEF.GEN": "substantiv utrum singularis bestämd genitiv", + "NN.UTR.SIN.DEF.NOM": "substantiv utrum singularis bestämd nominativ", + "NN.UTR.SIN.IND.GEN": "substantiv utrum singularis obestämd genitiv", + "NN.UTR.SIN.IND.NOM": "substantiv utrum singularis obestämd nominativ", + "PC.AN": "particip förkortning", + "PC.PRF.MAS.SIN.DEF.GEN": "particip perfekt maskulinum singularis bestämd genitiv", + "PC.PRF.MAS.SIN.DEF.NOM": "particip perfekt maskulinum singularis bestämd nominativ", + "PC.PRF.NEU.SIN.IND.NOM": "particip perfekt neutrum singularis obestämd nominativ", + "PC.PRF.UTR.SIN.IND.GEN": "particip perfekt utrum singularis obestämd genitiv", + "PC.PRF.UTR.SIN.IND.NOM": "particip perfekt utrum singularis obestämd nominativ", + "PC.PRF.UTR+NEU.PLU.IND+DEF.GEN": "particip perfekt utrum/neutrum pluralis obestämd/bestämd genitiv", + "PC.PRF.UTR+NEU.PLU.IND+DEF.NOM": "particip perfekt utrum/neutrum pluralis obestämd/bestämd nominativ", + "PC.PRF.UTR+NEU.SIN.DEF.GEN": "particip perfekt utrum/neutrum singularis bestämd genitiv", + "PC.PRF.UTR+NEU.SIN.DEF.NOM": "particip perfekt utrum/neutrum singularis bestämd nominativ", + "PC.PRS.UTR+NEU.SIN+PLU.IND+DEF.GEN": "particip presens utrum/neutrum singularis/pluralis obestämd/bestämd genitiv", + "PC.PRS.UTR+NEU.SIN+PLU.IND+DEF.NOM": "particip presens utrum/neutrum singularis/pluralis obestämd/bestämd nominativ", + "PL": "partikel", + "PL.SMS": "partikel sammansättningsform", + "PM.GEN": "egennamn genitiv", + "PM.NOM": "egennamn nominativ", + "PM.SMS": "egennamn sammansättningsform", + "PN.MAS.SIN.DEF.SUB+OBJ": "pronomen maskulinum singularis bestämd subjektsform/objektsform", + "PN.NEU.SIN.DEF.SUB+OBJ": "pronomen neutrum singularis bestämd subjektsform/objektsform", + "PN.NEU.SIN.IND.SUB+OBJ": "pronomen neutrum singularis obestämd subjektsform/objektsform", + "PN.UTR.PLU.DEF.OBJ": "pronomen utrum pluralis bestämd objektsform", + "PN.UTR.PLU.DEF.SUB": "pronomen utrum pluralis bestämd subjektsform", + "PN.UTR.SIN.DEF.OBJ": "pronomen utrum singularis bestämd objektsform", + "PN.UTR.SIN.DEF.SUB": "pronomen utrum singularis bestämd subjektsform", + "PN.UTR.SIN.DEF.SUB+OBJ": "pronomen utrum singularis bestämd subjektsform/objektsform", + "PN.UTR.SIN.IND.SUB": "pronomen utrum singularis obestämd subjektsform", + "PN.UTR.SIN.IND.SUB+OBJ": "pronomen utrum singularis obestämd subjektsform/objektsform", + "PN.UTR+NEU.PLU.DEF.OBJ": "pronomen utrum/neutrum pluralis bestämd objektsform", + "PN.UTR+NEU.PLU.DEF.SUB": "pronomen utrum/neutrum pluralis bestämd subjektsform", + "PN.UTR+NEU.PLU.DEF.SUB+OBJ": "pronomen utrum/neutrum pluralis bestämd subjektsform/objektsform", + "PN.UTR+NEU.PLU.IND.SUB+OBJ": "pronomen utrum/neutrum pluralis obestämd subjektsform/objektsform", + "PN.UTR+NEU.SIN+PLU.DEF.OBJ": "pronomen utrum/neutrum singularis/pluralis 
bestämd objektsform", + "PP": "preposition", + "PP.AN": "preposition förkortning", + "PP.SMS": "preposition sammansättningsforms", + "PS.AN": "possesivt pronomen förkortning", + "PS.NEU.SIN.DEF": "possesivt pronomen neutrum singularis bestämd", + "PS.UTR.SIN.DEF": "possesivt pronomen utrum singularis bestämd", + "PS.UTR+NEU.PLU.DEF": "possesivt pronomen utrum/neutrum pluralis bestämd", + "PS.UTR+NEU.SIN+PLU.DEF": "possesivt pronomen utrum/neutrum singularis/pluralis bestämd", + "RG.GEN": "grundtal genitiv", + "RG.MAS.SIN.DEF.NOM": "grundtal singularis bestämd nominativ", + "RG.NEU.SIN.IND.NOM": "grundtal neutrum singularis obestämd nominativ", + "RG.NOM": "grundtal nominativ", + "RG.SMS": "grundtal sammansättningsform", + "RG.UTR.SIN.IND.NOM": "grundtal utrum singularis obestämd nominativ", + "RG.UTR+NEU.SIN.DEF.NOM": "grundtal utrum/neutrum singularis bestämd nominativ", + "RO.GEN": "ordningstal genitiv", + "RO.MAS.SIN.IND+DEF.GEN": "ordningstal maskulinum singularis obestämd/bestämd genitiv", + "RO.MAS.SIN.IND+DEF.NOM": "ordningstal maskulinum singularis obestämd/bestämd nominativ", + "RO.NOM": "ordningstal nominativ", + "RO.UTR+NEU.SIN+PLU.IND+DEF.SMS": "ordningstal utrum/neutrum singularis/pluralis obestämd/bestämd sammansättningsform", + "SN": "subjunktion", + "UO": "utländskt ord", + "VB.AN": "verb förkortning", + "VB.IMP.AKT": "verb imperativ aktiv", + "VB.IMP.SFO": "verb imperativ s-form", + "VB.INF.AKT": "verb infinitiv aktiv", + "VB.INF.SFO": "verb infinitiv s-form", + "VB.KON.PRS.AKT": "verb konjunktiv presens aktiv", + "VB.KON.PRT.AKT": "verb konjunktiv preteritum aktiv", + "VB.KON.PRT.SFO": "verb konjunktiv preteritum s-form", + "VB.PRS.AKT": "verb presens aktiv", + "VB.PRS.SFO": "verb presens s-form", + "VB.PRT.AKT": "verb preteritum aktiv", + "VB.PRT.SFO": "verb preteritum s-form", + "VB.SMS": "verb sammansättningsform", + "VB.SUP.AKT": "verb supinum aktiv", + "VB.SUP.SFO": "verb supinum s-form", +} + + +# This is automatically created from Saldo by saldo.saldo_model.extract_tags() +saldo_tags = { + "ab c", + "ab invar", + "ab komp", + "ab pos", + "ab sms", + "ab super", + "aba invar", + "abh c", + "abh invar", + "abh sms", + "abm invar", + "al pl def", + "al pl indef", + "al sg n def", + "al sg n indef", + "al sg u def", + "al sg u indef", + "av c", + "av invar", + "av komp gen", + "av komp nom", + "av pos def pl gen", + "av pos def pl nom", + "av pos def sg masc gen", + "av pos def sg masc nom", + "av pos def sg no_masc gen", + "av pos def sg no_masc nom", + "av pos indef pl gen", + "av pos indef pl nom", + "av pos indef sg n gen", + "av pos indef sg n nom", + "av pos indef sg u gen", + "av pos indef sg u nom", + "av sms", + "av super def masc gen", + "av super def masc nom", + "av super def no_masc gen", + "av super def no_masc nom", + "av super indef gen", + "av super indef nom", + "ava c", + "ava invar", + "ava sms", + "avh c", + "avh komp gen", + "avh komp nom", + "avh pos def pl gen", + "avh pos def pl nom", + "avh pos def sg masc gen", + "avh pos def sg masc nom", + "avh pos def sg no_masc gen", + "avh pos def sg no_masc nom", + "avh pos indef pl gen", + "avh pos indef pl nom", + "avh pos indef sg n gen", + "avh pos indef sg n nom", + "avh pos indef sg u gen", + "avh pos indef sg u nom", + "avh sms", + "avh super def masc gen", + "avh super def masc nom", + "avh super def no_masc gen", + "avh super def no_masc nom", + "avh super indef gen", + "avh super indef nom", + "avm c", + "avm invar", + "avm komp nom", + "avm pos def pl gen", + "avm pos def pl nom", + "avm pos 
def sg masc gen", + "avm pos def sg masc nom", + "avm pos def sg no_masc gen", + "avm pos def sg no_masc nom", + "avm pos indef pl gen", + "avm pos indef pl nom", + "avm pos indef sg n gen", + "avm pos indef sg n nom", + "avm pos indef sg u gen", + "avm pos indef sg u nom", + "avm sms", + "avm super def masc nom", + "avm super def no_masc nom", + "avm super indef nom", + "in invar", + "inm invar", + "kn invar", + "kna c", + "kna invar", + "kna sms", + "mxc c", + "mxc sms", + "nl c", + "nl gen num n", + "nl gen num u", + "nl gen ord masc", + "nl gen ord no_masc", + "nl nom num n", + "nl nom num u", + "nl nom ord masc", + "nl nom ord no_masc", + "nlm c", + "nlm invar", + "nlm sms", + "nn n ci", + "nn n cm", + "nn n pl def gen", + "nn n pl def nom", + "nn n pl indef gen", + "nn n pl indef nom", + "nn n sg def gen", + "nn n sg def nom", + "nn n sg indef gen", + "nn n sg indef nom", + "nn n sms", + "nn p ci", + "nn p cm", + "nn p pl def gen", + "nn p pl def nom", + "nn p pl indef gen", + "nn p pl indef nom", + "nn p sms", + "nn u ci", + "nn u cm", + "nn u pl def gen", + "nn u pl def nom", + "nn u pl indef gen", + "nn u pl indef nom", + "nn u sg def gen", + "nn u sg def nom", + "nn u sg indef gen", + "nn u sg indef nom", + "nn u sms", + "nn v ci", + "nn v cm", + "nn v pl def gen", + "nn v pl def nom", + "nn v pl indef gen", + "nn v pl indef nom", + "nn v sg def gen", + "nn v sg def nom", + "nn v sg indef gen", + "nn v sg indef nom", + "nn v sms", + "nna n ci", + "nna n cm", + "nna n pl def gen", + "nna n pl def nom", + "nna n pl indef gen", + "nna n pl indef nom", + "nna n sg def gen", + "nna n sg def nom", + "nna n sg indef gen", + "nna n sg indef nom", + "nna n sms", + "nna u ci", + "nna u cm", + "nna u pl def gen", + "nna u pl def nom", + "nna u pl indef gen", + "nna u pl indef nom", + "nna u sg def gen", + "nna u sg def nom", + "nna u sg indef gen", + "nna u sg indef nom", + "nna u sms", + "nna v ci", + "nna v cm", + "nna v pl def gen", + "nna v pl def nom", + "nna v pl indef gen", + "nna v pl indef nom", + "nna v sg def gen", + "nna v sg def nom", + "nna v sg indef gen", + "nna v sg indef nom", + "nna v sms", + "nnh n sg def gen", + "nnh n sg def nom", + "nnh u ci", + "nnh u cm", + "nnh u pl def gen", + "nnh u pl def nom", + "nnh u pl indef gen", + "nnh u pl indef nom", + "nnh u sg def gen", + "nnh u sg def nom", + "nnh u sg indef gen", + "nnh u sg indef nom", + "nnh u sms", + "nnm n ci", + "nnm n cm", + "nnm n pl def gen", + "nnm n pl def nom", + "nnm n pl indef gen", + "nnm n pl indef nom", + "nnm n sg def gen", + "nnm n sg def nom", + "nnm n sg indef gen", + "nnm n sg indef nom", + "nnm n sms", + "nnm p pl def gen", + "nnm p pl def nom", + "nnm p pl indef gen", + "nnm p pl indef nom", + "nnm u ci", + "nnm u cm", + "nnm u pl def gen", + "nnm u pl def nom", + "nnm u pl indef gen", + "nnm u pl indef nom", + "nnm u sg def gen", + "nnm u sg def nom", + "nnm u sg indef gen", + "nnm u sg indef nom", + "nnm u sms", + "nnm v ci", + "nnm v cm", + "nnm v pl def gen", + "nnm v pl def nom", + "nnm v pl indef gen", + "nnm v pl indef nom", + "nnm v sg def gen", + "nnm v sg def nom", + "nnm v sg indef gen", + "nnm v sg indef nom", + "nnm v sms", + "pm f ph ci", + "pm f ph cm", + "pm f ph gen", + "pm f ph nom", + "pm f ph pl def gen", + "pm f ph pl def nom", + "pm f ph pl indef gen", + "pm f ph pl indef nom", + "pm f ph sg def gen", + "pm f ph sg def nom", + "pm f ph sg indef gen", + "pm f ph sg indef nom", + "pm f ph sms", + "pm f pm ci", + "pm f pm cm", + "pm f pm gen", + "pm f pm nom", + "pm f pm pl 
def gen", + "pm f pm pl def nom", + "pm f pm pl indef gen", + "pm f pm pl indef nom", + "pm f pm sg def gen", + "pm f pm sg def nom", + "pm f pm sg indef gen", + "pm f pm sg indef nom", + "pm f pm sms", + "pm h ph ci", + "pm h ph cm", + "pm h ph gen", + "pm h ph nom", + "pm h ph pl def gen", + "pm h ph pl def nom", + "pm h ph pl indef gen", + "pm h ph pl indef nom", + "pm h ph sg def gen", + "pm h ph sg def nom", + "pm h ph sg indef gen", + "pm h ph sg indef nom", + "pm h ph sms", + "pm m ph ci", + "pm m ph cm", + "pm m ph gen", + "pm m ph nom", + "pm m ph pl def gen", + "pm m ph pl def nom", + "pm m ph pl indef gen", + "pm m ph pl indef nom", + "pm m ph sg def gen", + "pm m ph sg def nom", + "pm m ph sg indef gen", + "pm m ph sg indef nom", + "pm m ph sms", + "pm m pm gen", + "pm m pm nom", + "pm n aa gen", + "pm n aa nom", + "pm n ac gen", + "pm n ac nom", + "pm n ap gen", + "pm n ap nom", + "pm n aw gen", + "pm n aw nom", + "pm n es gen", + "pm n es nom", + "pm n la gen", + "pm n la nom", + "pm n lf gen", + "pm n lf nom", + "pm n lg gen", + "pm n lg nom", + "pm n lp gen", + "pm n lp nom", + "pm n oa gen", + "pm n oa nom", + "pm n oc gen", + "pm n oc nom", + "pm n oe gen", + "pm n oe nom", + "pm n og gen", + "pm n og nom", + "pm n op gen", + "pm n op nom", + "pm n os gen", + "pm n os nom", + "pm n wm gen", + "pm n wm nom", + "pm n wp gen", + "pm n wp nom", + "pm p lg gen", + "pm p lg nom", + "pm p oc gen", + "pm p oc nom", + "pm u aa gen", + "pm u aa nom", + "pm u ae gen", + "pm u ae nom", + "pm u ag gen", + "pm u ag nom", + "pm u ap gen", + "pm u ap nom", + "pm u eh gen", + "pm u eh nom", + "pm u la gen", + "pm u la nom", + "pm u lf gen", + "pm u lf nom", + "pm u lg gen", + "pm u lg nom", + "pm u ls gen", + "pm u ls nom", + "pm u oc gen", + "pm u oc nom", + "pm u oe gen", + "pm u oe nom", + "pm u og gen", + "pm u og nom", + "pm u op gen", + "pm u op nom", + "pm u pa gen", + "pm u pa nom", + "pm u pc gen", + "pm u pc nom", + "pm u pm gen", + "pm u pm nom", + "pm u tz gen", + "pm u tz nom", + "pm u wa gen", + "pm u wa nom", + "pm u wb gen", + "pm u wb nom", + "pm u wc gen", + "pm u wc nom", + "pm u wn gen", + "pm u wn nom", + "pm v lf gen", + "pm v lf nom", + "pm v lg gen", + "pm v lg nom", + "pma h ph gen", + "pma h ph nom", + "pma n aa gen", + "pma n aa nom", + "pma n af gen", + "pma n af nom", + "pma n am gen", + "pma n am nom", + "pma n lp gen", + "pma n lp nom", + "pma n oa gen", + "pma n oa nom", + "pma n oe gen", + "pma n oe nom", + "pma n og gen", + "pma n og nom", + "pma n om gen", + "pma n om nom", + "pma n op gen", + "pma n op nom", + "pma n os gen", + "pma n os nom", + "pma n tm gen", + "pma n tm nom", + "pma n wb gen", + "pma n wb nom", + "pma u wn gen", + "pma u wn nom", + "pma w oc gen", + "pma w oc nom", + "pma w ph gen", + "pma w ph nom", + "pma w pm gen", + "pma w pm nom", + "pmm f ph gen", + "pmm f ph nom", + "pmm f pm gen", + "pmm f pm nom", + "pmm h ph gen", + "pmm h ph nom", + "pmm m pa gen", + "pmm m pa nom", + "pmm m ph gen", + "pmm m ph nom", + "pmm m pm gen", + "pmm m pm nom", + "pmm n eh gen", + "pmm n eh nom", + "pmm n lf gen", + "pmm n lf nom", + "pmm n lg gen", + "pmm n lg nom", + "pmm n lp gen", + "pmm n lp nom", + "pmm n oc gen", + "pmm n oc nom", + "pmm n oe gen", + "pmm n oe nom", + "pmm n og gen", + "pmm n og nom", + "pmm n op gen", + "pmm n op nom", + "pmm n wm gen", + "pmm n wm nom", + "pmm n wn gen", + "pmm n wn nom", + "pmm p ph gen", + "pmm p ph nom", + "pmm p pm gen", + "pmm p pm nom", + "pmm u aa gen", + "pmm u aa nom", + "pmm u ag gen", + "pmm u 
ag nom", + "pmm u aw gen", + "pmm u aw nom", + "pmm u ec gen", + "pmm u ec nom", + "pmm u eh gen", + "pmm u eh nom", + "pmm u en gen", + "pmm u en nom", + "pmm u er gen", + "pmm u er nom", + "pmm u es gen", + "pmm u es nom", + "pmm u la gen", + "pmm u la nom", + "pmm u lg gen", + "pmm u lg nom", + "pmm u ls gen", + "pmm u ls nom", + "pmm u oe gen", + "pmm u oe nom", + "pmm u og gen", + "pmm u og nom", + "pmm u op gen", + "pmm u op nom", + "pmm u tb gen", + "pmm u tb nom", + "pmm u tm gen", + "pmm u tm nom", + "pmm u wb gen", + "pmm u wb nom", + "pmm u wc gen", + "pmm u wc nom", + "pmm u wn gen", + "pmm u wn nom", + "pmm v lf gen", + "pmm v lf nom", + "pn ack", + "pn c", + "pn invar", + "pn komp gen", + "pn komp nom", + "pn nom", + "pn p1 pl ack", + "pn p1 pl nom", + "pn p1 pl poss pl", + "pn p1 pl poss sg n", + "pn p1 pl poss sg u", + "pn p1 sg ack", + "pn p1 sg nom", + "pn p1 sg poss pl", + "pn p1 sg poss sg n", + "pn p1 sg poss sg u", + "pn p2 pl ack", + "pn p2 pl nom", + "pn p2 pl poss pl", + "pn p2 pl poss sg n", + "pn p2 pl poss sg u", + "pn p2 sg ack", + "pn p2 sg nom", + "pn p2 sg poss pl", + "pn p2 sg poss sg n", + "pn p2 sg poss sg u", + "pn p3 pl ack", + "pn p3 pl nom", + "pn p3 pl poss pl", + "pn p3 pl poss sg n", + "pn p3 pl poss sg u", + "pn p3 sg ack", + "pn p3 sg nom", + "pn p3 sg poss pl", + "pn p3 sg poss sg n", + "pn p3 sg poss sg u", + "pn pl gen", + "pn pl nom", + "pn pos def pl gen", + "pn pos def pl nom", + "pn pos def sg masc gen", + "pn pos def sg masc nom", + "pn pos def sg no_masc gen", + "pn pos def sg no_masc nom", + "pn pos indef pl gen", + "pn pos indef pl nom", + "pn pos indef sg n gen", + "pn pos indef sg n nom", + "pn pos indef sg u gen", + "pn pos indef sg u nom", + "pn poss pl", + "pn poss sg n", + "pn poss sg u", + "pn sg n gen", + "pn sg n nom", + "pn sg u gen", + "pn sg u nom", + "pn sms", + "pn super def masc gen", + "pn super def masc nom", + "pn super def no_masc gen", + "pn super def no_masc nom", + "pn super indef gen", + "pn super indef nom", + "pnm gen", + "pnm invar", + "pnm nom", + "pp invar", + "ppa c", + "ppa invar", + "ppa sms", + "ppm c", + "ppm invar", + "ppm sms", + "sn invar", + "snm c", + "snm invar", + "snm sms", + "ssm c", + "ssm invar", + "ssm sms", + "sxc c", + "sxc sms", + "vb c", + "vb imper", + "vb inf aktiv", + "vb inf s-form", + "vb pres ind aktiv", + "vb pres ind s-form", + "vb pres konj aktiv", + "vb pres konj s-form", + "vb pres_part gen", + "vb pres_part nom", + "vb pret ind aktiv", + "vb pret ind s-form", + "vb pret konj aktiv", + "vb pret konj s-form", + "vb pret_part def pl gen", + "vb pret_part def pl nom", + "vb pret_part def sg masc gen", + "vb pret_part def sg masc nom", + "vb pret_part def sg no_masc gen", + "vb pret_part def sg no_masc nom", + "vb pret_part indef pl gen", + "vb pret_part indef pl nom", + "vb pret_part indef sg n gen", + "vb pret_part indef sg n nom", + "vb pret_part indef sg u gen", + "vb pret_part indef sg u nom", + "vb sms", + "vb sup aktiv", + "vb sup s-form", + "vba c", + "vba invar", + "vba sms", + "vbm imper", + "vbm inf aktiv", + "vbm inf s-form", + "vbm pres ind aktiv", + "vbm pres ind s-form", + "vbm pres konj aktiv", + "vbm pres konj s-form", + "vbm pres_part gen", + "vbm pres_part nom", + "vbm pret ind aktiv", + "vbm pret ind s-form", + "vbm pret konj aktiv", + "vbm pret konj s-form", + "vbm pret_part def pl gen", + "vbm pret_part def pl nom", + "vbm pret_part def sg masc gen", + "vbm pret_part def sg masc nom", + "vbm pret_part def sg no_masc gen", + "vbm pret_part def sg no_masc nom", 
+ "vbm pret_part indef pl gen", + "vbm pret_part indef pl nom", + "vbm pret_part indef sg n gen", + "vbm pret_part indef sg n nom", + "vbm pret_part indef sg u gen", + "vbm pret_part indef sg u nom", + "vbm sup aktiv", + "vbm sup s-form" +} + + +# Mapping from SALDO POS tags (as found in lemgrams) to SUC POS tags +saldo_pos_to_suc = { + "nn": ["NN"], + "av": ["JJ"], + "vb": ["VB"], + "pm": ["PM"], + "ab": ["AB"], + "in": ["IN"], + "pp": ["PP"], + "pn": ["PN"], + "sn": ["SN"], + "kn": ["KN"], + "ie": ["IE"], + "abh": ["AB"], + "nnm": ["NN"], + "nna": ["NN"], + "avh": ["JJ"], + "avm": ["JJ"], + "ava": ["JJ"], + "vbm": ["VB"], + "pmm": ["PM"], + "abm": ["AB"], + "aba": ["AB"], + "pnm": ["PN"], + "inm": ["IN"], + "ppm": ["PP"], + "ppa": ["PP"], + "knm": ["KN"], + "kna": ["KN"], + "snm": ["SN"], + "nl": ["RG", "RO"], + "nlm": ["RG", "RO"], + "al": ["DT"], + "pma": ["PM"] +} + + +suc_to_parole = { + "AB": "RG0S", + "AB.AN": "RG0A", + "AB.KOM": "RGCS", + "AB.POS": "RGPS", + "AB.SMS": "RG0C", + "AB.SUV": "RGSS", + "MAD": "FE", + "MID": "FI", + "PAD": "FP", + "DT.AN": "D0@00@A", + "DT.MAS.SIN.DEF": "DF@MS@S", + "DT.MAS.SIN.IND": "DI@MS@S", + "DT.NEU.SIN.DEF": "DF@NS@S", + "DT.NEU.SIN.IND": "DI@NS@S", + "DT.NEU.SIN.IND+DEF": "D0@NS@S", + "DT.UTR.SIN.DEF": "DF@US@S", + "DT.UTR.SIN.IND": "DI@US@S", + "DT.UTR.SIN.IND+DEF": "D0@US@S", + "DT.UTR+NEU.PLU.DEF": "DF@0P@S", + "DT.UTR+NEU.PLU.IND": "DI@0P@S", + "DT.UTR+NEU.PLU.IND+DEF": "D0@0P@S", + "DT.UTR+NEU.SIN.DEF": "DF@0S@S", + "DT.UTR+NEU.SIN.IND": "DI@0S@S", + "DT.UTR+NEU.SIN+PLU.IND": "DI@00@S", + "HA": "RH0S", + "HD.NEU.SIN.IND": "DH@NS@S", + "HD.UTR.SIN.IND": "DH@US@S", + "HD.UTR+NEU.PLU.IND": "DH@0P@S", + "HP.-.-.-": "PH@000@S", + "HP.NEU.SIN.IND": "PH@NS0@S", + "HP.NEU.SIN.IND.SMS": "PH@NS0@C", + "HP.UTR.SIN.IND": "PH@US0@S", + "HP.UTR+NEU.PLU.IND": "PH@0P0@S", + "HS.DEF": "PE@000@S", + "IE": "CIS", + "IN": "I", + "JJ.AN": "AQ00000A", + "JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.GEN": "AQC00G0S", + "JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.NOM": "AQC00N0S", + "JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.SMS": "AQC0000C", + "JJ.POS.MAS.SIN.DEF.GEN": "AQPMSGDS", + "JJ.POS.MAS.SIN.DEF.NOM": "AQPMSNDS", + "JJ.POS.NEU.SIN.IND.GEN": "AQPNSGIS", + "JJ.POS.NEU.SIN.IND.NOM": "AQPNSNIS", + "JJ.POS.NEU.SIN.IND+DEF.NOM": "AQPNSN0S", + "JJ.POS.UTR.-.-.SMS": "AQPU000C", + "JJ.POS.UTR.SIN.IND.GEN": "AQPUSGIS", + "JJ.POS.UTR.SIN.IND.NOM": "AQPUSNIS", + "JJ.POS.UTR.SIN.IND+DEF.NOM": "AQPUSN0S", + "JJ.POS.UTR+NEU.-.-.SMS": "AQP0000C", + "JJ.POS.UTR+NEU.PLU.IND.NOM": "AQP0PNIS", + "JJ.POS.UTR+NEU.PLU.IND+DEF.GEN": "AQP0PG0S", + "JJ.POS.UTR+NEU.PLU.IND+DEF.NOM": "AQP0PN0S", + "JJ.POS.UTR+NEU.SIN.DEF.GEN": "AQP0SGDS", + "JJ.POS.UTR+NEU.SIN.DEF.NOM": "AQP0SNDS", + "JJ.POS.UTR+NEU.SIN+PLU.IND.NOM": "AQP00NIS", + "JJ.POS.UTR+NEU.SIN+PLU.IND+DEF.NOM": "AQP00N0S", + "JJ.SUV.MAS.SIN.DEF.GEN": "AQSMSGDS", + "JJ.SUV.MAS.SIN.DEF.NOM": "AQSMSNDS", + "JJ.SUV.UTR+NEU.PLU.DEF.NOM": "AQS0PNDS", + "JJ.SUV.UTR+NEU.PLU.IND.NOM": "AQS0PNIS", + "JJ.SUV.UTR+NEU.SIN+PLU.DEF.NOM": "AQS00NDS", + "JJ.SUV.UTR+NEU.SIN+PLU.IND.NOM": "AQS00NIS", + "KN": "CCS", + "KN.AN": "CCA", + "NN.-.-.-.-": "NC000@0S", + "NN.-.-.-.SMS": "NC000@0C", + "NN.AN": "NC000@0A", + "NN.NEU.-.-.-": "NCN00@0S", + "NN.NEU.-.-.SMS": "NCN00@0C", + "NN.NEU.PLU.DEF.GEN": "NCNPG@DS", + "NN.NEU.PLU.DEF.NOM": "NCNPN@DS", + "NN.NEU.PLU.IND.GEN": "NCNPG@IS", + "NN.NEU.PLU.IND.NOM": "NCNPN@IS", + "NN.NEU.SIN.DEF.GEN": "NCNSG@DS", + "NN.NEU.SIN.DEF.NOM": "NCNSN@DS", + "NN.NEU.SIN.IND.GEN": "NCNSG@IS", + "NN.NEU.SIN.IND.NOM": "NCNSN@IS", + "NN.UTR.-.-.-": 
"NCU00@0S", + "NN.UTR.-.-.SMS": "NCU00@0C", + "NN.UTR.PLU.DEF.GEN": "NCUPG@DS", + "NN.UTR.PLU.DEF.NOM": "NCUPN@DS", + "NN.UTR.PLU.IND.GEN": "NCUPG@IS", + "NN.UTR.PLU.IND.NOM": "NCUPN@IS", + "NN.UTR.SIN.DEF.GEN": "NCUSG@DS", + "NN.UTR.SIN.DEF.NOM": "NCUSN@DS", + "NN.UTR.SIN.IND.GEN": "NCUSG@IS", + "NN.UTR.SIN.IND.NOM": "NCUSN@IS", + "PC.AN": "AF00000A", + "PC.PRF.MAS.SIN.DEF.GEN": "AF0MSGDS", + "PC.PRF.MAS.SIN.DEF.NOM": "AF0MSNDS", + "PC.PRF.NEU.SIN.IND.NOM": "AF0NSNIS", + "PC.PRF.UTR.SIN.IND.GEN": "AF0USGIS", + "PC.PRF.UTR.SIN.IND.NOM": "AF0USNIS", + "PC.PRF.UTR+NEU.PLU.IND+DEF.GEN": "AF00PG0S", + "PC.PRF.UTR+NEU.PLU.IND+DEF.NOM": "AF00PN0S", + "PC.PRF.UTR+NEU.SIN.DEF.GEN": "AF00SGDS", + "PC.PRF.UTR+NEU.SIN.DEF.NOM": "AF00SNDS", + "PC.PRS.UTR+NEU.SIN+PLU.IND+DEF.GEN": "AP000G0S", + "PC.PRS.UTR+NEU.SIN+PLU.IND+DEF.NOM": "AP000N0S", + "PL": "QS", + "PL.SMS": "QC", + "PM.GEN": "NP00G@0S", + "PM.NOM": "NP00N@0S", + "PM.SMS": "NP000@0C", + "PN.MAS.SIN.DEF.SUB+OBJ": "PF@MS0@S", + "PN.NEU.SIN.DEF.SUB+OBJ": "PF@NS0@S", + "PN.NEU.SIN.IND.SUB+OBJ": "PI@NS0@S", + "PN.UTR.PLU.DEF.OBJ": "PF@UPO@S", + "PN.UTR.PLU.DEF.SUB": "PF@UPS@S", + "PN.UTR.SIN.DEF.OBJ": "PF@USO@S", + "PN.UTR.SIN.DEF.SUB": "PF@USS@S", + "PN.UTR.SIN.DEF.SUB+OBJ": "PF@US0@S", + "PN.UTR.SIN.IND.SUB": "PI@USS@S", + "PN.UTR.SIN.IND.SUB+OBJ": "PI@US0@S", + "PN.UTR+NEU.PLU.DEF.OBJ": "PF@0PO@S", + "PN.UTR+NEU.PLU.DEF.SUB": "PF@0PS@S", + "PN.UTR+NEU.PLU.DEF.SUB+OBJ": "PF@0P0@S", + "PN.UTR+NEU.PLU.IND.SUB+OBJ": "PI@0P0@S", + "PN.UTR+NEU.SIN+PLU.DEF.OBJ": "PF@00O@S", + "PP": "SPS", + "PP.AN": "SPA", + "PP.SMS": "SPC", + "PS.AN": "PS@000@A", + "PS.NEU.SIN.DEF": "PS@NS0@S", + "PS.UTR.SIN.DEF": "PS@US0@S", + "PS.UTR+NEU.PLU.DEF": "PS@0P0@S", + "PS.UTR+NEU.SIN+PLU.DEF": "PS@000@S", + "RG.GEN": "MC00G0S", + "RG.MAS.SIN.DEF.NOM": "MCMSNDS", + "RG.NEU.SIN.IND.NOM": "MCNSNIS", + "RG.NOM": "MC00N0S", + "RG.SMS": "MC0000C", + "RG.UTR.SIN.IND.NOM": "MCUSNIS", + "RG.UTR+NEU.SIN.DEF.NOM": "MC0SNDS", + "RO.GEN": "MO00G0S", + "RO.MAS.SIN.IND+DEF.GEN": "MOMSG0S", + "RO.MAS.SIN.IND+DEF.NOM": "MOMSN0S", + "RO.NOM": "MO00N0S", + "RO.UTR+NEU.SIN+PLU.IND+DEF.SMS": "MO0000C", + "SN": "CSS", + "UO": "XF", + "VB.AN": "V@000A", + "VB.IMP.AKT": "V@M0AS", + "VB.IMP.SFO": "V@M0SS", + "VB.INF.AKT": "V@N0AS", + "VB.INF.SFO": "V@N0SS", + "VB.KON.PRS.AKT": "V@SPAS", + "VB.KON.PRT.AKT": "V@SIAS", + "VB.KON.PRT.SFO": "V@SISS", + "VB.PRS.AKT": "V@IPAS", + "VB.PRS.SFO": "V@IPSS", + "VB.PRT.AKT": "V@IIAS", + "VB.PRT.SFO": "V@IISS", + "VB.SMS": "V@000C", + "VB.SUP.AKT": "V@IUAS", + "VB.SUP.SFO": "V@IUSS", +} + + +# This mapping, courtesy of Eva Forsbom +granska_to_parole = { + "pc.an": "AF00000A", + "pc.prf.utr+neu.plu.ind+def.gen": "AF00PG0S", + "pc.prf.utr+neu.plu.ind+def.nom": "AF00PN0S", + "pc.prf.utr+neu.sin.def.gen": "AF00SGDS", + "pc.prf.utr+neu.sin.def.nom": "AF00SNDS", + "pc.prf.mas.sin.def.gen": "AF0MSGDS", + "pc.prf.mas.sin.def.nom": "AF0MSNDS", + "pc.prf.neu.sin.ind.nom": "AF0NSNIS", + "pc.prf.utr.sin.ind.gen": "AF0USGIS", + "pc.prf.utr.sin.ind.nom": "AF0USNIS", + "pc.prs.utr+neu.sin+plu.ind+def.gen": "AP000G0S", + "pc.prs.utr+neu.sin+plu.ind+def.nom": "AP000N0S", + "jj.an": "AQ00000A", + "jj.kom.utr+neu.sin+plu.ind+def.sms": "AQC0000C", + "jj.kom.utr+neu.sin+plu.ind+def.gen": "AQC00G0S", + "jj.kom.utr+neu.sin+plu.ind+def.nom": "AQC00N0S", + "jj.pos.utr+neu.-.-.sms": "AQP0000C", + "jj.pos.utr+neu.sin+plu.ind+def.nom": "AQP00N0S", + "jj.pos.utr+neu.sin+plu.ind.nom": "AQP00NIS", + "jj.pos.utr+neu.plu.ind+def.gen": "AQP0PG0S", + "jj.pos.utr+neu.plu.ind+def.nom": 
"AQP0PN0S", + "jj.pos.utr+neu.plu.ind.nom": "AQP0PNIS", + "jj.pos.utr+neu.sin.def.gen": "AQP0SGDS", + "jj.pos.utr+neu.sin.def.nom": "AQP0SNDS", + "jj.pos.mas.sin.def.gen": "AQPMSGDS", + "jj.pos.mas.sin.def.nom": "AQPMSNDS", + "jj.pos.neu.sin.ind.gen": "AQPNSGIS", + "jj.pos.neu.sin.ind+def.nom": "AQPNSN0S", + "jj.pos.neu.sin.ind.nom": "AQPNSNIS", + "jj.pos.utr.-.-.sms": "AQPU000C", + "jj.pos.utr.sin.ind.gen": "AQPUSGIS", + "jj.pos.utr.sin.ind+def.nom": "AQPUSN0S", + "jj.pos.utr.sin.ind.nom": "AQPUSNIS", + "jj.suv.utr+neu.sin+plu.def.nom": "AQS00NDS", + "jj.suv.utr+neu.sin+plu.ind.nom": "AQS00NIS", + "jj.suv.utr+neu.plu.def.nom": "AQS0PNDS", + "jj.suv.utr+neu.plu.ind.nom": "AQS0PNIS", + "jj.suv.mas.sin.def.gen": "AQSMSGDS", + "jj.suv.mas.sin.def.nom": "AQSMSNDS", + "kn.an": "CCA", + "kn": "CCS", + "ie": "CIS", + "sn": "CSS", + "dt.an": "D0@00@A", + "dt.utr+neu.plu.ind+def": "D0@0P@S", + "dt.neu.sin.ind+def": "D0@NS@S", + "dt.utr.sin.ind+def": "D0@US@S", + "dt.utr+neu.plu.def": "DF@0P@S", + "dt.utr+neu.sin.def": "DF@0S@S", + "dt.mas.sin.def": "DF@MS@S", + "dt.neu.sin.def": "DF@NS@S", + "dt.utr.sin.def": "DF@US@S", + "hd.utr+neu.plu.ind": "DH@0P@S", + "hd.neu.sin.ind": "DH@NS@S", + "hd.utr.sin.ind": "DH@US@S", + "dt.utr+neu.sin+plu.ind": "DI@00@S", + "dt.utr+neu.plu.ind": "DI@0P@S", + "dt.utr+neu.sin.ind": "DI@0S@S", + "dt.mas.sin.ind": "DI@MS@S", + "dt.neu.sin.ind": "DI@NS@S", + "dt.utr.sin.ind": "DI@US@S", + "mad": "FE", + "mid": "FI", + "pad": "FP", + "in": "I", + "rg.sms": "MC0000C", + "rg.gen": "MC00G0S", + "rg.nom": "MC00N0S", + "rg.sin.nom": "MC00N0S", + "rg.neu.sin.ind.nom": "MCNSNIS", + "rg.utr.sin.ind.nom": "MCUSNIS", + "rg.mas.sin.def.nom": "MCMSNDS", + "rg utr.neu.sin.def.nom": "MC0SNDS", + "ro.sms": "MO0000C", + "ro.gen": "MO00G0S", + "ro.nom": "MO00N0S", + "ro.sin.nom": "MO00N0S", + "ro.mas.sin.ind+def.gen": "MOMSG0S", + "ro.mas.sin.ind+def.nom": "MOMSN0S", + "nn.an": "NC000@0A", + "nn.-.-.-.sms": "NC000@0C", + "nn.-.-.-.-": "NC000@0S", + "nn.neu.-.-.sms": "NCN00@0C", + "nn.neu.-.-.-": "NCN00@0S", + "nn.neu.plu.def.gen": "NCNPG@DS", + "nn.neu.plu.ind.gen": "NCNPG@IS", + "nn.neu.plu.def.nom": "NCNPN@DS", + "nn.neu.plu.ind.nom": "NCNPN@IS", + "nn.neu.sin.def.gen": "NCNSG@DS", + "nn.neu.sin.ind.gen": "NCNSG@IS", + "nn.neu.sin.def.nom": "NCNSN@DS", + "nn.neu.sin.ind.nom": "NCNSN@IS", + "nn.utr.-.-.sms": "NCU00@0C", + "nn.utr.-.-.-": "NCU00@0S", + "nn.utr.plu.def.gen": "NCUPG@DS", + "nn.utr.plu.ind.gen": "NCUPG@IS", + "nn.utr.plu.def.nom": "NCUPN@DS", + "nn.utr.plu.ind.nom": "NCUPN@IS", + "nn.utr.sin.def.gen": "NCUSG@DS", + "nn.utr.sin.ind.gen": "NCUSG@IS", + "nn.utr.sin.def.nom": "NCUSN@DS", + "nn.utr.sin.def.nom.dat": "NCUSN@DS", + "nn.utr.sin.ind.nom": "NCUSN@IS", + "nn.utr.sin.ind.nom.dat": "NCUSN@IS", + "pm.sms": "NP000@0C", + "pm.gen": "NP00G@0S", + "pm.nom": "NP00N@0S", + "pn.utr+neu.sin+plu.def.obj": "PF@00O@S", + "pn.utr+neu.plu.def.sub+obj": "PF@0P0@S", + "pn.utr+neu.plu.def.obj": "PF@0PO@S", + "pn.utr+neu.plu.def.sub": "PF@0PS@S", + "pn.mas.sin.def.sub+obj": "PF@MS0@S", + "pn.neu.sin.def.sub+obj": "PF@NS0@S", + "pn.utr.plu.def.obj": "PF@UPO@S", + "pn.utr.plu.def.sub": "PF@UPS@S", + "pn.utr.sin.def.sub+obj": "PF@US0@S", + "pn.utr.sin.def.obj": "PF@USO@S", + "pn.utr.sin.def.sub": "PF@USS@S", + "hs.def": "PE@000@S", + "hp.-.-.-": "PH@000@S", + "hp.utr+neu.plu.ind": "PH@0P0@S", + "hp.neu.sin.ind.sms": "PH@NS0@C", + "hp.neu.sin.ind": "PH@NS0@S", + "hp.utr.sin.ind": "PH@US0@S", + "pn.utr+neu.plu.ind.sub+obj": "PI@0P0@S", + "pn.neu.sin.ind.sub+obj": "PI@NS0@S", + 
"pn.utr.sin.ind.sub+obj": "PI@US0@S", + "pn.utr.sin.ind.sub": "PI@USS@S", + "ps.an": "PS@000@A", + "ps.utr+neu.sin+plu.def": "PS@000@S", + "ps.utr+neu.plu.def": "PS@0P0@S", + "ps.neu.sin.def": "PS@NS0@S", + "ps.utr.sin.def": "PS@US0@S", + "pl": "QS", + "pl.sms": "QC", + "ab.an": "RG0A", + "ab.sms": "RG0C", + "ab": "RG0S", + "ab.kom": "RGCS", + "ab.pos": "RGPS", + "ab.suv": "RGSS", + "ha": "RH0S", + "pp.an": "SPA", + "pp.sms": "SPC", + "pp": "SPS", + "vb.an": "V@000A", + "vb.sms": "V@000C", + "vb.prt.akt": "V@IIAS", + "vb.prt.akt.aux": "V@IIAS", + "vb.prt.akt.kop": "V@IIAS", + "vb.prt.sfo": "V@IISS", + "vb.prt.sfo.kop": "V@IISS", + "vb.prs.akt": "V@IPAS", + "vb.prs.akt.aux": "V@IPAS", + "vb.prs.akt.kop": "V@IPAS", + "vb.prs.sfo": "V@IPSS", + "vb.prs.sfo.kop": "V@IPSS", + "vb.sup.akt": "V@IUAS", + "vb.sup.akt.kop": "V@IUAS", + "vb.sup.sfo": "V@IUSS", + "vb.imp.akt": "V@M0AS", + "vb.imp.akt.aux": "V@M0AS", + "vb.imp.akt.kop": "V@M0AS", + "vb.imp.sfo": "V@M0SS", + "vb.inf.akt": "V@N0AS", + "vb.inf.akt.aux": "V@N0AS", + "vb.inf.akt.kop": "V@N0AS", + "vb.inf.sfo": "V@N0SS", + "vb.kon.prt.akt": "V@SIAS", + "vb.kon.prt.sfo": "V@SISS", + "vb.kon.prs.akt": "V@SPAS", + "uo": "XF", +} + +parole_to_suc = dict((parole, suc) for (suc, parole) in list(suc_to_parole.items())) + +granska_to_suc = dict((granska, parole_to_suc[parole]) for (granska, parole) in list(granska_to_parole.items())) + +parole_to_granska = {} +for granska, parole in list(granska_to_parole.items()): + parole_to_granska.setdefault(parole, set()).add(granska) + +suc_to_granska = dict((suc, parole_to_granska[parole]) for (suc, parole) in list(suc_to_parole.items())) + +suc_tags = set(suc_descriptions) + +suc_to_simple = dict((suc, split_tag(suc)[0]) for suc in suc_tags) + +simple_tags = set(suc_to_simple.values()) + +granska_tags = set(granska_to_parole) + +parole_tags = set(parole_to_suc) + + +assert suc_tags == set(suc_to_parole.keys()) +assert suc_tags == set(suc_to_granska.keys()) +assert suc_tags == set(parole_to_suc.values()) +assert suc_tags == set(granska_to_suc.values()) + +assert granska_tags == set(granska_to_parole.keys()) +assert granska_tags == set(granska_to_suc.keys()) +assert granska_tags == set().union(*list(parole_to_granska.values())) +assert granska_tags == set().union(*list(suc_to_granska.values())) + +assert parole_tags == set(parole_to_suc.keys()) +assert parole_tags == set(parole_to_granska.keys()) +assert parole_tags == set(suc_to_parole.values()) +assert parole_tags == set(granska_to_parole.values()) + + +###################################################################### +# Here we automatically create the 1-many dictionaries +# saldo_to_suc and saldo_to_parole + +saldo_params_to_suc = { + "u": "UTR", + "n": "NEU", + "masc": "MAS", + "no_masc": "UTR+NEU", + "komp": "KOM", + "super": "SUV", + "pl": "PLU", + "sg": "SIN", + "indef": "IND", + "pres_part": "PCPRS", + "pret_part": "PCPRT", + "imper": "IMP", + "aktiv": "AKT", + "s-form": "SFO", + "ind": "INDIKATIV", + "konj": "KON", + "pres": "PRS", + "pret": "PRT", +} + +# SALDO to SUC mapping +_suc_tag_replacements = [ + (r"(IN|KN|PP)", r"\1"), + (r"SN", r"(SN|IE)"), # ie doesn't exist in SALDO anymore + (r"(AB|KN|PP|VB)A", r"\1 AN"), + (r"[MS]XC", r"(NN|JJ|AB) .* SMS"), + + (r"ABH? INVAR", r"(AB|PL|HA)"), + (r"ABH? (KOM|POS|SMS|SUV)", r"AB \1"), + + (r"AL PLU (DEF|IND)", r"DT UTR+NEU PLU \1"), + (r"AL SIN (UTR|NEU) (DEF|IND)", r"DT \1 SIN \2"), + + (r"AV INVAR", r"(JJ POS|PC PRS) .* NOM"), + (r"AVH? 
POS IND SIN NEU NOM", r"(AB|AB POS|(JJ POS|PC PRF) NEU SIN IND NOM)"), # snabbt + (r"AVH? POS (DEF|IND) (SIN|PLU) (MAS|NEU|UTR|UTR\+NEU) (NOM|GEN)", r"(JJ POS|PC PRF) \3 \2 (\1|IND+DEF) \4"), # ind/def doesn't exist in SALDO + (r"AVH? POS (DEF|IND) PLU (NOM|GEN)", r"(JJ POS|PC PRF) UTR+NEU PLU (\1|IND+DEF) \2"), # ind/def doesn't exist in SALDO + # (r"AV POS .* (SIN|PLU) .*(NOM|GEN)", r"(JJ POS|PC PRF) .* \1 .* \2"), + (r"AVH? KOM NOM", r"(JJ KOM .* NOM|AB KOM)"), + (r"AVH? SUV IND NOM", r"(JJ SUV .* NOM|AB SUV)"), + (r"AVH? (KOM|SUV) .*(NOM|GEN)", r"JJ \1 .* \2"), + (r"AVH? SMS", r"JJ .* SMS"), + (r"AVA", r"AB AN"), + + (r"NL (NOM|GEN)", r"(RG|RO) .*\1"), + + (r"NN (V|P) (SIN|PLU) (IND|DEF) (NOM|GEN)", r"NN (UTR|NEU|-) (\2|-) (\3|-) (\4|-)"), + (r"NNH? (UTR|NEU) (SIN|PLU) (IND|DEF) (NOM|GEN)", r"NN (\1|-) (\2|-) (\3|-) (\4|-)"), + (r"NNH? .* SMS", r"NN .* SMS"), + (r"NNA .* SMS", r"(NN|PM) .* SMS"), + (r"NNA .* (SIN|PLU) (IND|DEF) (NOM|GEN)", r"NN (AN|.* \1 \2 \3)"), + + (r"PMA .* (NOM|GEN)", r"PM \1"), + (r"PM .* (NOM|GEN)", r"PM \1"), + (r"PM .* SMS", r"PM .* SMS"), + + (r"PN .*POSS", r"(PS|HS)"), + (r"PN KOM GEN", r"PS"), + (r"PN SUV (IND|DEF)", r"JJ SUV .* \1"), + (r"PN (P1|P2|P3) (SIN|PLU)", r"PN .* \2 DEF"), + (r"PN POS .*(SIN|PLU)", r"PN .* \1"), + (r"PN PLU NOM", r"(PN .* PLU|DT UTR+NEU PLU .*|JJ POS UTR+NEU PLU .* NOM)"), + (r"PN PLU GEN", r"(PN .* PLU|DT UTR+NEU PLU .*|PS UTR+NEU SIN+PLU DEF)"), + (r"PN SIN UTR NOM", r"(PN (UTR|MAS) SIN|DT UTR SIN .*|JJ POS UTR SIN IND NOM)"), + (r"PN SIN UTR GEN", r"(PN (UTR|MAS) SIN|DT UTR SIN .*|PS UTR+NEU SIN+PLU DEF)"), + (r"PN SIN NEU NOM", r"(PN NEU SIN|DT NEU SIN .*|JJ POS NEU SIN IND NOM)"), + (r"PN SIN NEU GEN", r"(PN NEU SIN|DT NEU SIN .*|PS UTR+NEU SIN+PLU DEF)"), + (r"PN (ACK|NOM|INVAR|KOM|SMS)", r"(PN|HP|HS)"), + + (r"VB (INF|SUP) (AKT|SFO)", r"VB \1 \2"), + (r"VB (PRS|PRT) .* (AKT|SFO)", r"VB .*\1 \2"), + (r"VB PCPRS (NOM|GEN)", r"PC PRS .* \1"), + (r"VB PCPRT .* (PLU|SIN) .*(NOM|GEN)", r"PC PRF .* \1 .* \2"), + (r"VB (IMP|SMS)", r"VB \1"), + + # Compounds + (r"ABH? C", r"AB"), + (r"AVH? C", r"JJ"), + (r"VB C", r"VB"), + (r"NNA? (UTR|NEU) (CI|CM)", r"NN (\1|-) - - -"), + (r"NNA? (V|P) (CI|CM)", r"NN (UTR|NEU|-) - - -"), + (r"NNH? 
(UTR|NEU) (CI|CM)", r"NN (\1|-) - - -"), + + (r"PM .* (CI|CM)", r"PM"), + (r"PN C", r"PN"), + (r"NL C", r"(RG|RO)"), +] + + +def _make_saldo_to_suc(compound=False): + import re + tagmap = {} + for saldotag in saldo_tags: + params = saldotag.split() + if not compound: + if saldotag.endswith((" c", " ci", " cm")) or not params or (len(params[0]) == 3 and params[0].endswith(("m", "h"))): + # Skip multiword units and compound/end syllables + continue + else: + if not params or (len(params[0]) == 3 and params[0].endswith("m")): + # Skip multiword units + continue + paramstr = " ".join(saldo_params_to_suc.get(prm, prm.upper()) for prm in params) + for (pre, post) in _suc_tag_replacements: + m = re.match(pre, paramstr) + if m: + break + if m is None: + print(paramstr) + print() + sucfilter = m.expand(post).replace(" ", r"\.").replace("+", r"\+") + tagmap[saldotag] = set(suctag for suctag in suc_tags + if re.match(sucfilter, suctag)) + return tagmap + + +saldo_to_suc = _make_saldo_to_suc() +saldo_to_suc_compound = _make_saldo_to_suc(compound=True) # For use with the compound module + +saldo_to_parole = dict((saldotag, set(suc_to_parole[suctag] for suctag in suctags)) + for saldotag, suctags in list(saldo_to_suc.items())) + +saldo_to_granska = dict((saldotag, set().union(*(suc_to_granska[suctag] for suctag in suctags))) + for saldotag, suctags in list(saldo_to_suc.items())) + +saldo_to_saldo = dict((saldotag, {saldotag}) for saldotag in saldo_tags) + + +mappings = { + "granska_to_parole": granska_to_parole, + "granska_to_suc": granska_to_suc, + "parole_to_granska": parole_to_granska, + "parole_to_suc": parole_to_suc, + "saldo_to_granska": saldo_to_granska, + "saldo_to_parole": saldo_to_parole, + "saldo_to_saldo": saldo_to_saldo, + "saldo_to_suc_compound": saldo_to_suc_compound, + "saldo_to_suc": saldo_to_suc, + "saldo_pos_to_suc": saldo_pos_to_suc, + "suc_descriptions": suc_descriptions, + "suc_to_granska": suc_to_granska, + "suc_to_parole": suc_to_parole, + "suc_to_simple": suc_to_simple, + "saldo_params_to_suc": saldo_params_to_suc, +} + +tags = { + "granska_tags": granska_tags, + "parole_tags": parole_tags, + "saldo_tags": saldo_tags, + "simple_tags": simple_tags, + "suc_tags": suc_tags, +} diff --git a/sparv/core/Snakefile b/sparv/core/Snakefile index 3c3c154e..00950345 100644 --- a/sparv/core/Snakefile +++ b/sparv/core/Snakefile @@ -1,7 +1,6 @@ """Snakefile used by Snakemake.""" from pathlib import Path -import snakemake.io from rich import box from rich.highlighter import ReprHighlighter from rich.padding import Padding @@ -9,7 +8,7 @@ from rich.table import Table from rich.text import Text from snakemake.logging import logger -from sparv import util +from sparv.api import SparvErrorMessage, util from sparv.core import config as sparv_config from sparv.core import paths, registry, snake_utils, snake_prints from sparv.core.console import console @@ -36,7 +35,7 @@ def make_rules(config_missing: bool) -> None: try: snake_storage.preloader_info = preload.get_preloader_info(config["socket"]) except ConnectionRefusedError: - raise util.SparvErrorMessage("Could not connect to the socket '{}'".format(config["socket"])) + raise SparvErrorMessage("Could not connect to the socket '{}'".format(config["socket"])) # Create rules for all available annotation functions @@ -94,10 +93,11 @@ def make_rule(module_name: str, f_name: str, annotator_info: dict, config_missin f_name=rule_storage.f_name, parameters=snake_utils.get_parameters(rule_storage), export_dirs=rule_storage.export_dirs, - 
doc=snake_utils.doc_value(rule_storage), + source_file=snake_utils.file_value(rule_storage), use_preloader=rule_storage.use_preloader, socket=config.get("socket"), - force_preloader=config.get("force_preloader", False) + force_preloader=config.get("force_preloader", False), + compression=sparv_config.get("sparv.compression") resources: **resources # We use "script" instead of "run" since with "run" the whole Snakefile would have to be reloaded for every # single job, due to how Snakemake creates processes for run-jobs. @@ -118,23 +118,28 @@ def make_all_files_rule(rule_storage: snake_utils.RuleStorage) -> None: dependencies = rule_storage.outputs if not rule_storage.abstract else rule_storage.inputs - # Prepend work dir to paths if needed (usually included in the {doc} wildcard but here it needs to be explicit) + # Prepend work dir to paths if needed (usually included in the {file} wildcard but here it needs to be explicit) rule_outputs = [paths.work_dir / o if not (paths.work_dir in o.parents or paths.export_dir in o.parents) else o for o in dependencies] - # Expand {doc} wildcard to every corpus document + # Expand {file} wildcard to every corpus file rule_outputs = expand(rule_outputs, - doc=snake_utils.get_doc_values(config, snake_storage), + file=snake_utils.get_file_values(config, snake_storage), **snake_utils.get_wildcard_values(config)) - # Convert paths to IOFile objects so Snakemake knows which rule they come from (in case of ambiguity) - rule_outputs = [snakemake.io.IOFile(f, rule=sm_rule) for f in rule_outputs] - rule: name: rule_storage.target_name input: rule_outputs + if not rule_storage.abstract: + # Set rule dependencies for every file, so Snakemake knows which rule to use in case of ambiguity. + # Converting the values of rule_outputs to snakemake.io.IOFile objects is not enough, since file paths must match + # for that to work (which they don't do once we've expanded the {file} wildcard). 
+ this_sm_rule = getattr(rules, rule_storage.target_name).rule + for f in this_sm_rule.input: + this_sm_rule.dependencies[f] = sm_rule + # Init the storage for some essential variables involving all rules snake_storage = snake_utils.SnakeStorage() @@ -150,9 +155,9 @@ sparv_config.apply_presets() # Add classes from config to registry registry.annotation_classes["config_classes"] = sparv_config.config.get("classes", {}) -# Set text class from document annotation +# Set text class from text annotation if not config_missing: - sparv_config.handle_document_annotation() + sparv_config.handle_text_annotation() # Let exporters and importers inherit config values from 'export' and 'import' sections for module in registry.modules: @@ -164,7 +169,7 @@ for module in registry.modules: # Collect list of all explicitly used annotations (without class expansion) for key in registry.annotation_sources: - registry.explicit_annotations_raw.update(a[0] for a in util.parse_annotation_list(sparv_config.get(key, []))) + registry.explicit_annotations_raw.update(a[0] for a in util.misc.parse_annotation_list(sparv_config.get(key, []))) # Figure out classes from annotation usage registry.find_implicit_classes() @@ -173,7 +178,7 @@ registry.find_implicit_classes() for key in registry.annotation_sources: registry.explicit_annotations.update( registry.expand_variables(a[0])[0] - for a in util.parse_annotation_list(sparv_config.get(key, []))) + for a in util.misc.parse_annotation_list(sparv_config.get(key, []))) # Load modules and create automatic rules make_rules(config_missing) @@ -188,11 +193,11 @@ sparv_config.validate_config() reverse_config_usage = snake_utils.get_reverse_config_usage() # Abort if all selected targets require source files but no files are available -if not config_missing and selected_targets and not snake_utils.get_source_files(snake_storage.source_files): +if not config_missing and selected_targets and not snake_storage.source_files: named_targets = set(i for i, _ in snake_storage.named_targets) named_targets.update(("export_corpus", "install_corpus")) if set(selected_targets).issubset(named_targets): - raise util.SparvErrorMessage("No source files available!") + raise SparvErrorMessage("No source files available!") # ============================================================================== # Static Snakemake Rules @@ -235,8 +240,10 @@ rule languages: # Rule to list all annotation presets rule presets: run: - resolved_presets = dict( - (preset, sparv_config.resolve_presets(sparv_config.presets[preset])) for preset in sparv_config.presets) + resolved_presets = {} + for preset, annots in sparv_config.presets.items(): + preset_annotations, _ = sparv_config.resolve_presets(sparv_config.presets[preset], {}, {}) + resolved_presets[preset] = preset_annotations snake_prints.prettyprint_yaml(resolved_presets) @@ -281,17 +288,37 @@ rule list_targets: # Rule to list all exports rule list_exports: run: - print() - table = Table(title="Available corpus output formats (exports)", box=box.SIMPLE, show_header=False, - title_justify="left") - table.add_column(no_wrap=True) - table.add_column() + selected_export_names = sparv_config.get("export.default", []) + selected_exports = [(t, d, l) for t, d, l in sorted(snake_storage.export_targets) if t in selected_export_names] + other_exports = [(t, d, l) for t, d, l in sorted(snake_storage.export_targets) if not t in selected_export_names] - for target, desc, language in sorted(snake_storage.export_targets): - if not language or 
sparv_config.get("metadata.language") in language: - table.add_row(target, desc) - console.print(table) - console.print(" Default: xml_export:pretty") + if selected_exports: + print() + table = Table(title="Selected corpus exports (output formats)", box=box.SIMPLE, show_header=False, + title_justify="left") + table.add_column(no_wrap=True) + table.add_column() + for target, desc, language in selected_exports: + if not language or registry.check_language(sparv_config.get("metadata.language"), language, + sparv_config.get("metadata.variety")): + table.add_row(target, desc) + console.print(table) + + if other_exports: + print() + title = "Other available corpus exports (output formats)" + table = Table(title=title, box=box.SIMPLE, show_header=False, title_justify="left") + table.add_column(no_wrap=True) + table.add_column() + for target, desc, language in other_exports: + if not language or registry.check_language(sparv_config.get("metadata.language"), language, + sparv_config.get("metadata.variety")): + table.add_row(target, desc) + console.print(table) + + console.print("[i]Note:[/i] Use the 'export.default' section in your corpus configuration to select what " + "exports should be produced when running 'sparv run' without arguments. If 'export.default' is " + "not specified, 'xml_export:pretty' is run by default.") # Rule to list all input files @@ -299,7 +326,8 @@ rule files: run: from rich.columns import Columns print("Available input files:\n") - console.print(Columns(sorted(snake_utils.get_source_files(snake_storage.source_files)), column_first=True, + # Convert to Text to get rid of syntax highlighting + console.print(Columns([Text(f) for f in sorted(snake_storage.source_files)], column_first=True, padding=(0, 3))) @@ -366,25 +394,27 @@ rule list_installs: # Rule for making exports defined in corpus config if "export_corpus" in selected_targets: + export_targets = snake_utils.get_export_targets(snake_storage, rules, + file=snake_utils.get_file_values(config, snake_storage), wildcards=snake_utils.get_wildcard_values(config)) + rule export_corpus: - input: - snake_utils.get_export_targets(snake_storage, rules, doc=snake_utils.get_doc_values(config, snake_storage), - wildcards=snake_utils.get_wildcard_values(config)) + input: [f for r in export_targets for f in r[1]] + # Set rule dependencies for every file, so Snakemake knows which rule to use in case of ambiguity + for r, ff in export_targets: + if r is None: + continue + for f in ff: + rules.export_corpus.rule.dependencies[f] = r # Rule for making installations if "install_corpus" in selected_targets: install_inputs = snake_utils.get_install_outputs(snake_storage, config.get("install_types")) - if config.get("install_types") and not install_inputs: - raise util.SparvErrorMessage("Unknown installation{}: {}".format( - "s" if len(config.get("install_types")) > 1 else "", - ", ".join(config.get("install_types")) - )) - elif not install_inputs: - raise util.SparvErrorMessage("Please specify what you would like to install, either by supplying arguments " - "(e.g. `sparv install xml_export:install_original`) or by adding installations in " - "The `install` section of your corpus config file.\n" - "You can list available installations with `sparv install -l`.") + if not install_inputs: + raise SparvErrorMessage("Please specify what you would like to install, either by supplying arguments " + "(e.g. 
`sparv install xml_export:install`) or by adding installations in " + "The `install` section of your corpus config file.\n" + "You can list available installations with `sparv install -l`.") else: rule install_corpus: input: @@ -405,7 +435,8 @@ rule list_models: table.add_column(no_wrap=True) table.add_column() for target, desc, language in sorted(snake_storage.model_targets): - if language and sparv_config.get("metadata.language") in language: + if language and registry.check_language(sparv_config.get("metadata.language"), language, + sparv_config.get("metadata.variety")): table.add_row(target, desc) console.print(table) @@ -470,9 +501,9 @@ rule preload: preload.serve(config["socket"], config["processes"], snake_storage) elif config["preload_command"] == "stop": if not Path(config["socket"]).is_socket(): - raise util.SparvErrorMessage(f"Socket file '{config['socket']}' doesn't exist or isn't a socket.") + raise SparvErrorMessage(f"Socket file '{config['socket']}' doesn't exist or isn't a socket.") elif not preload.stop(config.get("socket")): - raise util.SparvErrorMessage(f"Could not connect to socket '{config.get('socket')}'.") + raise SparvErrorMessage(f"Could not connect to socket '{config.get('socket')}'.") rule preload_list: diff --git a/sparv/core/config.py b/sparv/core/config.py index 49985e75..835376eb 100644 --- a/sparv/core/config.py +++ b/sparv/core/config.py @@ -1,7 +1,6 @@ """Functions for parsing the Sparv configuration files.""" import copy -import logging from collections import defaultdict from functools import reduce from pathlib import Path @@ -10,10 +9,10 @@ import yaml import yaml.scanner -from sparv import util from sparv.core import paths, registry +from sparv.core.misc import SparvErrorMessage, get_logger -log = logging.getLogger(__name__) +logger = get_logger(__name__) DEFAULT_CONFIG = paths.default_config_file PRESETS_DIR = paths.presets_dir @@ -21,7 +20,7 @@ MAX_THREADS = "threads" config = {} # Full configuration -presets = {} # Annotation presets +presets = {} # Annotation presets, needs to be global (accessed by Snakefile) _config_user = {} # Local corpus config _config_default = {} # Default config @@ -32,7 +31,7 @@ "install": {"_source": "core"}, PARENT: {"_source": "core"}, MAX_THREADS: {"_source": "core"}, - "preload": {"_source": "core"} + "preload": {"_source": "core"} } config_usage = defaultdict(set) # For each config key, a list of annotators using that key @@ -40,20 +39,21 @@ class Unset: """Class used to represent a config value that isn't set.""" + pass def read_yaml(yaml_file): """Read YAML file and handle errors.""" try: - with open(yaml_file) as f: + with open(yaml_file, encoding="utf-8") as f: data = yaml.load(f, Loader=yaml.FullLoader) except yaml.parser.ParserError as e: - raise util.SparvErrorMessage("Could not parse the configuration file:\n" + str(e)) + raise SparvErrorMessage("Could not parse the configuration file:\n" + str(e)) except yaml.scanner.ScannerError as e: - raise util.SparvErrorMessage("An error occurred while reading the configuration file:\n" + str(e)) + raise SparvErrorMessage("An error occurred while reading the configuration file:\n" + str(e)) except FileNotFoundError: - raise util.SparvErrorMessage(f"Could not find the config file '{yaml_file}'") + raise SparvErrorMessage(f"Could not find the config file '{yaml_file}'") return data or {} @@ -71,7 +71,7 @@ def load_config(config_file: Optional[str], config_dict: Optional[dict] = None) if DEFAULT_CONFIG.is_file(): _config_default = read_yaml(DEFAULT_CONFIG) else: - 
log.warning("Default config file is missing: {}".format(DEFAULT_CONFIG)) + logger.warning("Default config file is missing: {}".format(DEFAULT_CONFIG)) _config_default = {} if config_file: @@ -79,7 +79,7 @@ def load_config(config_file: Optional[str], config_dict: Optional[dict] = None) global _config_user _config_user = read_yaml(config_file) or {} - def handle_parents(cfg, current_dir="."): + def handle_parents(cfg, current_dir=Path(".")) -> dict: """Combine parent configs recursively.""" combined_parents = {} if cfg.get(PARENT): @@ -87,7 +87,7 @@ def handle_parents(cfg, current_dir="."): if isinstance(parents, str): parents = [parents] for parent in parents: - parent_path = Path(current_dir, parent) + parent_path = current_dir / parent config_parent = read_yaml(parent_path) config_parent = handle_parents(config_parent, parent_path.parent) combined_parents = _merge_dicts(config_parent, combined_parents) @@ -110,8 +110,8 @@ def handle_parents(cfg, current_dir="."): if key == PARENT: continue if not isinstance(config[key], (dict, list)): - raise util.SparvErrorMessage(f"The config section '{key}' could not be parsed.", module="sparv", - function="config") + raise SparvErrorMessage(f"The config section '{key}' could not be parsed.", module="sparv", + function="config") def dump_config(data, resolve_alias=False, sort_keys=False): @@ -130,11 +130,11 @@ def increase_indent(self, flow=False, indentless=False): return super(IndentDumper, self).increase_indent(flow) # Add custom string representer for prettier multiline strings - def str_presenter(dumper, data): - if len(data.splitlines()) > 1: # check for multiline string - return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|') - return dumper.represent_scalar('tag:yaml.org,2002:str', data) - yaml.add_representer(str, str_presenter) + def str_representer(dumper, data): + if len(data.splitlines()) > 1: # Check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + yaml.add_representer(str, str_representer) if resolve_alias: # Resolve aliases and replace them with their anchors' contents @@ -196,15 +196,15 @@ def update_config(new_config): config = _merge_dicts(copy.deepcopy(new_config), config) -def _merge_dicts(user, default): - """Merge corpus config with default config, letting user values override default values.""" - if isinstance(user, dict) and isinstance(default, dict): +def _merge_dicts(d: dict, default: dict): + """Merge dict 'd' with dict 'default', letting values from 'd' override default values.""" + if isinstance(d, dict) and isinstance(default, dict): for k, v in default.items(): - if k not in user: - user[k] = v + if k not in d: + d[k] = v else: - user[k] = _merge_dicts(user[k], v) - return user + d[k] = _merge_dicts(d[k], v) + return d def add_to_structure(name, default=None, description=None, annotator: Optional[str] = None): @@ -237,7 +237,7 @@ def validate_module_config(): _get(config_key, config_structure) except KeyError: annotators = config_usage[config_key] - raise util.SparvErrorMessage( + raise SparvErrorMessage( "The annotator{} {} {} trying to access the config key '{}' which isn't declared anywhere.".format( "s" if len(annotators) > 1 else "", ", ".join(annotators), "are" if len(annotators) > 1 else "is", config_key), "sparv", "config") @@ -251,21 +251,24 @@ def validate_config(config_dict=None, structure=None, parent=""): path = (parent + "." 
+ key) if parent else key if key not in structure: if not parent: - raise util.SparvErrorMessage(f"Unknown key in config file: '{path}'. No module with that name found.", - module="sparv", function="config") + raise SparvErrorMessage(f"Unknown key in config file: '{path}'. No module with that name found.", + module="sparv", function="config") else: module_name = parent.split(".", 1)[0] - raise util.SparvErrorMessage(f"Unknown key in config file: '{path}'. The module '{module_name}' " - f"doesn't have an option with that name.", - module="sparv", function="config") + raise SparvErrorMessage(f"Unknown key in config file: '{path}'. The module '{module_name}' " + f"doesn't have an option with that name.", + module="sparv", function="config") elif not structure[key].get("_source"): validate_config(config_dict[key], structure[key], path) -def load_presets(lang): - """Read presets files, set global presets variable and return dictionary with preset classes.""" +def load_presets(lang, lang_variety): + """Read presets files and return dictionaries with all available presets annotations and preset classes.""" global presets class_dict = {} + full_lang = lang + if lang_variety: + full_lang = lang + "-" + lang_variety for f in PRESETS_DIR.rglob("*.yaml"): presets_yaml = read_yaml(f) @@ -273,7 +276,7 @@ def load_presets(lang): # Skip preset if it is not valid for lang if lang: languages = presets_yaml.get("languages", []) - if languages and lang not in languages: + if languages and lang not in languages and full_lang not in languages: continue # Make sure preset names are upper case @@ -294,35 +297,35 @@ def load_presets(lang): return class_dict -def resolve_presets(annotations): +def resolve_presets(annotations, class_dict, preset_classes): """Resolve annotation presets into actual annotations.""" - result = [] + global presets + result_annotations = [] for annotation in annotations: if annotation in presets: - result.extend(resolve_presets(presets[annotation])) + if annotation in class_dict: + preset_classes = _merge_dicts(preset_classes, class_dict[annotation]) + result_annotations.extend(resolve_presets(presets[annotation], class_dict, preset_classes)[0]) else: - result.append(annotation) - return result + result_annotations.append(annotation) + return result_annotations, preset_classes def apply_presets(): """Resolve annotations from presets and set preset classes.""" # Load annotation presets and classes - class_dict = load_presets(get("metadata.language")) - - preset_classes = {} # Classes set by presets + class_dict = load_presets(get("metadata.language"), get("metadata.variety")) + preset_classes = {} # Go through annotation lists in config to find references to presets for a in registry.annotation_sources: annotations = get(a) if not annotations: continue - # Update classes set by presets - for annotation in annotations: - preset_classes.update(class_dict.get(annotation, {})) # Resolve presets and update annotation list in config - set_value(a, resolve_presets(annotations)) + annotations, preset_classes = resolve_presets(annotations, class_dict, preset_classes) + set_value(a, annotations) # Update classes default_classes = _config_default.get("classes", {}) @@ -332,19 +335,19 @@ def apply_presets(): config["classes"] = classes -def handle_document_annotation(): - """Copy document annotation to text class.""" - doc_elem = get("import.document_annotation") +def handle_text_annotation(): + """Copy text annotation to text class.""" + text_ann = get("import.text_annotation") - # Make sure that if 
both classes.text and import.document_annotation are set, that they have the same value - if get("classes.text") and doc_elem and get("classes.text") != doc_elem: - raise util.SparvErrorMessage( - "The config keys 'classes.text' and 'import.document_annotation' can't have different values.", + # Make sure that if both classes.text and import.text_annotation are set, that they have the same value + if get("classes.text") and text_ann and get("classes.text") != text_ann: + raise SparvErrorMessage( + "The config keys 'classes.text' and 'import.text_annotation' can't have different values.", "sparv", "config") - # If import.document_annotation is set, copy value to classes.text - if doc_elem: - set_default("classes.text", doc_elem) + # If import.text_annotation is set, copy value to classes.text + if text_ann: + set_default("classes.text", text_ann) def inherit_config(source: str, target: str) -> None: diff --git a/sparv/core/io.py b/sparv/core/io.py index abcb9310..e00dd533 100644 --- a/sparv/core/io.py +++ b/sparv/core/io.py @@ -1,16 +1,19 @@ """Corpus-related util functions like reading and writing annotations.""" +import bz2 +import gzip import heapq -import logging +import lzma import os import re from pathlib import Path -from typing import Union, Tuple, List, Optional +from typing import List, Optional, Tuple, Union +from sparv.api.classes import BaseAnnotation, BaseOutput from sparv.core import paths -from sparv.util.classes import BaseAnnotation, BaseOutput +from sparv.core.misc import get_logger, SparvErrorMessage -_log = logging.getLogger(__name__) +logger = get_logger(__name__) DOC_CHUNK_DELIM = ":" ELEM_ATTR_DELIM = ":" @@ -18,21 +21,32 @@ TEXT_FILE = "@text" STRUCTURE_FILE = "@structure" HEADERS_FILE = "@headers" +NAMESPACE_FILE = "@namespaces" +# Compression used for annotation files (can be changed using sparv.compression in config file) +compression = "gzip" -def annotation_exists(doc: str, annotation: BaseAnnotation): +_compressed_open = { + "none": open, + "gzip": gzip.open, + "bzip2": bz2.open, + "lzma": lzma.open +} + + +def annotation_exists(source_file: str, annotation: BaseAnnotation): """Check if an annotation file exists.""" - annotation_path = get_annotation_path(doc, annotation) + annotation_path = get_annotation_path(source_file, annotation) return os.path.exists(annotation_path) -def data_exists(doc: str, name: BaseAnnotation): +def data_exists(source_file: str, name: BaseAnnotation): """Check if an annotation data file exists.""" - annotation_path = get_annotation_path(doc, name, data=True) + annotation_path = get_annotation_path(source_file, name, data=True) return os.path.isfile(annotation_path) -def write_annotation(doc: str, annotation: BaseOutput, values, append: bool = False, +def write_annotation(source_file: str, annotation: BaseOutput, values, append: bool = False, allow_newlines: bool = False) -> None: """Write an annotation to one or more files. The file is overwritten if it exists. 
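
The hunk above introduces configurable compression for work-dir annotation files via the module-level `compression` setting and the `_compressed_open` table mapping compression names to opener callables. As a minimal, self-contained sketch of that pattern (assuming only the standard-library openers listed in `_compressed_open`; the helper names `write_lines`/`read_lines` and the file name `demo.annotation.gz` are illustrative stand-ins, not Sparv API):

    import bz2
    import gzip
    import lzma

    # Map a configured compression name to the matching opener, falling back to plain open().
    _openers = {
        "none": open,
        "gzip": gzip.open,
        "bzip2": bz2.open,
        "lzma": lzma.open,
    }

    def write_lines(path, lines, compression="gzip"):
        opener = _openers.get(compression, open)
        # gzip/bz2/lzma default to binary mode, so text mode is requested explicitly,
        # mirroring the mode handling done by open_annotation_file later in this diff.
        with opener(path, mode="wt", encoding="utf-8") as f:
            f.write("\n".join(lines) + "\n")

    def read_lines(path, compression="gzip"):
        opener = _openers.get(compression, open)
        with opener(path, mode="rt", encoding="utf-8") as f:
            return [line.rstrip("\n") for line in f]

    if __name__ == "__main__":
        write_lines("demo.annotation.gz", ["foo", "bar"])
        print(read_lines("demo.annotation.gz"))  # ['foo', 'bar']

The same lookup-with-fallback is what lets `sparv.compression` switch formats (or disable compression entirely) without touching the reading and writing code paths.
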
@@ -42,7 +56,7 @@ def write_annotation(doc: str, annotation: BaseOutput, values, append: bool = Fa if len(annotations) == 1: # Handle single annotation - _write_single_annotation(doc, annotations[0], values, append, annotation.root, allow_newlines) + _write_single_annotation(source_file, annotations[0], values, append, annotation.root, allow_newlines) else: elem_attrs = dict(split_annotation(ann) for ann in annotations) # Handle multiple annotations used as one @@ -50,28 +64,29 @@ def write_annotation(doc: str, annotation: BaseOutput, values, append: bool = Fa elem_attrs.values()), "Span annotations can not be written while treating multiple annotations as one." # Get spans and associated names for annotations. We need this information to figure out which value goes to # which annotation. - spans = read_annotation(doc, annotation, with_annotation_name=True, spans=True) + spans = read_annotation(source_file, annotation, with_annotation_name=True, spans=True) annotation_values = {elem: [] for elem in elem_attrs.keys()} for value, (_, annotation_name) in zip(values, spans): annotation_values[annotation_name].append(value) for annotation_name in annotation_values: - _write_single_annotation(doc, join_annotation(annotation_name, elem_attrs[annotation_name]), + _write_single_annotation(source_file, join_annotation(annotation_name, elem_attrs[annotation_name]), annotation_values[annotation_name], append, annotation.root, allow_newlines) -def _write_single_annotation(doc: str, annotation: str, values, append: bool, root: Path, allow_newlines: bool = False): +def _write_single_annotation(source_file: str, annotation: str, values, append: bool, root: Path, + allow_newlines: bool = False): """Write an annotation to a file.""" is_span = not split_annotation(annotation)[1] if is_span: # Make sure that spans are sorted assert all(values[i] <= values[i + 1] for i in range(len(values) - 1)), "Annotation spans must be sorted." 
- file_path = get_annotation_path(doc, annotation, root) + file_path = get_annotation_path(source_file, annotation, root) os.makedirs(os.path.dirname(file_path), exist_ok=True) mode = "a" if append else "w" - with open(file_path, mode) as f: + with open_annotation_file(file_path, mode) as f: ctr = 0 for value in values: if value is None: @@ -96,21 +111,48 @@ def _write_single_annotation(doc: str, annotation: str, values, append: bool, ro ctr += 1 # Update file modification time even if nothing was written os.utime(file_path, None) - _log.info(f"Wrote {ctr} items: {doc + '/' if doc else ''}{annotation}") + logger.info(f"Wrote {ctr} items: {source_file + '/' if source_file else ''}{annotation}") + + +def get_annotation_size(source_file: str, annotation: BaseAnnotation): + """Return number of lines in an annotation.""" + def _generator(reader_): + while True: + b = reader_(2 ** 16) + if not b: + break + yield b + + count = 0 + for ann in annotation.name.split(): + ann_file = get_annotation_path(source_file, ann, annotation.root) -def read_annotation_spans(doc: str, annotation: BaseAnnotation, decimals: bool = False, + try: + with open_annotation_file(ann_file, mode="rb") as f: + reader = f.raw.read if hasattr(f, "raw") and hasattr(f.raw, "read") else f.read + count += sum(buf.count(b"\n") for buf in _generator(reader)) + except (OSError, lzma.LZMAError, UnicodeDecodeError) as e: + # TODO: Use gzip.BadGzipFile instead of checking for "Not a gzipped file" once we require Python 3.8 + if isinstance(e, OSError) and not ("Not a gzipped file" in str(e) or str(e) == "Invalid data stream"): + raise e + raise_format_error(ann_file) + + return count + + +def read_annotation_spans(source_file: str, annotation: BaseAnnotation, decimals: bool = False, with_annotation_name: bool = False): """Iterate over the spans of an annotation.""" # Strip any annotation attributes - for span in read_annotation(doc, annotation, with_annotation_name, spans=True): + for span in read_annotation(source_file, annotation, with_annotation_name, spans=True): if not decimals: yield tuple(v[0] for v in span) else: yield span -def read_annotation(doc: str, annotation: BaseAnnotation, with_annotation_name: bool = False, +def read_annotation(source_file: str, annotation: BaseAnnotation, with_annotation_name: bool = False, allow_newlines: bool = False, spans: bool = False): """Yield each line from an annotation file.""" if spans: @@ -120,7 +162,7 @@ def read_annotation(doc: str, annotation: BaseAnnotation, with_annotation_name: root = annotation.root if len(annotations) == 1: # Handle single annotation - yield from _read_single_annotation(doc, annotations[0], with_annotation_name, root, allow_newlines) + yield from _read_single_annotation(source_file, annotations[0], with_annotation_name, root, allow_newlines) else: # Handle multiple annotations used as one @@ -130,68 +172,83 @@ def read_annotation(doc: str, annotation: BaseAnnotation, with_annotation_name: "annotation is not allowed." 
# Get iterators for all annotations - all_annotations = {split_annotation(ann)[0]: _read_single_annotation(doc, ann, with_annotation_name, root, - allow_newlines) + all_annotations = {split_annotation(ann)[0]: _read_single_annotation(source_file, ann, with_annotation_name, + root, allow_newlines) for ann in annotations} # We need to read the annotation spans to be able to interleave the values in the correct order - for _, ann in heapq.merge(*[_read_single_annotation(doc, split_annotation(ann)[0], with_annotation_name=True, + for _, ann in heapq.merge(*[_read_single_annotation(source_file, split_annotation(ann)[0], + with_annotation_name=True, root=root, allow_newlines=allow_newlines) for ann in annotations]): yield next(all_annotations[ann]) -def read_annotation_attributes(doc: str, annotations: Union[List[BaseAnnotation], Tuple[BaseAnnotation, ...]], +def read_annotation_attributes(source_file: str, annotations: Union[List[BaseAnnotation], Tuple[BaseAnnotation, ...]], with_annotation_name: bool = False, allow_newlines: bool = False): """Yield tuples of multiple attributes on the same annotation.""" assert isinstance(annotations, (tuple, list)), "'annotations' argument must be tuple or list" assert len(set(split_annotation(annotation)[0] for annotation in annotations)) == 1, "All attributes need to be " \ "for the same annotation" - return zip(*[read_annotation(doc, annotation, with_annotation_name, allow_newlines) + return zip(*[read_annotation(source_file, annotation, with_annotation_name, allow_newlines) for annotation in annotations]) -def _read_single_annotation(doc: str, annotation: str, with_annotation_name: bool, root: Path = None, +def _read_single_annotation(source_file: str, annotation: str, with_annotation_name: bool, root: Path = None, allow_newlines: bool = False): """Read a single annotation file.""" - ann_file = get_annotation_path(doc, annotation, root) + ann_file = get_annotation_path(source_file, annotation, root) - with open(ann_file) as f: + with open_annotation_file(ann_file) as f: ctr = 0 - for line in f: - value = line.rstrip("\n\r") - if not split_annotation(annotation)[1]: # If this is a span annotation - value = tuple(tuple(map(int, pos.split("."))) for pos in value.split("-")) - elif allow_newlines: - # Replace literal "\n" with line break (if we allow "\n" in values) - value = re.sub(r"((? 
Tuple[str, str]: return elem, attr -def join_annotation(name: str, attribute: str) -> str: +def join_annotation(name: str, attribute: Optional[str]) -> str: """Join annotation name and attribute.""" return ELEM_ATTR_DELIM.join((name, attribute)) if attribute else name -def get_annotation_path(doc: Optional[str], annotation: Union[BaseAnnotation, str], root: Path = None, +def get_annotation_path(source_file: Optional[str], annotation: Union[BaseAnnotation, str], root: Path = None, data: bool = False) -> Path: - """Construct a path to an annotation file given a doc and annotation.""" + """Construct a path to an annotation file given a source filename and annotation.""" chunk = "" - if doc: - doc, _, chunk = doc.partition(DOC_CHUNK_DELIM) + if source_file: + source_file, _, chunk = source_file.partition(DOC_CHUNK_DELIM) elem, attr = split_annotation(annotation) if data: - if doc: - path = paths.work_dir / doc / chunk / elem + if source_file: + path = paths.work_dir / source_file / chunk / elem else: path = paths.work_dir / elem else: if not attr: attr = SPAN_ANNOTATION - path = paths.work_dir / doc / chunk / elem / attr + path = paths.work_dir / source_file / chunk / elem / attr if root: path = root / path @@ -232,3 +289,20 @@ def get_annotation_path(doc: Optional[str], annotation: Union[BaseAnnotation, st path = annotation.root / path return path + + +def open_annotation_file(filename, mode="rt", encoding=None, errors=None, newline=None): + """Read and write annotation and data files using different kinds of compression.""" + if mode in "rwxa": + # Text mode is the default for open(), whereas gzip, bz2 and lzma uses binary mode. + # We adopt text mode as default. + mode += "t" + opener = _compressed_open.get(compression, open) + return opener(filename, mode=mode, encoding=encoding, errors=errors, newline=newline) + + +def raise_format_error(file_path): + """Raise a SparvErrorMessage about workdir files having the wrong format.""" + raise SparvErrorMessage(f"Compression of workdir files is set to '{compression}', but '{file_path}' is in another " + "format. 
Use the configuration key 'sparv.compression' to set the correct compression or " + "use 'sparv clean' to start over with a clean workdir.") diff --git a/sparv/core/log_handler.py b/sparv/core/log_handler.py index a1e12f26..040d2aea 100644 --- a/sparv/core/log_handler.py +++ b/sparv/core/log_handler.py @@ -24,31 +24,47 @@ from sparv.core import io, paths from sparv.core.console import console -from sparv.util.misc import SparvErrorMessage +from sparv.core.misc import SparvErrorMessage LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" LOG_FORMAT_DEBUG = "%(asctime)s - %(name)s (%(process)d) - %(levelname)s - %(message)s" DATE_FORMAT = "%Y-%m-%d %H:%M:%S" TIME_FORMAT = "%H:%M:%S" +# Variables set by setup_logging() +current_file = None +current_job = None + # Add internal logging level used for non-logging-related communication from child processes to log handler INTERNAL = 100 logging.addLevelName(INTERNAL, "INTERNAL") + +def _log_progress(self, progress=None, advance=None, total=None): + """Log progress of task.""" + if self.isEnabledFor(INTERNAL): + self._log(INTERNAL, "progress", (), extra={"progress": progress, "advance": advance, "total": total, + "job": current_job, "file": current_file}) + + +# Add progress function to logger +logging.progress = _log_progress +logging.Logger.progress = _log_progress + # Add logging level used for progress output (must be lower than INTERNAL) PROGRESS = 90 logging.addLevelName(PROGRESS, "PROGRESS") -def export_dirs(self, dirs): +def _export_dirs(self, dirs): """Send list of export dirs to log handler.""" if self.isEnabledFor(INTERNAL): self._log(INTERNAL, "export_dirs", (), extra={"export_dirs": dirs}) # Add log function to logger -logging.export_dirs = export_dirs -logging.Logger.export_dirs = export_dirs +logging.export_dirs = _export_dirs +logging.Logger.export_dirs = _export_dirs # Messages from the Sparv core messages = { @@ -61,6 +77,7 @@ def export_dirs(self, dirs): "corpus configuration file, like misspelled annotation names (including unintentional " \ "whitespace characters) or references to non-existent or implicit source annotations." 
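The log handler changes above attach a `progress` method to `logging.Logger`, so an annotator can report per-task progress through the ordinary logging machinery and the handler turns those records into the new individual progress bars. A minimal sketch of how an annotator might call it, assuming `sparv.core.log_handler` has already been imported (which is what adds the `progress` method) and using a plain `logging.getLogger` in place of Sparv's own logger helper:

```python
import logging

# Assumes sparv.core.log_handler has been imported, which patches logging.Logger
# with the progress() method defined in the diff above.
logger = logging.getLogger("sparv.modules.example")  # a child of "sparv.modules", like Sparv's own loggers

def annotate(sentences):
    """Toy annotator body showing only the progress calls."""
    logger.progress(total=len(sentences))  # create this task's bar with a known total
    for _sentence in sentences:
        # ... real annotation work would go here ...
        logger.progress()  # advance the bar by one step (the handler defaults advance to 1)
```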
+ class LogRecordStreamHandler(socketserver.StreamRequestHandler): """Handler for streaming logging requests.""" @@ -127,14 +144,38 @@ def filter(self, record): class InternalLogHandler(logging.Handler): """Handler for internal log messages.""" - def __init__(self, export_dirs_list): + def __init__(self, export_dirs_list, progress_, jobs, job_ids): self.export_dirs_list = export_dirs_list + self.progress: progress.Progress = progress_ + self.jobs = jobs + self.job_ids = job_ids super().__init__() def emit(self, record): """Handle log record.""" if record.msg == "export_dirs": self.export_dirs_list.update(record.export_dirs) + elif record.msg == "progress": + job_id = self.job_ids.get((record.job, record.file)) + if job_id is not None: + if not self.jobs[job_id]["task"]: + self.jobs[job_id]["task"] = self.progress.add_task( + "", + start=bool(record.total), + completed=record.progress or record.advance or 0, + total=record.total or 100.0 + ) + else: + try: + if record.total: + self.progress.start_task(self.jobs[job_id]["task"]) + self.progress.update(self.jobs[job_id]["task"], total=record.total) + if record.progress: + self.progress.update(self.jobs[job_id]["task"], completed=record.progress) + elif record.advance or not record.total: + self.progress.advance(self.jobs[job_id]["task"], advance=record.advance or 1) + except KeyError: + pass class ModifiedRichHandler(RichHandler): @@ -150,34 +191,42 @@ def emit(self, record: logging.LogRecord) -> None: class ProgressWithTable(progress.Progress): """Progress bar with additional table.""" - def __init__(self, all_tasks, current_tasks, *args, **kwargs): + def __init__(self, all_tasks, current_tasks, max_len, *args, **kwargs): self.all_tasks = all_tasks self.current_tasks = current_tasks - self.task_max_len = 0 + self.task_max_len = max_len super().__init__(*args, **kwargs) def get_renderables(self): """Get a number of renderables for the progress display.""" - if not self.task_max_len and self.all_tasks: - self.task_max_len = max(map(len, self.all_tasks)) - # Progress bar - yield self.make_tasks_table(self.tasks) + yield self.make_tasks_table(self.tasks[0:1]) # Task table if self.all_tasks: + rows = [] + elapsed_max_len = 7 + bar_col = progress.BarColumn(bar_width=20) + for task in list(self.current_tasks.values()): # Make a copy to avoid mutations while iterating + elapsed = str(timedelta(seconds=round(time.time() - task["starttime"]))) + if len(elapsed) > elapsed_max_len: + elapsed_max_len = len(elapsed) + rows.append(( + task["name"], + f"[dim]{task['file']}[/dim]", + bar_col(self._tasks[task["task"]]) if task["task"] else "", + elapsed + )) + table = Table(show_header=False, box=box.SIMPLE, expand=True) table.add_column("Task", no_wrap=True, min_width=self.task_max_len + 2, ratio=1) - table.add_column("Document", no_wrap=True) - table.add_column("Elapsed", no_wrap=True, width=8, justify="right", style="progress.remaining") - table.add_row("[b]Task[/]", "[b]Document[/]", "[default b]Elapsed[/]") - for task in self.current_tasks.values(): - elapsed = round(time.time() - task["starttime"]) - table.add_row( - task["name"], - f"[dim]{task['doc']}[/dim]", - str(timedelta(seconds=elapsed)) - ) + table.add_column("File", no_wrap=True) + table.add_column("Bar", width=10) + table.add_column("Elapsed", no_wrap=True, width=elapsed_max_len, justify="right", + style="progress.remaining") + table.add_row("[b]Task[/]", "[b]File[/]", "", "[default b]Elapsed[/]") + for row in rows: + table.add_row(*row) yield table @@ -186,7 +235,7 @@ class LogHandler: icon = 
"\U0001f426" - def __init__(self, progressbar=True, log_level=None, log_file_level=None, verbose=False, + def __init__(self, progressbar=True, log_level=None, log_file_level=None, simple=False, stats=False, pass_through=False, dry_run=False): """Initialize log handler. @@ -194,12 +243,13 @@ def __init__(self, progressbar=True, log_level=None, log_file_level=None, verbos progressbar: Set to False to disable progress bar. Enabled by default. log_level: Log level for logging to stdout. log_file_level: Log level for logging to file. - verbose: Set to True to show more info about currently running tasks. + simple: Set to True to show less info about currently running jobs. + stats: Set to True to show stats after completion. pass_through: Let Snakemake's log messages pass through uninterrupted. dry_run: Set to True to print summary about jobs. """ self.use_progressbar = progressbar and console.is_terminal - self.verbose = verbose and console.is_terminal + self.simple = simple or not console.is_terminal self.pass_through = pass_through self.dry_run = dry_run self.log_level = log_level @@ -215,6 +265,9 @@ def __init__(self, progressbar=True, log_level=None, log_file_level=None, verbos self.export_dirs = set() self.start_time = time.time() self.jobs = {} + self.jobs_max_len = 0 + self.stats = stats + self.stats_data = defaultdict(float) self.logger = None # Progress bar related variables @@ -222,7 +275,8 @@ def __init__(self, progressbar=True, log_level=None, log_file_level=None, verbos self.bar: Optional[progress.TaskID] = None self.bar_started: bool = False self.last_percentage = 0 - self.current_tasks = OrderedDict() + self.current_jobs = OrderedDict() + self.job_ids = {} # Translation from (Sparv task name, source file) to Snakemake job ID # Create a simple TCP socket-based logging receiver tcpserver = socketserver.ThreadingTCPServer(("localhost", 0), RequestHandlerClass=LogRecordStreamHandler) @@ -272,7 +326,7 @@ def setup_loggers(self): self.logger.addHandler(levelcount_handler) # Internal log handler - internal_handler = InternalLogHandler(self.export_dirs) + internal_handler = InternalLogHandler(self.export_dirs, self.progress, self.current_jobs, self.job_ids) internal_handler.setLevel(INTERNAL) self.logger.addHandler(internal_handler) @@ -281,16 +335,17 @@ def setup_bar(self): print() progress_layout = [ progress.SpinnerColumn("dots2"), - progress.BarColumn(bar_width=None if self.verbose else 40), + progress.BarColumn(bar_width=None if not self.simple else 40), progress.TextColumn("[progress.description]{task.description}"), progress.TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), progress.TextColumn("[progress.remaining]{task.completed} of {task.total} tasks"), progress.TextColumn("{task.fields[text]}") ] - if self.verbose: - self.progress = ProgressWithTable(self.jobs, self.current_tasks, *progress_layout, console=console) - else: + if self.simple: self.progress = progress.Progress(*progress_layout, console=console) + else: + self.progress = ProgressWithTable(self.jobs, self.current_jobs, self.jobs_max_len, + *progress_layout, console=console) self.progress.start() self.bar = self.progress.add_task(self.icon, start=False, total=0, text="[dim]Preparing...[/dim]") @@ -347,7 +402,7 @@ def missing_class_message(source, classes=None): if "text" in _variables: _message += "\n\nNote: The 'text' class can also be set using the configuration variable " \ - "'import.document_annotation', but only if it refers to an annotation from the " \ + "'import.text_annotation', but only if it 
refers to an annotation from the " \ "source files." self.messages["error"].append((source, _message)) @@ -375,7 +430,7 @@ def missing_annotations_or_files(source, files): " • {}\n".format( "s" if len(missing_annotations) > 1 else "", "are" if len(missing_annotations) > 1 else "is", - "\n • ".join(":".join(ann) if len(ann) == 2 else ann for ann in missing_annotations) + "\n • ".join(":".join(ann) if len(ann) == 2 else ann[0] for ann in missing_annotations) ) ] if missing_other: @@ -403,6 +458,8 @@ def missing_annotations_or_files(source, files): _, count, job = j.split("\t") self.jobs[job.replace("::", ":")] = int(count) + self.jobs_max_len = max(map(len, self.jobs)) + if self.use_progressbar and not self.bar_started: # Get number of jobs and start progress bar if total_jobs.isdigit(): @@ -426,21 +483,30 @@ def missing_annotations_or_files(source, files): elif level == "job_info" and self.use_progressbar: if msg["msg"] and self.bar is not None: # Update progress status message - self.progress.update(self.bar, text=msg["msg"] if not self.verbose else "") + self.progress.update(self.bar, text=msg["msg"] if self.simple else "") - if self.verbose: - doc = msg["wildcards"].get("doc", "") - if doc.startswith(str(paths.work_dir)): - doc = doc[len(str(paths.work_dir)) + 1:] + if not self.simple: + file = msg["wildcards"].get("file", "") + if file.startswith(str(paths.work_dir)): + file = file[len(str(paths.work_dir)) + 1:] - self.current_tasks[msg["jobid"]] = { + self.current_jobs[msg["jobid"]] = { + "task": None, "name": msg["msg"], "starttime": time.time(), - "doc": doc + "file": file } - elif level == "job_finished" and self.use_progressbar: - self.current_tasks.pop(msg["jobid"], None) + self.job_ids[(msg["msg"], file)] = msg["jobid"] + + elif level == "job_finished" and self.use_progressbar and msg["jobid"] in self.current_jobs: + this_job = self.current_jobs[msg["jobid"]] + if self.stats: + self.stats_data[this_job["name"]] += time.time() - this_job["starttime"] + if this_job["task"]: + self.progress.remove_task(this_job["task"]) + self.job_ids.pop((this_job["name"], this_job["file"]), None) + self.current_jobs.pop(msg["jobid"], None) elif level == "info": if self.pass_through or msg["msg"] == "Nothing to be done.": @@ -504,6 +570,16 @@ def missing_annotations_or_files(source, files): "directory, run 'sparv run --unlock' to remove the lock." self.messages["error"].append((None, message)) handled = True + elif "IncompleteFilesException:" in msg["msg"]: + msg_contents = re.search(r"Incomplete files:\n(.+)", msg["msg"], flags=re.DOTALL) + incomplete_files = "\n • ".join(msg_contents.group(1).strip().splitlines()) + message = "The files below seem to be incomplete. 
If you are sure that certain files are not " \ + "incomplete, mark them as complete with 'sparv run --mark-complete '.\n" \ + "To re-generate the files instead, rerun your command with the --rerun-incomplete flag.\n" \ + "Incomplete files:\n" \ + f" • {incomplete_files}" + self.messages["error"].append((None, message)) + handled = True # Unhandled errors if not handled: @@ -555,7 +631,7 @@ def stop(self): # Stop bar self.progress.stop() - if self.verbose and self.bar_started: + if not self.simple and self.bar_started: # Clear table header from screen console.control(Control( ControlType.CARRIAGE_RETURN, @@ -563,7 +639,6 @@ def stop(self): )) self.finished = True - print() # Execution failed but we handled the error if self.handled_error: @@ -583,10 +658,25 @@ def stop(self): os.path.join(paths.log_dir, self.log_filename))) else: self.error("Job execution failed. See log messages above for details.") - # Defer to Snakemake's default log handler for unhandled errors + # Unhandled errors elif self.messages["unhandled_error"]: for error in self.messages["unhandled_error"]: - logger.text_handler(error) + errmsg = ["An unexpected error occurred."] + if self.log_level and logging._nameToLevel[self.log_level.upper()] > logging.DEBUG: + errmsg[0] += " To display further details about this error, rerun Sparv with the " \ + "'--log debug' argument.\n" + if "msg" in error: + error_lines = error["msg"].splitlines() + if " in line " in error_lines[0]: + errmsg.append(error_lines[0].split(" in line ")[0] + ":") + for line in error_lines[1:]: + if line.startswith(" File "): + break + errmsg.append(line) + else: + errmsg.append("") + errmsg.append(error.get("msg", "An unknown error occurred.")) + self.error("\n".join(errmsg)) else: spacer = "" if self.export_dirs: @@ -594,6 +684,19 @@ def stop(self): self.info("The exported files can be found in the following location{}:\n • {}".format( "s" if len(self.export_dirs) > 1 else "", "\n • ".join(sorted(self.export_dirs)))) + if self.stats_data: + spacer = "" + table = Table(show_header=False, box=box.SIMPLE) + table.add_column("Task", no_wrap=True, min_width=self.jobs_max_len + 2, ratio=1) + table.add_column("Time taken", no_wrap=True, width=10, justify="right", style="progress.remaining") + table.add_column("Percentage", no_wrap=True, justify="right") + table.add_row("[b]Task[/]", "[default b]Time taken[/]", "[b]Percentage[/b]") + total_time = sum(self.stats_data.values()) + for task, elapsed in sorted(self.stats_data.items(), key=lambda x: -x[1]): + table.add_row(task, str(timedelta(seconds=round(elapsed))), + "{:.1f}%".format(100 * elapsed / total_time)) + console.print(table) + if self.log_levelcount: # Errors or warnings were logged but execution finished anyway. Notify user of potential problems. 
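The new `--stats` flag works by adding each finished job's wall-clock time to `stats_data`, keyed by task name, and printing a summary table in `stop()`. Stripped of the rich table layout, the bookkeeping amounts to the sketch below (names mirror the diff; the plain-text printing is a simplification):

```python
import time
from collections import defaultdict
from datetime import timedelta

stats_data = defaultdict(float)  # task name -> accumulated elapsed seconds

def job_finished(name: str, starttime: float) -> None:
    """Accumulate elapsed time per task, as done when a 'job_finished' message arrives."""
    stats_data[name] += time.time() - starttime

def print_stats() -> None:
    """Print tasks sorted by time spent, with each task's share of the total."""
    if not stats_data:
        return
    total_time = sum(stats_data.values())
    for task, elapsed in sorted(stats_data.items(), key=lambda x: -x[1]):
        print(f"{task:<40} {timedelta(seconds=round(elapsed))!s:>10} {100 * elapsed / total_time:5.1f}%")
```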
problems = [] @@ -630,7 +733,7 @@ def cleanup(): pass -def setup_logging(log_server, log_level: str = "warning", log_file_level: str = "warning"): +def setup_logging(log_server, log_level: str = "warning", log_file_level: str = "warning", file=None, job=None): """Set up logging with socket handler.""" # Use the lowest log level, but never higher than warning log_level = min(logging.WARNING, getattr(logging, log_level.upper()), getattr(logging, log_file_level.upper())) @@ -638,3 +741,6 @@ def setup_logging(log_server, log_level: str = "warning", log_file_level: str = socket_logger.setLevel(log_level) socket_handler = logging.handlers.SocketHandler(*log_server) socket_logger.addHandler(socket_handler) + global current_file, current_job + current_file = file + current_job = job diff --git a/sparv/core/misc.py b/sparv/core/misc.py new file mode 100644 index 00000000..57f5e3d8 --- /dev/null +++ b/sparv/core/misc.py @@ -0,0 +1,31 @@ +"""Miscellaneous classes and methods.""" + +import logging + + +class SparvErrorMessage(Exception): + """Exception used to notify users of errors in a friendly way without displaying traceback.""" + + start_marker = "<<>>" + end_marker = "<<>>" + + def __init__(self, message, module="", function=""): + """Raise an error and notify user of the problem in a friendly way. + + Args: + message: Error message. + module: Name of module where error occurred (optional, not used in Sparv modules) + function: Name of function where error occurred (optional, not used in Sparv modules) + """ + self.message = message + # Alter message before calling base class + super().__init__("{}{}\n{}\n{}{}".format(SparvErrorMessage.start_marker, module, function, message, + SparvErrorMessage.end_marker)) + + +def get_logger(name): + """Get a logger that is a child of 'sparv.modules'.""" + if not name.startswith("sparv.modules"): + name = "sparv.modules." 
+ name + return logging.getLogger(name) + diff --git a/sparv/core/paths.py b/sparv/core/paths.py index 7d21d7ba..fdd3bec3 100644 --- a/sparv/core/paths.py +++ b/sparv/core/paths.py @@ -12,7 +12,7 @@ def read_sparv_config(): data = {} if sparv_config_file.is_file(): try: - with open(sparv_config_file) as f: + with open(sparv_config_file, encoding="utf-8") as f: data = yaml.load(f, Loader=yaml.FullLoader) except: data = {} @@ -66,8 +66,3 @@ def get_data_path(subpath: Union[str, Path] = "") -> Optional[Path]: source_dir = "source" export_dir = Path("export") config_file = "config.yaml" - -# CWB variables -cwb_encoding = os.environ.get("CWB_ENCODING", "utf8") -cwb_datadir = os.environ.get("CWB_DATADIR") -corpus_registry = os.environ.get("CORPUS_REGISTRY") diff --git a/sparv/core/preload.py b/sparv/core/preload.py index 1d1ad570..e518bb95 100644 --- a/sparv/core/preload.py +++ b/sparv/core/preload.py @@ -14,7 +14,7 @@ from sparv.core import config, log_handler from sparv.core.console import console from sparv.core.snake_utils import SnakeStorage -from sparv.util import SparvErrorMessage +from sparv.core.misc import SparvErrorMessage INFO = "INFO" STATUS = "STATUS" @@ -32,6 +32,7 @@ class Preloader: """Class representing a preloader.""" + def __init__(self, function, target, preloader, params, cleanup, shared): self.function = function self.target = target diff --git a/sparv/core/registry.py b/sparv/core/registry.py index 6485ac84..bf94377f 100644 --- a/sparv/core/registry.py +++ b/sparv/core/registry.py @@ -9,12 +9,13 @@ import iso639 import typing_inspect -from pkg_resources import iter_entry_points from sparv.core import config as sparv_config from sparv.core import paths -from sparv.util.classes import (BaseOutput, Config, ExportAnnotations, ExportAnnotationsAllDocs, SourceStructureParser, - ModelOutput, Wildcard) +from sparv.core.console import console +from sparv.core.misc import SparvErrorMessage +from sparv.api.classes import (BaseOutput, Config, Export, ExportAnnotations, ExportAnnotationsAllSourceFiles, + SourceAnnotations, SourceStructureParser, ModelOutput, Wildcard) modules_path = ".".join(("sparv", paths.modules_dir)) core_modules_path = ".".join(("sparv", paths.core_modules_dir)) @@ -87,6 +88,8 @@ def find_modules(no_import: bool = False, find_custom: bool = False) -> list: Returns: A list of available module names. 
""" + from pkg_resources import iter_entry_points, VersionConflict + modules_full_path = paths.sparv_path / paths.modules_dir core_modules_full_path = paths.sparv_path / paths.core_modules_dir @@ -100,21 +103,34 @@ def find_modules(no_import: bool = False, find_custom: bool = False) -> list: add_module_metadata(m, module.name) if find_custom: + custom_annotators = [a.get("annotator", "").split(":")[0] for a in sparv_config.get("custom_annotations", [])] # Also search for modules in corpus dir custom_modules = pkgutil.iter_modules([str(paths.corpus_dir)]) for module in custom_modules: module_name = f"{custom_name}.{module.name}" + # Skip modules in corpus dir if they are not used in the corpus config + if module_name not in custom_annotators: + continue module_names.append(module_name) if not no_import: module_path = paths.corpus_dir.resolve() / f"{module.name}.py" spec = importlib.util.spec_from_file_location(module_name, module_path) m = importlib.util.module_from_spec(spec) + try: + spec.loader.exec_module(m) + except Exception as e: + raise SparvErrorMessage(f"Module '{module_name}' cannot be imported due to an error in file " + f"'{module_path}': {e}") add_module_metadata(m, module_name) - spec.loader.exec_module(m) # Search for installed plugins for entry_point in iter_entry_points("sparv.plugin"): - m = entry_point.load() + try: + m = entry_point.load() + except VersionConflict as e: + console.print(f"[red]:warning-emoji: The plugin {entry_point.dist} could not be loaded. " + f"It requires {e.req}, but the current installed version is {e.dist}.\n") + continue add_module_metadata(m, entry_point.name) module_names.append(entry_point.name) @@ -157,7 +173,7 @@ def _get_module_name(module_string: str) -> str: def _annotator(description: str, a_type: Annotator, name: Optional[str] = None, file_extension: Optional[str] = None, - outputs=(), document_annotation=None, structure=None, language: Optional[List[str]] = None, + outputs=(), text_annotation=None, structure=None, language: Optional[List[str]] = None, config: Optional[List[Config]] = None, order: Optional[int] = None, abstract: bool = False, wildcards: Optional[List[Wildcard]] = None, preloader: Optional[Callable] = None, preloader_params: Optional[List[str]] = None, preloader_target: Optional[str] = None, @@ -174,7 +190,7 @@ def decorator(f): "type": a_type, "file_extension": file_extension, "outputs": outputs, - "document_annotation": document_annotation, + "text_annotation": text_annotation, "structure": structure, "language": language, "config": config, @@ -205,7 +221,7 @@ def annotator(description: str, name: Optional[str] = None, language: Optional[L def importer(description: str, file_extension: str, name: Optional[str] = None, outputs=None, - document_annotation: Optional[str] = None, structure: Optional[Type[SourceStructureParser]] = None, + text_annotation: Optional[str] = None, structure: Optional[Type[SourceStructureParser]] = None, config: Optional[List[Config]] = None): """Return a decorator for importer functions. @@ -217,16 +233,16 @@ def importer(description: str, file_extension: str, name: Optional[str] = None, May also be a Config instance referring to such a list. It may generate more outputs than listed, but only the annotations listed here will be available to use as input for annotator functions. - document_annotation: An annotation from 'outputs' that should be used as the value for the - import.document_annotation config variable, unless it or classes.text has been set manually. 
- structure: A class used to parse and return the structure of source documents. + text_annotation: An annotation from 'outputs' that should be used as the value for the + import.text_annotation config variable, unless it or classes.text has been set manually. + structure: A class used to parse and return the structure of source files. config: List of Config instances defining config options for the importer. Returns: A decorator """ return _annotator(description=description, a_type=Annotator.importer, name=name, file_extension=file_extension, - outputs=outputs, document_annotation=document_annotation, structure=structure, config=config) + outputs=outputs, text_annotation=text_annotation, structure=structure, config=config) def exporter(description: str, name: Optional[str] = None, config: Optional[List[Config]] = None, @@ -271,9 +287,16 @@ def _add_to_registry(annotator): # Add to set of supported languages... for lang in annotator["language"]: if lang not in languages: - languages[lang] = iso639.languages.get(part3=lang).name if lang in iso639.languages.part3 else lang + langcode, _, suffix = lang.partition("-") + if suffix: + suffix = f" ({suffix})" + if langcode in iso639.languages.part3: + languages[lang] = iso639.languages.get(part3=langcode).name + suffix + else: + languages[lang] = lang # ... but skip annotators for other languages than the one specified in the config - if sparv_config.get("metadata.language") and sparv_config.get("metadata.language") not in annotator["language"]: + if sparv_config.get("metadata.language") and not check_language( + sparv_config.get("metadata.language"), annotator["language"], sparv_config.get("metadata.variety")): return # Add config variables to config @@ -281,13 +304,13 @@ def _add_to_registry(annotator): for c in annotator["config"]: handle_config(c, module_name, rule_name) - # Handle document annotation for selected importer + # Handle text annotation for selected importer if annotator["type"] == Annotator.importer and rule_name == sparv_config.get("import.importer"): - if annotator["document_annotation"] and not sparv_config.get("classes.text"): - sparv_config.set_value("import.document_annotation", annotator["document_annotation"]) - sparv_config.handle_document_annotation() + if annotator["text_annotation"] and not sparv_config.get("classes.text"): + sparv_config.set_value("import.text_annotation", annotator["text_annotation"]) + sparv_config.handle_text_annotation() - for param, val in inspect.signature(annotator["function"]).parameters.items(): + for _param, val in inspect.signature(annotator["function"]).parameters.items(): if isinstance(val.default, BaseOutput): ann = val.default cls = val.default.cls @@ -296,12 +319,12 @@ def _add_to_registry(annotator): # Make sure annotation names include module names as prefix if not attr: if not ann_name.startswith(module_name + "."): - raise ValueError("Output annotation '{}' in module '{}' doesn't include module " - "name as prefix.".format(ann_name, module_name)) + raise SparvErrorMessage(f"Output annotation '{ann_name}' in module '{module_name}' doesn't include " + "module name as prefix.") else: if not attr.startswith(module_name + "."): - raise ValueError("Output annotation '{}' in module '{}' doesn't include module " - "name as prefix in attribute.".format(ann, module_name)) + raise SparvErrorMessage(f"Output annotation '{ann}' in module '{module_name}' doesn't include " + "module name as prefix in attribute.") # Add to class registry if cls: @@ -326,20 +349,30 @@ def 
_add_to_registry(annotator): # Only add classes for relevant languages if not annotator["language"] or ( - annotator["language"] and sparv_config.get("metadata.language") in annotator["language"]): + annotator["language"] and sparv_config.get("metadata.language") + and check_language(sparv_config.get("metadata.language"), annotator["language"], + sparv_config.get("metadata.variety"))): if cls_target not in annotation_classes["module_classes"][cls]: annotation_classes["module_classes"][cls].append(cls_target) elif isinstance(val.default, ModelOutput): modeldir = val.default.name.split("/")[0] if not modeldir.startswith(module_name): - raise ValueError("Output model '{}' in module '{}' doesn't include module " - "name as sub directory.".format(val.default, module_name)) + raise SparvErrorMessage(f"Output model '{val.default}' in module '{module_name}' doesn't include module" + " name as sub directory.") elif isinstance(val.default, Config): sparv_config.add_config_usage(val.default.name, rule_name) - elif isinstance(val.default, (ExportAnnotations, ExportAnnotationsAllDocs)): + elif isinstance(val.default, (ExportAnnotations, ExportAnnotationsAllSourceFiles, SourceAnnotations)): sparv_config.add_config_usage(val.default.config_name, rule_name) annotation_sources.add(val.default.config_name) + elif isinstance(val.default, Export): + if "/" not in val.default: + raise SparvErrorMessage(f"Illegal export path for export '{val.default}' in module '{module_name}'. " + "A subdirectory must be used.") + export_dir = val.default.split("/")[0] + if not (export_dir.startswith(module_name + ".") or export_dir == module_name): + raise SparvErrorMessage(f"Illegal export path for export '{val.default}' in module '{module_name}'. " + "The export subdirectory must include the module name as prefix.") if module_name not in modules: modules[module_name] = Module(module_name) @@ -354,7 +387,6 @@ def _add_to_registry(annotator): def find_implicit_classes() -> None: """Figure out implicitly defined classes from annotation usage.""" - annotation_to_class = defaultdict(set) for class_source in ("module_classes", "config_classes"): for cls, anns in annotation_classes[class_source].items(): @@ -373,8 +405,8 @@ def find_implicit_classes() -> None: def handle_config(cfg, module_name, rule_name: Optional[str] = None) -> None: """Handle Config instances.""" if not cfg.name.startswith(module_name + "."): - raise ValueError("Config option '{}' in module '{}' doesn't include module " - "name as prefix.".format(cfg.name, module_name)) + raise SparvErrorMessage(f"Config option '{cfg.name}' in module '{module_name}' doesn't include module " + "name as prefix.") # Check that config variable hasn't already been declared prev = sparv_config.config_structure for k in cfg.name.split("."): @@ -382,12 +414,13 @@ def handle_config(cfg, module_name, rule_name: Optional[str] = None) -> None: break prev = prev[k] else: - raise Exception(f"The config variable '{cfg.name}' in '{rule_name or module_name}' has already been declared.") + raise SparvErrorMessage( + f"The config variable '{cfg.name}' in '{rule_name or module_name}' has already been declared.") if cfg.default is not None: sparv_config.set_default(cfg.name, cfg.default) sparv_config.add_to_structure(cfg.name, cfg.default, description=cfg.description, annotator=rule_name) if not cfg.description: - raise Exception(f"Missing description for configuration key '{cfg.name}' in module '{module_name}'.") + raise SparvErrorMessage(f"Missing description for configuration key '{cfg.name}' 
in module '{module_name}'.") def _expand_class(cls): @@ -515,3 +548,13 @@ def get_type_hint_type(type_hint): type_ = type_hint return type_, is_list, optional + + +def check_language(corpus_lang: str, langs: List[str], corpus_lang_suffix: Optional[str] = None) -> bool: + """Check if corpus language is among a list of languages. + + Any suffix on corpus_lang will be ignored. + """ + if corpus_lang_suffix: + corpus_lang = corpus_lang + "-" + corpus_lang_suffix + return corpus_lang in langs or corpus_lang.split("-")[0] in langs diff --git a/sparv/core/run.py b/sparv/core/run.py index f8c21f64..b8b6297f 100644 --- a/sparv/core/run.py +++ b/sparv/core/run.py @@ -7,7 +7,7 @@ import sys from sparv.core import log_handler, paths, registry -from sparv.util.classes import Annotation, AnnotationData, Config, Document, Output, OutputData +from sparv.api.classes import Annotation, AnnotationData, Config, SourceFilename, Output, OutputData def main(argv=None, log_level: str = "info"): @@ -43,7 +43,7 @@ def main(argv=None, log_level: str = "info"): subparsers = parser.add_subparsers(dest="_annotator", help="Annotator function") subparsers.required = True - needs_doc_types = (Annotation, AnnotationData, Output, OutputData) # Types that need a doc value + needs_source_types = (Annotation, AnnotationData, Output, OutputData) # Types that need a source file value for f_name in registry.modules[module_name].functions: annotator = registry.modules[module_name].functions[f_name] @@ -52,8 +52,8 @@ def main(argv=None, log_level: str = "info"): help=annotator["description"]) subparser.set_defaults(f_=f) required_args = subparser.add_argument_group("required named arguments") - needs_doc = False - has_doc = False + needs_source = False + has_source = False for parameter in inspect.signature(f).parameters.items(): param_ann = parameter[1].annotation param_default = parameter[1].default @@ -63,10 +63,10 @@ def main(argv=None, log_level: str = "info"): # arg_type = arg_type if arg_type in (str, int, bool) else None else: arg_type = None - if arg_type in needs_doc_types: - needs_doc = True - if arg_type == Document: - has_doc = True + if arg_type in needs_source_types: + needs_source = True + if arg_type == SourceFilename: + has_source = True required = param_default == inspect.Parameter.empty f_args = {"type": arg_type} if not required: @@ -89,23 +89,23 @@ def main(argv=None, log_level: str = "info"): else: subparser.add_argument("--" + parameter[0], help=" ", **f_args) - subparser.set_defaults(has_doc_=has_doc) - if not has_doc and needs_doc: - required_args.add_argument("--doc", required=True, type=str) + subparser.set_defaults(has_source_=has_source) + if not has_source and needs_source: + required_args.add_argument("--source_file", required=True, type=str) args = parser.parse_args(rest_args) arguments = {} - doc = args.doc if "doc" in args else None - has_doc = args.has_doc_ if "has_doc_" in args else False + source_file = args.source_file if "source_file" in args else None + has_source = args.has_source_ if "has_source_" in args else False for k, v in vars(args).items(): - if k in ("f_", "_annotator", "has_doc_"): + if k in ("f_", "_annotator", "has_source_"): continue - if not has_doc and k in "doc": + if not has_source and k in "source_file": continue - # Add doc value if the type requires it - if type(v) in needs_doc_types: - v.doc = doc + # Add source value if the type requires it + if type(v) in needs_source_types: + v.source_file = source_file arguments[k] = v args.f_(**arguments) diff --git 
a/sparv/core/run_snake.py b/sparv/core/run_snake.py index 8ecef06a..db338170 100644 --- a/sparv/core/run_snake.py +++ b/sparv/core/run_snake.py @@ -3,12 +3,13 @@ import importlib.util import logging import sys +import traceback from pkg_resources import iter_entry_points -from sparv.core import log_handler, paths +from sparv.core import io, log_handler, paths from sparv.core import registry -from sparv.util import SparvErrorMessage +from sparv.core.misc import SparvErrorMessage custom_name = "custom" plugin_name = "plugin" @@ -24,8 +25,8 @@ def exit_with_error_message(message, logger_name): """Log error message and exit with non-zero status.""" error_logger = logging.getLogger(logger_name) - if snakemake.params.doc: - message += f"\n\n(document: {snakemake.params.doc})" + if snakemake.params.source_file: + message += f"\n\n(file: {snakemake.params.source_file})" error_logger.error(message) sys.exit(123) @@ -40,6 +41,14 @@ def __init__(self, logger, log_level=logging.INFO): def write(self, buf): self.logger.log(self.log_level, buf.rstrip()) + @staticmethod + def isatty(): + return False + + +# Set compression +if snakemake.params.compression: + io.compression = snakemake.params.compression # Import module modules_path = ".".join(("sparv", paths.modules_dir)) @@ -63,7 +72,7 @@ def write(self, buf): preload.send_data(sock, preload.PING) response = preload.receive_data(sock) # Timeouts if busy sock.settimeout(None) - except (BlockingIOError, socket.timeout) as e: + except (BlockingIOError, socket.timeout): use_preloader = False preloader_busy = True if sock is not None: @@ -98,7 +107,9 @@ def write(self, buf): log_handler.setup_logging(snakemake.config["log_server"], log_level=snakemake.config["log_level"], - log_file_level=snakemake.config["log_file_level"]) + log_file_level=snakemake.config["log_file_level"], + file=snakemake.params.source_file, + job=f"{snakemake.params.module_name}:{snakemake.params.f_name}") logger = logging.getLogger("sparv") logger.info("RUN: %s:%s(%s)", module_name, f_name, ", ".join("%s=%s" % (i[0], repr(i[1])) for i in list(parameters.items()))) @@ -120,11 +131,16 @@ def write(self, buf): logger.export_dirs(snakemake.params.export_dirs) except SparvErrorMessage as e: # Any exception raised here would be printed directly to the terminal, due to how Snakemake runs the script. - # Instead we log the error message and exit with a non-zero status to signal to Snakemake that + # Instead, we log the error message and exit with a non-zero status to signal to Snakemake that # something went wrong. exit_with_error_message(e.message, "sparv.modules." + module_name) except Exception as e: - logger.exception("An error occurred while executing:") + errmsg = f"An error occurred while executing {module_name}:{f_name}:" + if logger.level > logging.DEBUG: + errmsg += f"\n\n {type(e).__name__}: {e}\n\n" \ + "To display further details when errors occur, run Sparv with the '--log debug' argument." 
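Before the annotator runs, each Snakemake job now copies `snakemake.params.compression` into `io.compression`, and `open_annotation_file` in the io changes above picks an opener from a `_compressed_open` table based on that value. The table itself is not part of this diff, so the sketch below assumes `gzip`, `bzip2` and `lzma` as its keys, with anything else falling back to the built-in `open()`:

```python
import bz2
import gzip
import lzma

compression = "gzip"  # module-level setting; run_snake.py overwrites it from snakemake.params.compression

# Assumed contents of io._compressed_open; the keys are illustrative, not taken from the diff.
_compressed_open = {
    "gzip": gzip.open,
    "bzip2": bz2.open,
    "lzma": lzma.open,
}

def open_annotation_file(filename, mode="rt", encoding=None, errors=None, newline=None):
    """Open an annotation or data file with the configured compression (mirrors the diff above)."""
    if mode in "rwxa":
        mode += "t"  # gzip/bz2/lzma default to binary mode, so make text mode the default like open()
    opener = _compressed_open.get(compression, open)
    return opener(filename, mode=mode, encoding=encoding, errors=errors, newline=newline)
```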
+ logger.error(errmsg) + logger.debug(traceback.format_exc()) sys.exit(123) finally: # Restore printing to stdout and stderr diff --git a/sparv/core/setup.py b/sparv/core/setup.py index 6b29c5c3..d68b434b 100644 --- a/sparv/core/setup.py +++ b/sparv/core/setup.py @@ -8,7 +8,6 @@ import appdirs import pkg_resources -import yaml from rich.padding import Padding from rich.prompt import Confirm @@ -28,7 +27,7 @@ def check_sparv_version() -> Optional[bool]: data_dir = paths.get_data_path() version_file = (data_dir / VERSION_FILE) if version_file.is_file(): - return version_file.read_text() == __version__ + return version_file.read_text(encoding="utf-8") == __version__ return None @@ -52,6 +51,26 @@ def copy_resource_files(data_dir: pathlib.Path): shutil.copy(f, data_dir / rel_f) +def reset(): + """Remove the data dir config file.""" + if paths.sparv_config_file.is_file(): + data_dir = paths.read_sparv_config().get("sparv_data") + try: + # Delete config file + paths.sparv_config_file.unlink() + # Delete config dir if empty + if not any(paths.sparv_config_file.parent.iterdir()): + paths.sparv_config_file.parent.rmdir() + except: + console.print("An error occurred while trying to reset the configuration.") + sys.exit(1) + console.print("Sparv's data directory information has been reset.") + if data_dir and pathlib.Path(data_dir).is_dir(): + console.print(f"The data directory itself has not been removed, and is still available at:\n{data_dir}") + else: + console.print("Nothing to reset.") + + def run(sparv_datadir: Optional[str] = None): """Query user about data dir path unless provided by argument, and populate path with files.""" default_dir = pathlib.Path(appdirs.user_data_dir("sparv")) @@ -134,12 +153,12 @@ def run(sparv_datadir: Optional[str] = None): } paths.sparv_config_file.parent.mkdir(parents=True, exist_ok=True) - with open(paths.sparv_config_file, "w") as f: + with open(paths.sparv_config_file, "w", encoding="utf-8") as f: f.write(config.dump_config(config_dict)) copy_resource_files(path) # Save Sparv version number to a file in data dir - (path / VERSION_FILE).write_text(__version__) + (path / VERSION_FILE).write_text(__version__, encoding="utf-8") console.print(f"\nSetup completed. 
The Sparv data directory is set to '{path}'.") diff --git a/sparv/core/snake_prints.py b/sparv/core/snake_prints.py index 6d6fe9d5..1dd3efcb 100644 --- a/sparv/core/snake_prints.py +++ b/sparv/core/snake_prints.py @@ -209,10 +209,21 @@ def print_languages(): """Print all supported languages.""" print() table = Table(title="Supported languages", box=box.SIMPLE, show_header=False, title_justify="left") - for language, name in sorted(registry.languages.items(), key=lambda x: x[1]): + full_langs = dict((k, v) for k, v in registry.languages.items() if "-" not in k) + for language, name in sorted(full_langs.items(), key=lambda x: x[1]): table.add_row(name, language) console.print(table) + sub_langs = dict((k, v) for k, v in registry.languages.items() if "-" in k) + if sub_langs: + print() + table = Table(title="Supported language varieties", box=box.SIMPLE, show_header=False, title_justify="left") + table.add_row("[b]Name[/b]", "[b]Language[/b]", "[b]Variety[/b]") + for language, name in sorted(sub_langs.items(), key=lambda x: x[1]): + lang, _, sublang = language.partition("-") + table.add_row(name, lang, sublang) + console.print(table) + def get_custom_module_description(name): """Return string with description for custom modules.""" diff --git a/sparv/core/snake_utils.py b/sparv/core/snake_utils.py index 50da015f..1ab4cd1b 100644 --- a/sparv/core/snake_utils.py +++ b/sparv/core/snake_utils.py @@ -11,14 +11,14 @@ import snakemake from snakemake.io import expand -from sparv import util +from sparv.api import util, SparvErrorMessage from sparv.core import config as sparv_config from sparv.core import io, log_handler, paths, registry from sparv.core.console import console -from sparv.util.classes import (AllDocuments, Annotation, AnnotationAllDocs, AnnotationData, Base, BaseAnnotation, - BaseOutput, Binary, BinaryDir, Config, Corpus, Document, Export, ExportAnnotations, - ExportAnnotationsAllDocs, ExportInput, Language, Model, ModelOutput, Output, OutputData, - Source, SourceAnnotations, Text) +from sparv.api.classes import (AllSourceFilenames, Annotation, AnnotationAllSourceFiles, AnnotationData, Base, BaseAnnotation, + BaseOutput, Binary, BinaryDir, Config, Corpus, SourceFilename, Export, ExportAnnotations, + ExportAnnotationsAllSourceFiles, ExportInput, Language, Model, ModelOutput, Output, OutputData, + Source, SourceAnnotations, SourceAnnotationsAllSourceFiles, Text) class SnakeStorage: @@ -44,11 +44,41 @@ def __init__(self): self.model_outputs = [] # Outputs from modelbuilders, used in build_models self.install_outputs = defaultdict(list) # Outputs from all installers, used in rule install_corpus - self.source_files = [] # List which will contain all source files self.all_rules: List[RuleStorage] = [] # List containing all rules created self.ordered_rules = [] # List of rules containing rule order self.preloader_info = {} + self._source_files = None # Auxiliary variable for the source_files property + + @property + def source_files(self) -> List[str]: + """Get list of all available source files.""" + if self._source_files is None: + if not sparv_config.get("import.importer"): + raise SparvErrorMessage("The config variable 'import.importer' must not be empty.", "sparv") + try: + importer_module, _, importer_function = sparv_config.get("import.importer").partition(":") + file_extension = "." + registry.modules[importer_module].functions[importer_function]["file_extension"] + except KeyError: + raise SparvErrorMessage( + "Could not find the importer '{}'. 
Make sure the 'import.importer' config value refers to an " + "existing importer.".format(sparv_config.get("import.importer")), "sparv") + # Collect files in source dir + sf = [f for f in snakemake.utils.listfiles(Path(get_source_path(), "{file}"))] + self._source_files = [f[1][0][:-len(file_extension)] for f in sf if f[1][0].endswith(file_extension)] + # Collect files that don't match the file extension provided by the corpus config + wrong_ext = [f[1][0] for f in sf if not f[1][0].endswith(file_extension) and not Path(f[0]).is_dir()] + if wrong_ext: + console.print("[yellow]\nThere {} file{} in your source directory that do{} not match the file " + "extension '{}' in the corpus config: {}{} will not be processed.\n[/yellow]".format( + "is one" if len(wrong_ext) == 1 else "are", + "" if len(wrong_ext) == 1 else "s", + "es" if len(wrong_ext) == 1 else "", + file_extension, + f"'{wrong_ext[0]}'" if len(wrong_ext) == 1 else "\n • " + "\n • ".join(wrong_ext), + ". This file" if len(wrong_ext) == 1 else "\nThese files"), highlight=False) + return self._source_files + class RuleStorage: """Object to store parameters for a snake rule.""" @@ -64,8 +94,8 @@ def __init__(self, module_name, f_name, annotator_info): self.inputs = [] self.outputs = [] self.parameters = {} - self.docs = [] # List of parameters referring to Document - self.doc_annotations = [] # List of parameters containing the {doc} wildcard + self.file_parameters = [] # List of parameters referring to SourceFilename + self.file_annotations = [] # List of parameters containing the {file} wildcard self.wildcard_annotations = [] # List of parameters containing other wildcards self.configs = set() # Set of config variables used self.classes = set() # Set of classes used @@ -109,7 +139,8 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m # Skip any annotator that is not available for the selected corpus language if rule.annotator_info["language"] and sparv_config.get("metadata.language") and \ - sparv_config.get("metadata.language") not in rule.annotator_info["language"]: + not registry.check_language(sparv_config.get("metadata.language"), rule.annotator_info["language"], + sparv_config.get("metadata.variety")): return False # Get this function's parameters @@ -117,13 +148,14 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m param_dict = make_param_dict(params) if rule.importer: - rule.inputs.append(Path(get_source_path(), "{doc}." + rule.file_extension)) + rule.inputs.append(Path(get_source_path(), "{file}." 
+ rule.file_extension)) storage.all_importers.setdefault(rule.module_name, {}).setdefault(rule.f_name, {"description": rule.description, "params": param_dict}) if rule.target_name == sparv_config.get("import.importer"): - # Exports always generate corpus text file - rule.outputs.append(paths.work_dir / "{doc}" / io.TEXT_FILE) + # Imports always generate corpus text file and structure file + rule.outputs.append(paths.work_dir / "{file}" / io.TEXT_FILE) + rule.outputs.append(paths.work_dir / "{file}" / io.STRUCTURE_FILE) # If importer guarantees other outputs, add them to outputs list if rule.import_outputs: if isinstance(rule.import_outputs, Config): @@ -131,7 +163,7 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m annotations_ = set() renames = {} # Annotation list needs to be sorted to handle plain annotations before attributes - for ann, target in sorted(util.parse_annotation_list(rule.import_outputs)): + for ann, target in sorted(util.misc.parse_annotation_list(rule.import_outputs)): # Handle annotations renamed during import if target: source_ann, source_attr = BaseAnnotation(ann).split() @@ -145,11 +177,11 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m for element in annotations_: rule.outputs.append(paths.work_dir / get_annotation_path(element)) - # If import.document_annotation has been specified, add it to outputs if not already there - if sparv_config.get("import.document_annotation"): - doc_ann_file = paths.work_dir / get_annotation_path(sparv_config.get("import.document_annotation")) - if doc_ann_file not in rule.outputs: - rule.outputs.append(doc_ann_file) + # If import.text_annotation has been specified, add it to outputs if not already there + if sparv_config.get("import.text_annotation"): + text_ann_file = paths.work_dir / get_annotation_path(sparv_config.get("import.text_annotation")) + if text_ann_file not in rule.outputs: + rule.outputs.append(text_ann_file) if rule.exporter: storage.all_exporters.setdefault(rule.module_name, {}).setdefault(rule.f_name, @@ -172,7 +204,7 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m # This should be either a utility annotator or a custom annotator supplied by the user if not (rule.module_name == registry.custom_name or storage.all_custom_annotators.get(rule.module_name, {}).get(rule.f_name)): - raise util.SparvErrorMessage( + raise SparvErrorMessage( "The custom annotation for annotator '{}' is using 'params' which is not allowed with this type of " "annotator. 
Use 'config' instead.".format(custom_rule_obj["annotator"])) name_custom_rule(rule, storage) @@ -183,15 +215,15 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m try: custom_suffix = custom_rule_obj["suffix"] except KeyError: - raise util.SparvErrorMessage("The custom annotation for annotator '{}' is missing the required key " - "'suffix'.".format(custom_rule_obj["annotator"])) + raise SparvErrorMessage("The custom annotation for annotator '{}' is missing the required key " + "'suffix'.".format(custom_rule_obj["annotator"])) sparv_config.config = sparv_config._merge_dicts(copy.deepcopy(custom_rule_obj["config"]), sparv_config.config) else: # This is a custom rule which doesn't require any parameters, so it has already been processed return False - # Go though function parameters and handle based on type + # Go through function parameters and handle based on type for param_name, param in params.items(): param_default_empty = param.default == inspect.Parameter.empty param_value: Any @@ -204,7 +236,7 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m elif not param_default_empty: param_value = copy.deepcopy(param.default) else: - raise util.SparvErrorMessage( + raise SparvErrorMessage( f"Parameter '{param_name}' in custom rule '{rule.full_name}' has no value!", "sparv", "config") else: if param_default_empty: @@ -235,9 +267,9 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m missing_configs = param_value.expand_variables(rule.full_name) rule.missing_config.update(missing_configs) ann_path = get_annotation_path(param_value, data=param_type.data, common=param_type.common) - if param_type.all_docs: + if param_type.all_files: rule.outputs.extend(map(Path, expand(escape_wildcards(paths.work_dir / ann_path), - doc=get_source_files(storage.source_files)))) + file=storage.source_files))) elif param_type.common: rule.outputs.append(paths.work_dir / ann_path) if rule.installer: @@ -277,29 +309,30 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m continue rule.missing_config.update(missing_configs) ann_path = get_annotation_path(param_value, data=param_type.data, common=param_type.common) - if param_type.all_docs: - rule.inputs.extend(expand(escape_wildcards(paths.work_dir / ann_path), - doc=get_source_files(storage.source_files))) - elif rule.exporter or rule.installer or param_type.common: - rule.inputs.append(paths.work_dir / ann_path) - else: - rule.inputs.append(ann_path) + if param_value.is_input: + if param_type.all_files: + rule.inputs.extend(expand(escape_wildcards(paths.work_dir / ann_path), + file=storage.source_files)) + elif rule.exporter or rule.installer or param_type.common: + rule.inputs.append(paths.work_dir / ann_path) + else: + rule.inputs.append(ann_path) rule.parameters[param_name] = param_value if "{" in param_value: rule.wildcard_annotations.append(param_name) # ExportAnnotations - elif param_type in (ExportAnnotations, ExportAnnotationsAllDocs): + elif param_type in (ExportAnnotations, ExportAnnotationsAllSourceFiles): if not isinstance(param_value, param_type): param_value = param_type(param_value) rule.parameters[param_name] = param_value source = param.default.config_name - annotations = sparv_config.get(f"{source}", []) + annotations = sparv_config.get(source, []) if not annotations: - rule.missing_config.add(f"{source}") - export_annotations = util.parse_annotation_list(annotations, add_plain_annotations=False) - annotation_type = Annotation 
if param_type == ExportAnnotations else AnnotationAllDocs + rule.missing_config.add(source) + export_annotations = util.misc.parse_annotation_list(annotations, add_plain_annotations=False) + annotation_type = Annotation if param_type == ExportAnnotations else AnnotationAllSourceFiles plain_annotations = set() possible_plain_annotations = [] for i, (export_annotation_name, export_name) in enumerate(export_annotations): @@ -321,31 +354,37 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m for annotation, export_name in export_annotations: if param.default.is_input: - if param_type == ExportAnnotationsAllDocs: + if param_type == ExportAnnotationsAllSourceFiles: rule.inputs.extend( expand(escape_wildcards(paths.work_dir / get_annotation_path(annotation.name)), - doc=get_source_files(storage.source_files))) + file=storage.source_files)) else: rule.inputs.append(paths.work_dir / get_annotation_path(annotation.name)) rule.parameters[param_name].append((annotation, export_name)) # SourceAnnotations - elif param_type == SourceAnnotations: - rule.parameters[param_name] = sparv_config.get(f"{param.default.config_name}", None) + elif param_type in (SourceAnnotations, SourceAnnotationsAllSourceFiles): + rule.parameters[param_name] = sparv_config.get(param.default.config_name, None) + if param_type == SourceAnnotationsAllSourceFiles: + rule.inputs.extend( + expand(escape_wildcards(paths.work_dir / get_annotation_path(io.STRUCTURE_FILE, data=True)), + file=storage.source_files)) + else: + rule.inputs.append(paths.work_dir / get_annotation_path(io.STRUCTURE_FILE, data=True)) # Corpus elif param.annotation == Corpus: rule.parameters[param_name] = Corpus(sparv_config.get("metadata.id")) # Language elif param.annotation == Language: rule.parameters[param_name] = Language(sparv_config.get("metadata.language")) - # Document - elif param.annotation == Document: - rule.docs.append(param_name) - # AllDocuments (all source documents) - elif param_type == AllDocuments: - rule.parameters[param_name] = AllDocuments(get_source_files(storage.source_files)) + # SourceFilename + elif param.annotation == SourceFilename: + rule.file_parameters.append(param_name) + # AllSourceFilenames (all source filenames) + elif param_type == AllSourceFilenames: + rule.parameters[param_name] = AllSourceFilenames(storage.source_files) # Text elif param_type == Text: - text_path = Path("{doc}") / io.TEXT_FILE + text_path = Path("{file}") / io.TEXT_FILE if rule.exporter or rule.installer: rule.inputs.append(paths.work_dir / text_path) else: @@ -375,7 +414,7 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m rule.classes.update(registry.find_classes(param.default)) param_value, missing_configs = registry.expand_variables(param.default, rule.full_name) rule.missing_config.update(missing_configs) - binary = util.find_binary(param_value, executable=False, allow_dir=param.annotation == BinaryDir) + binary = util.system.find_binary(param_value, executable=False, allow_dir=param.annotation == BinaryDir) if not binary: rule.missing_binaries.add(param_value) binary = Path(binary if binary else param_value) @@ -390,15 +429,12 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m rule.classes.update(registry.find_classes(param.default)) param_value, missing_configs = registry.expand_variables(param.default, rule.full_name) rule.missing_config.update(missing_configs) - if param.default.absolute_path: - export_path = Path(param_value) - else: - export_path = 
paths.export_dir / param_value + export_path = paths.export_dir / param_value output_dirs.add(export_path.parent) rule.outputs.append(export_path) rule.parameters[param_name] = Export(str(export_path)) - if "{doc}" in rule.parameters[param_name]: - rule.doc_annotations.append(param_name) + if "{file}" in rule.parameters[param_name]: + rule.file_annotations.append(param_name) if "{" in param_value: rule.wildcard_annotations.append(param_name) # ExportInput @@ -407,13 +443,10 @@ def rule_helper(rule: RuleStorage, config: dict, storage: SnakeStorage, config_m rule.classes.update(registry.find_classes(param.default)) param_value, missing_configs = registry.expand_variables(param.default, rule.full_name) rule.missing_config.update(missing_configs) - if param.default.absolute_path: - rule.parameters[param_name] = ExportInput(param_value) - else: - rule.parameters[param_name] = ExportInput(paths.export_dir / param_value) - if param.default.all_docs: + rule.parameters[param_name] = ExportInput(paths.export_dir / param_value) + if param.default.all_files: rule.inputs.extend(expand(escape_wildcards(rule.parameters[param_name]), - doc=get_source_files(storage.source_files))) + file=storage.source_files)) else: rule.inputs.append(Path(rule.parameters[param_name])) if "{" in rule.parameters[param_name]: @@ -536,39 +569,39 @@ def check_ruleorder(storage: SnakeStorage) -> Set[Tuple[RuleStorage, RuleStorage return ordered_rules -def doc_value(rule_params): - """Get doc name for use as parameter to rule.""" - def _doc_value(wildcards): - return get_doc_value(wildcards, rule_params.annotator) - return _doc_value +def file_value(rule_params): + """Get source filename for use as parameter to rule.""" + def _file_value(wildcards): + return get_file_value(wildcards, rule_params.annotator) + return _file_value def get_parameters(rule_params): - """Extend function parameters with doc names and replace wildcards.""" + """Extend function parameters with source filenames and replace wildcards.""" def get_params(wildcards): - doc = get_doc_value(wildcards, rule_params.annotator) - # We need to make a copy of the parameters, since the rule might be used for multiple documents + file = get_file_value(wildcards, rule_params.annotator) + # We need to make a copy of the parameters, since the rule might be used for multiple source files _parameters = copy.deepcopy(rule_params.parameters) - _parameters.update({name: Document(doc) for name in rule_params.docs}) + _parameters.update({name: SourceFilename(file) for name in rule_params.file_parameters}) - # Add document name to annotation and output parameters + # Add source filename to annotation and output parameters for param in _parameters: if isinstance(_parameters[param], (Annotation, AnnotationData, Output, OutputData, Text)): - _parameters[param].doc = doc + _parameters[param].source_file = file if isinstance(_parameters[param], ExportAnnotations): for a in _parameters[param]: - a[0].doc = doc + a[0].source_file = file - # Replace {doc} wildcard in parameters - for name in rule_params.doc_annotations: + # Replace {file} wildcard in parameters + for name in rule_params.file_annotations: if isinstance(_parameters[name], Base): - _parameters[name].name = _parameters[name].name.replace("{doc}", doc) + _parameters[name].name = _parameters[name].name.replace("{file}", file) else: - _parameters[name] = _parameters[name].replace("{doc}", doc) + _parameters[name] = _parameters[name].replace("{file}", file) - # Replace wildcards (other than {doc}) in parameters + # Replace 
wildcards (other than {file}) in parameters for name in rule_params.wildcard_annotations: - wcs = re.finditer(r"(?!{doc}){([^}]+)}", str(_parameters[name])) + wcs = re.finditer(r"(?!{file}){([^}]+)}", str(_parameters[name])) for wc in wcs: if isinstance(_parameters[name], Base): _parameters[name].name = _parameters[name].name.replace(wc.group(), wildcards.get(wc.group(1))) @@ -602,7 +635,7 @@ def get_source_path() -> str: def get_annotation_path(annotation, data=False, common=False): - """Construct a path to an annotation file given a doc and annotation.""" + """Construct a path to an annotation file given an annotation name.""" if not isinstance(annotation, BaseAnnotation): annotation = BaseAnnotation(annotation) elem, attr = annotation.split() @@ -614,30 +647,13 @@ def get_annotation_path(annotation, data=False, common=False): path = path / attr if not common: - path = "{doc}" / path + path = "{file}" / path return path -def get_source_files(source_files) -> List[str]: - """Get list of all available source files.""" - if not source_files: - if not sparv_config.get("import.importer"): - raise util.SparvErrorMessage("The config variable 'import.importer' must not be empty.", "sparv") - try: - importer_module, _, importer_function = sparv_config.get("import.importer").partition(":") - file_extension = registry.modules[importer_module].functions[importer_function]["file_extension"] - except KeyError: - raise util.SparvErrorMessage( - "Could not find the importer '{}'. Make sure the 'import.importer' config value refers to an " - "existing importer.".format(sparv_config.get("import.importer")), "sparv") - source_files = [f[1][0] for f in snakemake.utils.listfiles( - Path(get_source_path(), "{file}." + file_extension))] - return source_files - - -def get_doc_values(config, snake_storage): - """Get a list of files represented by the doc wildcard.""" - return config.get("doc") or get_source_files(snake_storage.source_files) +def get_file_values(config, snake_storage): + """Get a list of files represented by the {file} wildcard.""" + return config.get("file") or snake_storage.source_files def get_wildcard_values(config): @@ -646,19 +662,19 @@ def get_wildcard_values(config): def escape_wildcards(s): - """Escape all wildcards other than {doc}.""" - return re.sub(r"(?!{doc})({[^}]+})", r"{\1}", str(s)) + """Escape all wildcards other than {file}.""" + return re.sub(r"(?!{file})({[^}]+})", r"{\1}", str(s)) -def get_doc_value(wildcards, annotator): - """Extract the {doc} part from full annotation path.""" - doc = None - if hasattr(wildcards, "doc"): +def get_file_value(wildcards, annotator): + """Extract the {file} part from full annotation path.""" + file = None + if hasattr(wildcards, "file"): if annotator: - doc = wildcards.doc[len(str(paths.work_dir)) + 1:] + file = wildcards.file[len(str(paths.work_dir)) + 1:] else: - doc = wildcards.doc - return doc + file = wildcards.file + return file def load_config(snakemake_config): @@ -681,25 +697,42 @@ def load_config(snakemake_config): def get_install_outputs(snake_storage: SnakeStorage, install_types: Optional[List] = None): """Collect files to be created for all installations given as argument or listed in config.install.""" - install_inputs = [] + unknown = [] + install_outputs = [] for installation in install_types or sparv_config.get("install", []): - install_inputs.extend(snake_storage.install_outputs[installation]) + if installation not in snake_storage.install_outputs: + unknown.append(installation) + else: + 
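A small usage sketch of the `escape_wildcards` helper above: `{file}` is left intact for Snakemake, while any other wildcard gets its braces doubled so it is treated literally. The example path is made up.

```python
import re


def escape_wildcards(s):
    """Escape all wildcards other than {file}."""
    return re.sub(r"(?!{file})({[^}]+})", r"{\1}", str(s))


print(escape_wildcards("annotations/{file}/segment.token/{attribute}"))
# -> annotations/{file}/segment.token/{{attribute}}
```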
install_outputs.extend(snake_storage.install_outputs[installation]) + + if unknown: + raise SparvErrorMessage("Unknown installation{} selected:\n • {}".format( + "s" if len(unknown) > 1 else "", + "\n • ".join(unknown) + )) - return install_inputs + return install_outputs -def get_export_targets(snake_storage, rules, doc, wildcards): +def get_export_targets(snake_storage, rules, file, wildcards): """Get export targets from sparv_config.""" all_outputs = [] + config_exports = set(sparv_config.get("export.default", [])) for rule in snake_storage.all_rules: - if rule.type == "exporter" and rule.target_name in sparv_config.get("export.default", []): + if rule.type == "exporter" and rule.target_name in config_exports: + config_exports.remove(rule.target_name) + # Get all output files for all source files + rule_outputs = expand(rule.outputs if not rule.abstract else rule.inputs, file=file, **wildcards) # Get Snakemake rule object sm_rule = getattr(rules, rule.rule_name).rule - # Get all output files for all documents - rule_outputs = expand(rule.outputs if not rule.abstract else rule.inputs, doc=doc, **wildcards) - # Convert paths to IOFile objects so Snakemake knows which rule they come from (in case of ambiguity) - all_outputs.extend([snakemake.io.IOFile(f, rule=sm_rule) for f in rule_outputs]) + all_outputs.append((sm_rule if not rule.abstract else None, rule_outputs)) + + if config_exports: + raise SparvErrorMessage( + "Unknown output format{} specified in export.default:\n • {}".format( + "s" if len(config_exports) > 1 else "", + "\n • ".join(config_exports))) return all_outputs diff --git a/sparv/core/wizard.py b/sparv/core/wizard.py index 4a65f949..bb681da9 100644 --- a/sparv/core/wizard.py +++ b/sparv/core/wizard.py @@ -8,12 +8,11 @@ from typing import Callable, List, Optional, Tuple, Union import questionary.prompts.common -import yaml from prompt_toolkit.shortcuts import clear as clear_screen from prompt_toolkit.styles import Style from questionary import prompt -from sparv import SourceStructureParser, Wildcard +from sparv.api import SourceStructureParser, Wildcard from sparv.core import registry, paths, config, snake_utils from sparv.core.console import console @@ -189,7 +188,7 @@ def get_module_wizard(self, module_wizard: Tuple[Callable, list, bool]) -> List[ def save_config(self): """Save config to YAML file.""" - with open("config.yaml", mode="w") as out_file: + with open("config.yaml", mode="w", encoding="utf-8") as out_file: out_file.write(config.dump_config({k: v for k, v in self.corpus_config.items() if not k.startswith("_")})) print("Your corpus configuration has been saved as 'config.yaml'.") @@ -205,9 +204,11 @@ def load_config(self): "again. 
Select Y to continue or N to abort.".format(paths.config_file) }, clear=True, save_prompt=False)["answer"] if use_config_file: - with open(self.config_file) as f: - self.corpus_config = yaml.load(f, Loader=yaml.FullLoader) + self.corpus_config = config.read_yaml(self.config_file) config.load_config(self.config_file) + if not self.corpus_config: + # If config is completely empty, treat as if config is missing + return False return True else: sys.exit() @@ -283,7 +284,7 @@ def run(self): # Start with metadata questions self.metadata_questions() - # Questions related to the source documents + # Questions related to the source files self.source_questions() # Select annotations @@ -323,7 +324,7 @@ def run(self): "value": "metadata" }, { - "name": "Edit settings related to the source documents", + "name": "Edit settings related to the source files", "value": "source" }, { @@ -414,13 +415,19 @@ def metadata_questions(self): for w in self.wizard_from_module["metadata"]: questions.extend(self.get_module_wizard(w)) self.update_config(self.q(questions, clear=True)) + # Split language into language and variety if necessary and save in config + langcode, _, suffix = config.get("metadata.language", "").partition("-") + if suffix: + config.set_value("metadata.language", langcode, config_dict=self.corpus_config) + config.set_value("metadata.variety", suffix, config_dict=self.corpus_config) + # Now that we know the language, update the class dict in registry... self.update_class_registry() # ...and rebuild annotator list self.update_annotators() def source_questions(self): - """As questions related to the source documents.""" + """As questions related to the source files.""" # Importer choice questions = [] for w in self.wizard_from_module["import"]: @@ -430,8 +437,8 @@ def source_questions(self): # Ask user if they want to scan source files self.scan_source() - # Choose document annotation - self.select_document_annotation() + # Choose text annotation + self.select_text_annotation() # Select source annotations to keep questions = [] @@ -439,27 +446,27 @@ def source_questions(self): questions.extend(self.get_module_wizard(w)) self.update_config(self.q(questions)) - def select_document_annotation(self): - """Ask user for document annotation.""" - doc_annotation = self.q(self.set_defaults({ + def select_text_annotation(self): + """Ask user for text annotation.""" + text_annotation = self.q(self.set_defaults({ "type": "select", - "name": "import.document_annotation", + "name": "import.text_annotation", "message": "What is the name of the existing annotation in your source files that encapsulates a " - "'document'? This is the text unit for which all document level annotations will apply. " - "Usually, no text should exist outside of this annotation.", + "'text'? This is the text unit for which all text level annotations will apply. " + "Usually, no text content should exist outside of this annotation.", "choices": self.source_structure.get_plain_annotations(self.corpus_config) + [{ "name": "Enter manually", "value": "__sparv_manual_entry" }] })) - if doc_annotation["import.document_annotation"] == "__sparv_manual_entry": - doc_annotation = self.q({ + if text_annotation["import.text_annotation"] == "__sparv_manual_entry": + text_annotation = self.q({ "type": "text", - "name": "import.document_annotation", + "name": "import.text_annotation", "message": "Annotation name, e.g. 
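A minimal illustration of the language/variety split performed in `metadata_questions` above; the value `swe-1800` is only an assumed example of a language code with a variety suffix.

```python
# With a suffix, the code is split into language and variety...
langcode, _, suffix = "swe-1800".partition("-")
print(langcode, suffix)        # swe 1800

# ...and without one, nothing is written to metadata.variety.
langcode, _, suffix = "swe".partition("-")
print(langcode, repr(suffix))  # swe ''
```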
'text' or ̈́document':", "validate": lambda x: bool(re.match(r"^\S+$", x)) }) - self.update_config(doc_annotation) + self.update_config(text_annotation) def scan_source(self): """Create a SourceStructureParser instance and offer to scan source files if possible.""" diff --git a/sparv/core_modules/export/__init__.py b/sparv/core_modules/export/__init__.py index a2fd6113..e99b354f 100644 --- a/sparv/core_modules/export/__init__.py +++ b/sparv/core_modules/export/__init__.py @@ -1,10 +1,9 @@ -from sparv import Config, SourceStructureParser, wizard +from sparv.api import Config, SourceStructureParser, wizard __config__ = [ Config("export.default", description="List of exporters to use by default"), Config("export.annotations", description="List of automatic annotations to include in export"), Config("export.source_annotations", description="List of annotations and attributes from the source to include"), - Config("export.header_annotations", description="List of headers from the source data to include"), Config("export.word", description="Annotation to use as token text in export"), Config("export.remove_module_namespaces", description="Remove module name prefixes from annotation names in export"), diff --git a/sparv/core_modules/import/__init__.py b/sparv/core_modules/import/__init__.py index cb797c59..39b61388 100644 --- a/sparv/core_modules/import/__init__.py +++ b/sparv/core_modules/import/__init__.py @@ -1,16 +1,16 @@ import os -from sparv import Config, wizard +from sparv.api import Config, wizard from sparv.core import paths, registry __config__ = [ - Config("import.document_annotation", description="Annotation representing a document"), + Config("import.text_annotation", description="Annotation representing a text"), Config("import.source_dir", paths.source_dir, description="Directory containing corpus source files"), Config("import.importer", description="Name of importer to use"), Config("import.keep_control_chars", False, description="Set to True to keep control characters"), Config("import.normalize", "NFC", description="Normalize input using any of the following forms: " "'NFC', 'NFKC', 'NFD', and 'NFKD'"), - Config("import.encoding", "UTF-8", description="Encoding of source documents"), + Config("import.encoding", "UTF-8", description="Encoding of source files"), ] diff --git a/sparv/core_modules/metadata/__init__.py b/sparv/core_modules/metadata/__init__.py index 6345a65c..cf7129a5 100644 --- a/sparv/core_modules/metadata/__init__.py +++ b/sparv/core_modules/metadata/__init__.py @@ -1,13 +1,14 @@ """General metadata about corpus.""" import re -from sparv import Config, wizard +from sparv.api import Config, wizard from sparv.core import registry __config__ = [ Config("metadata.id", description="Machine name of corpus (a-z, 0-9, -)"), Config("metadata.name", description="Human readable name of corpus"), Config("metadata.language", description="Language of source files (ISO 639-3)"), + Config("metadata.variety", description="Language variety of source files (if applicable)"), Config("metadata.description", description="Description of corpus") ] diff --git a/sparv/core_modules/sparv/__init__.py b/sparv/core_modules/sparv/__init__.py new file mode 100644 index 00000000..45213031 --- /dev/null +++ b/sparv/core_modules/sparv/__init__.py @@ -0,0 +1,9 @@ +"""Settings related to core Sparv functionality.""" + +from sparv.api import Config +from sparv.core.io import compression + +__config__ = [ + Config("sparv.compression", default=compression, + description="Compression to use for files 
in work-dir ('none', 'gzip', 'bzip2' or 'lzma'. Default: gzip)") +] diff --git a/sparv/modules/conll_export/conllu.py b/sparv/modules/conll_export/conllu.py index 7ade281b..3d7958cb 100644 --- a/sparv/modules/conll_export/conllu.py +++ b/sparv/modules/conll_export/conllu.py @@ -3,15 +3,16 @@ import os from typing import Optional -import sparv.util as util -from sparv import Annotation, Config, Document, Export, SourceAnnotations, exporter +from sparv.api import Annotation, Config, SourceFilename, Export, SourceAnnotations, exporter, get_logger, util -logger = util.get_logger(__name__) +logger = get_logger(__name__) @exporter("CoNLL-U (SBX version) export", language=["swe"], config=[ + Config("conll_export.source_annotations", description="List of annotations and attributes from the source data to " + "include. Everything will be included by default."), Config("conll_export.conll_fields.sentid", default=":misc.id", description="Sentence ID"), - Config("conll_export.conll_fields.id", default=":misc.number_rel_", + Config("conll_export.conll_fields.id", default="", description="Annotation in ID field of CoNLL-U output"), Config("conll_export.conll_fields.lemma", default="", description="Annotation in LEMMA field of CoNLL-U output"), @@ -30,8 +31,8 @@ Config("conll_export.conll_fields.misc", default=None, description="Annotation in MISC field of CoNLL-U output") ]) -def conllu(doc: Document = Document(), - out: Export = Export("conll/{doc}.conllu"), +def conllu(source_file: SourceFilename = SourceFilename(), + out: Export = Export("conll_export/{file}.conllu"), token: Annotation = Annotation(""), sentence: Annotation = Annotation(""), sentence_id: Annotation = Annotation("[conll_export.conll_fields.sentid]"), @@ -72,10 +73,11 @@ def conllu(doc: Document = Document(), # want to use here. 
annotations = [sentence, sentence_id, token] + conll_fields annotations = [(annot, None) for annot in annotations] - annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, - remove_namespaces=True, - doc=doc, token_name=token_name) - span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc) + annotation_list, _, export_names = util.export.get_annotation_names(annotations, source_annotations, + remove_namespaces=True, + source_file=source_file, token_name=token_name) + span_positions, annotation_dict = util.export.gather_annotations(annotation_list, export_names, + source_file=source_file) csv_data = ["# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"] # Go through spans_dict and add to csv, line by line @@ -101,7 +103,7 @@ def conllu(doc: Document = Document(), csv_data.append("") # Write result to file - with open(out, "w") as f: + with open(out, "w", encoding="utf-8") as f: f.write("\n".join(csv_data)) logger.info("Exported: %s", out) @@ -116,7 +118,7 @@ def _make_conll_token_line(conll_fields, token, annotation_dict, index, delimite attr_str = annotation_dict[token][annot.attribute_name][index].strip("|") or "_" # If there are multiple lemmas, use the first one if i == 2: - attr_str = util.set_to_list(attr_str)[0] + attr_str = util.misc.set_to_list(attr_str)[0] # Set head (index 6 in conll_fields) to '0' when root if i == 6 and attr_str == "_": attr_str = "0" diff --git a/sparv/modules/csv_export/csv_export.py b/sparv/modules/csv_export/csv_export.py index 193b3bbf..f47a1ef3 100644 --- a/sparv/modules/csv_export/csv_export.py +++ b/sparv/modules/csv_export/csv_export.py @@ -2,10 +2,10 @@ import os -import sparv.util as util -from sparv import Annotation, Config, Document, Export, ExportAnnotations, SourceAnnotations, exporter +from sparv.api import (Annotation, Config, SourceFilename, Export, ExportAnnotations, SourceAnnotations, exporter, get_logger, + util) -logger = util.get_logger(__name__) +logger = get_logger(__name__) @exporter("CSV export", config=[ @@ -15,8 +15,8 @@ "included by default."), Config("csv_export.annotations", description="Sparv annotations to include.") ]) -def csv(doc: Document = Document(), - out: Export = Export("csv/{doc}.csv"), +def csv(source_file: SourceFilename = SourceFilename(), + out: Export = Export("csv_export/{file}.csv"), token: Annotation = Annotation(""), word: Annotation = Annotation("[export.word]"), sentence: Annotation = Annotation(""), @@ -36,12 +36,14 @@ def csv(doc: Document = Document(), word_annotation = list(word.read()) # Get annotation spans, annotations list etc. - annotation_list, token_attributes, export_names = util.get_annotation_names(annotations, source_annotations, - doc=doc, token_name=token_name, - remove_namespaces=remove_namespaces, - sparv_namespace=sparv_namespace, - source_namespace=source_namespace) - span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc) + annotation_list, token_attributes, export_names = util.export.get_annotation_names( + annotations, source_annotations, source_file=source_file, token_name=token_name, + remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace, source_namespace=source_namespace) + if token not in annotation_list: + logger.warning("The 'csv_export:csv' export requires the annotation for the output to include " + "the source text. 
Make sure to add to the list of export annotations.") + span_positions, annotation_dict = util.export.gather_annotations(annotation_list, export_names, + source_file=source_file) # Make csv header csv_data = [_make_header(token_name, token_attributes, export_names, delimiter)] @@ -67,7 +69,7 @@ def csv(doc: Document = Document(), csv_data.append("") # Write result to file - with open(out, "w") as f: + with open(out, "w", encoding="utf-8") as f: f.write("\n".join(csv_data)) logger.info("Exported: %s", out) @@ -85,7 +87,7 @@ def _make_token_line(word, token, token_attributes, annotation_dict, index, deli line = [word.replace(delimiter, " ")] for attr in token_attributes: if attr not in annotation_dict[token]: - attr_str = util.UNDEF + attr_str = util.constants.UNDEF else: attr_str = annotation_dict[token][attr][index] line.append(attr_str) @@ -98,5 +100,6 @@ def _make_attrs(annotation, annotation_dict, export_names, index): for name, annot in annotation_dict[annotation].items(): export_name = export_names.get(":".join([annotation, name]), name) annotation_name = export_names.get(annotation, annotation) - attrs.append("%s.%s = %s" % (annotation_name, export_name, annot[index])) + if annot[index]: + attrs.append("%s.%s = %s" % (annotation_name, export_name, annot[index])) return attrs diff --git a/sparv/modules/cwb/__init__.py b/sparv/modules/cwb/__init__.py index b35d63a1..30ea304e 100644 --- a/sparv/modules/cwb/__init__.py +++ b/sparv/modules/cwb/__init__.py @@ -1,3 +1,10 @@ """Exports, encodes and aligns corpora for Corpus Workbench.""" -from . import cwb, info +from sparv.api import Config +from . import cwb, info, install_corpus + +__config__ = [ + Config("cwb.remote_host", description="Remote host to install CWB files to"), + Config("cwb.remote_registry_dir", "", description="CWB registry path on remote host"), + Config("cwb.remote_data_dir", "", description="CWB datadir path on remote host") +] diff --git a/sparv/modules/cwb/cwb.py b/sparv/modules/cwb/cwb.py index f5ca8b69..fc2cac6b 100644 --- a/sparv/modules/cwb/cwb.py +++ b/sparv/modules/cwb/cwb.py @@ -1,18 +1,17 @@ """Tools for exporting, encoding and aligning corpora for Corpus Workbench.""" -import logging import os import re +from collections import OrderedDict from glob import glob from pathlib import Path from typing import Optional -import sparv.util as util -from sparv import (AllDocuments, Annotation, AnnotationAllDocs, Config, Corpus, Document, Export, ExportAnnotations, - ExportInput, SourceAnnotations, exporter) -from sparv.core import paths +from sparv.api import (AllSourceFilenames, Annotation, AnnotationAllSourceFiles, Config, Corpus, SourceFilename, Export, + ExportAnnotations, ExportInput, SourceAnnotations, SourceAnnotationsAllSourceFiles, + SparvErrorMessage, exporter, get_logger, util) -log = logging.getLogger(__name__) +logger = get_logger(__name__) @exporter("VRT export", config=[ @@ -21,8 +20,8 @@ "included by default."), Config("cwb.annotations", description="Sparv annotations to include.") ]) -def vrt(doc: Document = Document(), - out: Export = Export("vrt/{doc}.vrt"), +def vrt(source_file: SourceFilename = SourceFilename(), + out: Export = Export("cwb.vrt/{file}.vrt"), token: Annotation = Annotation(""), word: Annotation = Annotation("[export.word]"), annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"), @@ -33,7 +32,7 @@ def vrt(doc: Document = Document(), """Export annotations to vrt. - annotations: list of elements:attributes (annotations) to include. 
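A usage sketch of the `_make_attrs` helper above, showing that empty attribute values no longer produce a line in the csv output. The sample data is made up and assumes `_make_attrs` is in scope.

```python
export_names = {"text": "text", "text:title": "title"}
annotation_dict = {"text": {"title": ["My title", ""]}}

print(_make_attrs("text", annotation_dict, export_names, 0))  # ['text.title = My title']
print(_make_attrs("text", annotation_dict, export_names, 1))  # []
```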
- - source_annotations: list of elements:attributes from the original document + - source_annotations: list of elements:attributes from the original source file to be kept. If not specified, everything will be kept. """ # Create export dir @@ -43,26 +42,28 @@ def vrt(doc: Document = Document(), word_annotation = list(word.read()) # Get annotation spans, annotations list etc. - annotation_list, token_attributes, export_names = util.get_annotation_names(annotations, source_annotations, - doc=doc, token_name=token.name, - remove_namespaces=remove_namespaces, - sparv_namespace=sparv_namespace, - source_namespace=source_namespace) - span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc) + annotation_list, token_attributes, export_names = util.export.get_annotation_names( + annotations, source_annotations, source_file=source_file, token_name=token.name, + remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace, source_namespace=source_namespace) + if token not in annotation_list: + logger.warning("The 'cwb:vrt' export requires the annotation for the output to include " + "the source text. Make sure to add to the list of export annotations.") + span_positions, annotation_dict = util.export.gather_annotations(annotation_list, export_names, + source_file=source_file) vrt_data = create_vrt(span_positions, token.name, word_annotation, token_attributes, annotation_dict, export_names) # Write result to file - with open(out, "w") as f: + with open(out, "w", encoding="utf-8") as f: f.write(vrt_data) - log.info("Exported: %s", out) + logger.info("Exported: %s", out) @exporter("Scrambled VRT export", config=[ Config("cwb.scramble_on", description="Annotation to use for scrambling.") ]) -def vrt_scrambled(doc: Document = Document(), - out: Export = Export("vrt_scrambled/{doc}.vrt"), +def vrt_scrambled(source_file: SourceFilename = SourceFilename(), + out: Export = Export("cwb.vrt_scrambled/{file}.vrt"), chunk: Annotation = Annotation("[cwb.scramble_on]"), chunk_order: Annotation = Annotation("[cwb.scramble_on]:misc.number_random"), token: Annotation = Annotation(""), @@ -73,117 +74,117 @@ def vrt_scrambled(doc: Document = Document(), sparv_namespace: str = Config("export.sparv_namespace"), source_namespace: str = Config("export.source_namespace")): """Export annotations to vrt in scrambled order.""" + logger.progress(total=6) # Get annotation spans, annotations list etc. - annotation_list, token_attributes, export_names = util.get_annotation_names(annotations, source_annotations, - doc=doc, token_name=token.name, - remove_namespaces=remove_namespaces, - sparv_namespace=sparv_namespace, - source_namespace=source_namespace) + annotation_list, token_attributes, export_names = util.export.get_annotation_names( + annotations, source_annotations, source_file=source_file, token_name=token.name, + remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace, source_namespace=source_namespace) + logger.progress() + if token not in annotation_list: + logger.warning("The 'cwb:vrt_scrambled' export requires the annotation for the output to include " + "the source text. 
Make sure to add to the list of export annotations.") if chunk not in annotation_list: - raise util.SparvErrorMessage( + raise SparvErrorMessage( "The annotation used for scrambling ({}) needs to be included in the output.".format(chunk)) - span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc, - split_overlaps=True) + span_positions, annotation_dict = util.export.gather_annotations(annotation_list, export_names, + source_file=source_file, split_overlaps=True) + logger.progress() - # Read words and document ID + # Read words and scramble order word_annotation = list(word.read()) chunk_order_data = list(chunk_order.read()) - # Reorder chunks and open/close tags in correct order - new_span_positions = util.scramble_spans(span_positions, chunk.name, chunk_order_data) + logger.progress() + # Reorder chunks and open/close tags in correct order + new_span_positions = util.export.scramble_spans(span_positions, chunk.name, chunk_order_data) + logger.progress() # Make vrt format vrt_data = create_vrt(new_span_positions, token.name, word_annotation, token_attributes, annotation_dict, export_names) - + logger.progress() # Create export dir os.makedirs(os.path.dirname(out), exist_ok=True) # Write result to file - with open(out, "w") as f: + with open(out, "w", encoding="utf-8") as f: f.write(vrt_data) - log.info("Exported: %s", out) + logger.info("Exported: %s", out) + logger.progress() @exporter("CWB encode", order=2, config=[ - Config("cwb.corpus_registry", default=paths.corpus_registry, description="Path to CWB registry directory"), - Config("cwb.cwb_datadir", default=paths.cwb_datadir, description="Path to CWB data directory"), Config("cwb.bin_path", default="", description="Path to directory containing the CWB executables"), - Config("cwb.encoding", default=paths.cwb_encoding, description="Encoding to use"), + Config("cwb.encoding", default="utf8", description="Encoding to use"), Config("cwb.skip_compression", False, description="Whether to skip compression"), Config("cwb.skip_validation", False, description="Whether to skip validation") ]) def encode(corpus: Corpus = Corpus(), annotations: ExportAnnotations = ExportAnnotations("cwb.annotations", is_input=False), - source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"), - docs: AllDocuments = AllDocuments(), - words: AnnotationAllDocs = AnnotationAllDocs("[export.word]"), - vrtfiles: ExportInput = ExportInput("vrt/{doc}.vrt", all_docs=True), - _out: Export = Export("[cwb.corpus_registry]/[metadata.id]", absolute_path=True), - out_marker: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.original_marker", absolute_path=True), - token: AnnotationAllDocs = AnnotationAllDocs(""), + source_annotations: SourceAnnotationsAllSourceFiles = SourceAnnotationsAllSourceFiles( + "cwb.source_annotations"), + source_files: AllSourceFilenames = AllSourceFilenames(), + words: AnnotationAllSourceFiles = AnnotationAllSourceFiles("[export.word]"), + vrtfiles: ExportInput = ExportInput("cwb.vrt/{file}.vrt", all_files=True), + out_registry: Export = Export("cwb.encoded/registry/[metadata.id]"), + out_marker: Export = Export("cwb.encoded/data/.marker"), + token: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), bin_path: Config = Config("cwb.bin_path"), encoding: str = Config("cwb.encoding"), - datadir: str = Config("cwb.cwb_datadir"), - registry: str = Config("cwb.corpus_registry"), remove_namespaces: bool = Config("export.remove_module_namespaces", False), sparv_namespace: str = 
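The `logger.progress()` calls above use the per-annotator progress bar support added in this release. A minimal sketch of how a plugin might use it; the module name `example` and the output annotation are hypothetical, while `get_logger`, `logger.progress` and the `sparv.api` classes are taken from this diff.

```python
from sparv.api import Annotation, Output, annotator, get_logger

logger = get_logger(__name__)


@annotator("Uppercase the token text (illustration only)")
def uppercase(word: Annotation = Annotation("<token:word>"),
              out: Output = Output("<token>:example.upper")):  # hypothetical output name
    """Write an uppercased copy of every token."""
    words = list(word.read())
    logger.progress(total=len(words))  # declare the total number of steps
    result = []
    for w in words:
        result.append(w.upper())
        logger.progress()              # advance the bar one step
    out.write(result)
```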
Config("export.sparv_namespace"), source_namespace: str = Config("export.source_namespace"), skip_compression: Optional[bool] = Config("cwb.skip_compression"), skip_validation: Optional[bool] = Config("cwb.skip_validation")): - """Do cwb encoding with vrt files in original order.""" - cwb_encode(corpus, annotations, source_annotations, docs, words, vrtfiles, out_marker, token.name, - bin_path, encoding, datadir, registry, remove_namespaces, sparv_namespace, source_namespace, + """Encode CWB corpus from VRT files.""" + cwb_encode(corpus, annotations, source_annotations, source_files, words, vrtfiles, out_registry, out_marker, + token.name, bin_path, encoding, remove_namespaces, sparv_namespace, source_namespace, skip_compression, skip_validation) @exporter("CWB encode, scrambled", order=1) def encode_scrambled(corpus: Corpus = Corpus(), annotations: ExportAnnotations = ExportAnnotations("cwb.annotations", is_input=False), - source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"), - docs: AllDocuments = AllDocuments(), - words: AnnotationAllDocs = AnnotationAllDocs("[export.word]"), - vrtfiles: ExportInput = ExportInput("vrt_scrambled/{doc}.vrt", all_docs=True), - _out: Export = Export("[cwb.corpus_registry]/[metadata.id]", absolute_path=True), - out_marker: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.scrambled_marker", - absolute_path=True), - token: AnnotationAllDocs = AnnotationAllDocs(""), + source_annotations: SourceAnnotationsAllSourceFiles = SourceAnnotationsAllSourceFiles( + "cwb.source_annotations"), + source_files: AllSourceFilenames = AllSourceFilenames(), + words: AnnotationAllSourceFiles = AnnotationAllSourceFiles("[export.word]"), + vrtfiles: ExportInput = ExportInput("cwb.vrt_scrambled/{file}.vrt", all_files=True), + out_registry: Export = Export("cwb.encoded_scrambled/registry/[metadata.id]"), + out_marker: Export = Export("cwb.encoded_scrambled/data/.scrambled_marker"), + token: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), bin_path: Config = Config("cwb.bin_path"), encoding: str = Config("cwb.encoding"), - datadir: str = Config("cwb.cwb_datadir"), - registry: str = Config("cwb.corpus_registry"), remove_namespaces: bool = Config("export.remove_module_namespaces", False), sparv_namespace: str = Config("export.sparv_namespace"), source_namespace: str = Config("export.source_namespace"), skip_compression: Optional[bool] = Config("cwb.skip_compression"), skip_validation: Optional[bool] = Config("cwb.skip_validation")): - """Do cwb encoding with vrt files in scrambled order.""" - cwb_encode(corpus, annotations, source_annotations, docs, words, vrtfiles, out_marker, token.name, - bin_path, encoding, datadir, registry, remove_namespaces, sparv_namespace, source_namespace, + """Encode CWB corpus from scrambled VRT files.""" + cwb_encode(corpus, annotations, source_annotations, source_files, words, vrtfiles, out_registry, out_marker, + token.name, bin_path, encoding, remove_namespaces, sparv_namespace, source_namespace, skip_compression, skip_validation) -def cwb_encode(corpus, annotations, source_annotations, docs, words, vrtfiles, out_marker, token_name: str, - bin_path, encoding, datadir, registry, remove_namespaces, sparv_namespace, source_namespace, +def cwb_encode(corpus, annotations, source_annotations, source_files, words, vrtfiles, out_registry, out_marker, + token_name: str, bin_path, encoding, remove_namespaces, sparv_namespace, source_namespace, skip_compression, skip_validation): """Encode a number of vrt files, by calling 
cwb-encode.""" - assert datadir, "CWB_DATADIR not specified" - assert registry, "CORPUS_REGISTRY not specified" + if not corpus.strip(): + raise SparvErrorMessage("metadata.id needs to be set.") # Get vrt files - vrtfiles = [vrtfiles.replace("{doc}", doc) for doc in docs] + vrtfiles = [vrtfiles.replace("{file}", file) for file in source_files] vrtfiles.sort() # Word annotation should always be included in CWB export annotations.insert(0, (words, None)) # Get annotation names - annotation_list, token_attributes, export_names = util.get_annotation_names(annotations, source_annotations, - docs=docs, token_name=token_name, - remove_namespaces=remove_namespaces, - sparv_namespace=sparv_namespace, - source_namespace=source_namespace, - keep_struct_names=True) + annotation_list, token_attributes, export_names = util.export.get_annotation_names( + annotations, source_annotations, source_files=source_files, token_name=token_name, + remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace, source_namespace=source_namespace, + keep_struct_names=True) # Get VRT columns token_attributes = [(token_name + ":" + i) for i in token_attributes] @@ -196,13 +197,17 @@ def cwb_encode(corpus, annotations, source_annotations, docs, words, vrtfiles, o not a.annotation_name == token_name] structs = parse_structural_attributes(struct_annotations) - corpus_registry = os.path.join(registry, corpus) - corpus_datadir = os.path.join(datadir, corpus) - util.system.clear_directory(corpus_datadir) + data_dir = Path(out_marker).resolve().parent + registry_dir = Path(out_registry).resolve().parent + registry_file = Path(out_registry).resolve() + + # Create export dirs + data_dir.mkdir(exist_ok=True) + registry_dir.mkdir(exist_ok=True) encode_args = ["-s", "-p", "-", - "-d", corpus_datadir, - "-R", corpus_registry, + "-d", data_dir, + "-R", registry_file, "-c", encoding, "-x" ] @@ -214,38 +219,43 @@ def cwb_encode(corpus, annotations, source_annotations, docs, words, vrtfiles, o if col != "-": encode_args += ["-P", col] for struct, attrs in structs: - attrs2 = "+".join(attr for attr, _n in attrs if not attr == util.UNDEF) + attrs2 = "+".join(attrs) if attrs2: attrs2 = "+" + attrs2 + # ":0" is added to the s-attribute name to enable nesting support in cwb-encode encode_args += ["-S", "%s:0%s" % (struct, attrs2)] - util.system.call_binary(os.path.join(bin_path, "cwb-encode"), encode_args, verbose=True) + _, stderr = util.system.call_binary(os.path.join(bin_path, "cwb-encode"), encode_args) + if stderr: + logger.warning(stderr.decode().strip()) # Use xargs to avoid "Argument list too long" problems - # util.system.call_binary(os.path.join(bin_path, "cwb-encode"), raw_command="cat %s | xargs cat | %%s %s" % (vrtfiles, " ".join(encode_args)), use_shell=True) + # util.system.call_binary(os.path.join(bin_path, "cwb-encode"), + # raw_command="cat %s | xargs cat | %%s %s" % (vrtfiles, " ".join(encode_args)), + # use_shell=True) - index_args = ["-V", "-r", registry, corpus.upper()] + index_args = ["-V", "-r", registry_dir, corpus.upper()] util.system.call_binary(os.path.join(bin_path, "cwb-makeall"), index_args) - log.info("Encoded and indexed %d columns, %d structs", len(columns), len(structs)) + logger.info("Encoded and indexed %d columns, %d structs", len(columns), len(structs)) if not skip_compression: - log.info("Compressing corpus files...") - compress_args = ["-A", corpus.upper()] + logger.info("Compressing corpus files...") + compress_args = ["-A", "-r", registry_dir, corpus.upper()] if skip_validation: 
compress_args.insert(0, "-T") - log.info("Skipping validation") + logger.info("Skipping validation") # Compress token stream util.system.call_binary(os.path.join(bin_path, "cwb-huffcode"), compress_args) - log.info("Removing uncompressed token stream...") - for f in glob(os.path.join(corpus_datadir, "*.corpus")): + logger.info("Removing uncompressed token stream...") + for f in glob(os.path.join(data_dir, "*.corpus")): os.remove(f) # Compress index files util.system.call_binary(os.path.join(bin_path, "cwb-compress-rdx"), compress_args) - log.info("Removing uncompressed index files...") - for f in glob(os.path.join(corpus_datadir, "*.corpus.rev")): + logger.info("Removing uncompressed index files...") + for f in glob(os.path.join(data_dir, "*.corpus.rev")): os.remove(f) - for f in glob(os.path.join(corpus_datadir, "*.corpus.rdx")): + for f in glob(os.path.join(data_dir, "*.corpus.rdx")): os.remove(f) - log.info("Compression done.") + logger.info("Compression done.") # Write marker file Path(out_marker).touch() @@ -257,7 +267,7 @@ def cwb_align(corpus, other, link, aligndir="annotations/align", bin_path="", """Align 'corpus' with 'other' corpus, using the 'link' annotation for alignment.""" os.makedirs(aligndir, exist_ok=True) alignfile = os.path.join(aligndir, corpus + ".align") - log.info("Aligning %s <-> %s", corpus, other) + logger.info("Aligning %s <-> %s", corpus, other) try: [(link_name, [(link_attr, _path)])] = parse_structural_attributes(link) @@ -268,34 +278,34 @@ def cwb_align(corpus, other, link, aligndir="annotations/align", bin_path="", # Align linked chunks args = ["-v", "-o", alignfile, "-V", link_attr, corpus, other, link_name] result, _ = util.system.call_binary(os.path.join(bin_path, "cwb-align"), args, encoding=encoding) - with open(alignfile + ".result", "w") as F: + with open(alignfile + ".result", "w", encoding="utf-8") as F: print(result, file=F) _, lastline = result.rsplit("Alignment complete.", 1) - log.info("%s", lastline.strip()) + logger.info("%s", lastline.strip()) if " 0 alignment" in lastline.strip(): - log.warning("No alignment regions created") - log.info("Alignment file/result: %s/.result", alignfile) + logger.warning("No alignment regions created") + logger.info("Alignment file/result: %s/.result", alignfile) # Add alignment parameter to registry # cwb-regedit is not installed by default, so we skip it and modify the regfile directly instead: regfile = os.path.join(os.environ["CORPUS_REGISTRY"], corpus) - with open(regfile) as F: + with open(regfile, encoding="utf-8") as F: skip_align = ("ALIGNED %s" % other) in F.read() if not skip_align: - with open(regfile, "a") as F: + with open(regfile, "a", encoding="utf-8") as F: print(file=F) print("# Added by cwb.py", file=F) print("ALIGNED", other, file=F) - log.info("Added alignment to registry: %s", regfile) + logger.info("Added alignment to registry: %s", regfile) # args = [corpus, ":add", ":a", other] # result, _ = util.system.call_binary(os.path.join(bin_path, "cwb-regedit"), args) - # log.info("%s", result.strip()) + # logger.info("%s", result.strip()) # Encode the alignments into CWB args = ["-v", "-D", alignfile] result, _ = util.system.call_binary(os.path.join(bin_path, "cwb-align-encode"), args, encoding=encoding) - log.info("%s", result.strip()) + logger.info("%s", result.strip()) ################################################################################ @@ -349,46 +359,31 @@ def make_token_line(word, token, token_attributes, annotation_dict, index): line = [word.replace(" ", "_").replace("&", 
"&").replace("<", "<").replace(">", ">")] for attr in token_attributes: if attr not in annotation_dict[token]: - attr_str = util.UNDEF + attr_str = util.constants.UNDEF else: attr_str = annotation_dict[token][attr][index] line.append( attr_str.replace(" ", "_").replace("/", "").replace("&", "&").replace("<", "<").replace(">", ">")) line = "\t".join(line) - return util.remove_control_characters(line) + return util.misc.remove_control_characters(line) def parse_structural_attributes(structural_atts): - """Parse a list of annotations (element:attribute) into a list of tuples. - - >>> parse_structural_attributes("s - text:title text:author") - [('s', [('__UNDEF__', 0)]), ('text', [('title', 2), ('author', 3)])] - """ - if isinstance(structural_atts, str): - structural_atts = structural_atts.split() - structs = {} - order = [] - for n, struct in enumerate(structural_atts): - - # From the CWB documentation: "By convention, all attribute names must be lowercase - # (more precisely, they may only contain the characters a-z, 0-9, -, and _, and may not start with a digit)" - assert not struct or struct == "-" or "." not in struct, "Struct should contain ':' or be equal to '-': %s" % struct - - if ":" in struct: - elem, attr = struct.split(":") - else: - elem = struct - attr = util.UNDEF - if struct and not struct == "-": - if elem not in structs: - structs[elem] = [] - order.append(elem) - structs[elem].append((attr, n)) - return [(elem, structs[elem]) for elem in order] + """Parse a list of annotation names (annotation:attribute) into a list of tuples.""" + structs = OrderedDict() + for struct in structural_atts: + elem, _, attr = struct.partition(":") + if elem not in structs: + structs[elem] = [] + if attr: + structs[elem].append(attr) + return [(elem, structs[elem]) for elem in structs] def cwb_escape(inname): """Replace dots with "-" for CWB compatibility.""" + # From the CWB documentation: "By convention, all attribute names must be lowercase + # (more precisely, they may only contain the characters a-z, 0-9, -, and _, and may not start with a digit)" return re.sub(r"\.", "-", inname) @@ -402,4 +397,4 @@ def truncateset(string, maxlength=4095, delimiter="|", affix="|", encoding="UTF- for i, value in enumerate(values): length += len(value.encode(encoding)) + 1 if length > maxlength: - return util.cwbset(values[:i], delimiter, affix) + return util.misc.cwbset(values[:i], delimiter, affix) diff --git a/sparv/modules/cwb/info.py b/sparv/modules/cwb/info.py index b496c5f3..87e38f01 100644 --- a/sparv/modules/cwb/info.py +++ b/sparv/modules/cwb/info.py @@ -1,25 +1,42 @@ """Create or edit .info file.""" -import logging -import os import time from datetime import datetime -import sparv.util as util -from sparv import (AllDocuments, Config, Corpus, Export, ExportInput, annotator, exporter, - AnnotationAllDocs, OutputCommonData, AnnotationCommonData) +from sparv.api import (AllSourceFilenames, AnnotationAllSourceFiles, AnnotationCommonData, Config, Export, + OutputCommonData, SparvErrorMessage, annotator, exporter, get_logger) -log = logging.getLogger(__name__) +logger = get_logger(__name__) @exporter("CWB .info file") -def info(out: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.info", absolute_path=True), +def info(out: Export = Export("cwb.encoded/data/.info"), sentences: AnnotationCommonData = AnnotationCommonData("misc._count"), firstdate: AnnotationCommonData = AnnotationCommonData("cwb.datefirst"), lastdate: AnnotationCommonData = AnnotationCommonData("cwb.datelast"), resolution: 
AnnotationCommonData = AnnotationCommonData("dateformat.resolution"), - protected: bool = Config("korp.protected")): - """Save information to the file specified by 'out'.""" + protected: bool = Config("korp.protected"), + korp_modes: list = Config("korp.modes")): + """Create CWB .info file.""" + create_info_file(sentences, firstdate, lastdate, resolution, protected, korp_modes, out) + + +@exporter("CWB .info file for scrambled corpus") +def info_scrambled(out: Export = Export("cwb.encoded_scrambled/data/.info"), + sentences: AnnotationCommonData = AnnotationCommonData("misc._count"), + firstdate: AnnotationCommonData = AnnotationCommonData("cwb.datefirst"), + lastdate: AnnotationCommonData = AnnotationCommonData("cwb.datelast"), + resolution: AnnotationCommonData = AnnotationCommonData("dateformat.resolution"), + protected: bool = Config("korp.protected"), + korp_modes: list = Config("korp.modes")): + """Create CWB .info file for scrambled corpus.""" + create_info_file(sentences, firstdate, lastdate, resolution, protected, korp_modes, out) + + +def create_info_file(sentences: AnnotationCommonData, firstdate: AnnotationCommonData, lastdate: AnnotationCommonData, + resolution: AnnotationCommonData, protected: bool, korp_modes: list, + out: Export): + """Create .info file.""" content = [] protected_str = str(protected).lower() @@ -28,7 +45,8 @@ def info(out: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.info", absolute_ ("LastDate", lastdate), ("DateResolution", resolution), ("Updated", time.strftime("%Y-%m-%d")), - ("Protected", protected_str)]: + ("Protected", protected_str), + ("KorpModes", ",".join(korp_modes))]: if isinstance(value_obj, AnnotationCommonData): value = value_obj.read() else: @@ -37,32 +55,35 @@ def info(out: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.info", absolute_ content.append("%s: %s\n" % (key, value)) # Write .info file - with open(out, "w") as o: + with open(out, "w", encoding="utf-8") as o: o.writelines(content) - log.info("Exported: %s", out) + logger.info("Exported: %s", out) @annotator("datefirst and datelast files for .info", order=1) -def info_date(docs: AllDocuments = AllDocuments(), +def info_date(source_files: AllSourceFilenames = AllSourceFilenames(), out_datefirst: OutputCommonData = OutputCommonData("cwb.datefirst"), out_datelast: OutputCommonData = OutputCommonData("cwb.datelast"), - datefrom: AnnotationAllDocs = AnnotationAllDocs("[dateformat.out_annotation]:dateformat.datefrom"), - dateto: AnnotationAllDocs = AnnotationAllDocs("[dateformat.out_annotation]:dateformat.dateto"), - timefrom: AnnotationAllDocs = AnnotationAllDocs("[dateformat.out_annotation]:dateformat.timefrom"), - timeto: AnnotationAllDocs = AnnotationAllDocs("[dateformat.out_annotation]:dateformat.timeto")): + datefrom: AnnotationAllSourceFiles = AnnotationAllSourceFiles("[dateformat.out_annotation]:dateformat.datefrom"), + dateto: AnnotationAllSourceFiles = AnnotationAllSourceFiles("[dateformat.out_annotation]:dateformat.dateto"), + timefrom: AnnotationAllSourceFiles = AnnotationAllSourceFiles("[dateformat.out_annotation]:dateformat.timefrom"), + timeto: AnnotationAllSourceFiles = AnnotationAllSourceFiles("[dateformat.out_annotation]:dateformat.timeto")): """Create datefirst and datelast file (needed for .info file).""" first_date = None last_date = None - for doc in docs: - from_dates = sorted((int(x[0]), x[1]) for x in datefrom.read_attributes(doc, (datefrom, timefrom))) - if first_date is None or from_dates[0] < first_date: + for file in source_files: + from_dates = 
sorted((int(x[0]), x[1]) for x in datefrom.read_attributes(file, (datefrom, timefrom)) if x[0]) + if from_dates and (first_date is None or from_dates[0] < first_date): first_date = from_dates[0] - to_dates = sorted((int(x[0]), x[1]) for x in dateto.read_attributes(doc, (dateto, timeto))) - if last_date is None or to_dates[-1] > last_date: + to_dates = sorted((int(x[0]), x[1]) for x in dateto.read_attributes(file, (dateto, timeto)) if x[0]) + if to_dates and (last_date is None or to_dates[-1] > last_date): last_date = to_dates[-1] + if not first_date or not last_date: + raise SparvErrorMessage("Corpus is configured as having date information, but no dates were found.") + # Parse and re-format dates (zero-padding dates with less than 8 digits, needed by strptime) first_date_d = datetime.strptime(f"{str(first_date[0]).zfill(8)} {first_date[1]}", "%Y%m%d %H%M%S") first_date_formatted = first_date_d.strftime("%Y-%m-%d %H:%M:%S") @@ -77,7 +98,7 @@ def info_date(docs: AllDocuments = AllDocuments(), def info_date_unknown(out_datefirst: OutputCommonData = OutputCommonData("cwb.datefirst"), out_datelast: OutputCommonData = OutputCommonData("cwb.datelast")): """Create empty datefirst and datelast file (needed for .info file) if corpus has no date information.""" - log.info("No date information found in corpus") + logger.info("No date information found in corpus") # Write datefirst and datelast files out_datefirst.write("") diff --git a/sparv/modules/cwb/install_corpus.py b/sparv/modules/cwb/install_corpus.py new file mode 100644 index 00000000..a8b28109 --- /dev/null +++ b/sparv/modules/cwb/install_corpus.py @@ -0,0 +1,76 @@ +"""Module for installing cwb binary files on remote host.""" + +import os +from typing import Optional + +from sparv.api import Config, Corpus, ExportInput, OutputCommonData, SparvErrorMessage, installer, util + + +@installer("Install CWB datafiles on remote host") +def install_corpus( + corpus: Corpus = Corpus(), + out: OutputCommonData = OutputCommonData("cwb.install_corpus_marker"), + host: Optional[str] = Config("cwb.remote_host"), + registry_file: ExportInput = ExportInput("cwb.encoded/registry/[metadata.id]"), + info_file: ExportInput = ExportInput("cwb.encoded/data/.info"), + target_data_dir: str = Config("cwb.remote_data_dir"), + target_registry_dir: str = Config("cwb.remote_registry_dir"), + # The remaining arguments are needed by Snakemake + _marker: ExportInput = ExportInput("cwb.encoded/data/.marker")): + """Install CWB datafiles on server, by rsyncing datadir and registry.""" + sync_cwb(corpus=corpus, out=out, host=host, info_file=info_file, registry_file=registry_file, + target_data_dir=target_data_dir, target_registry_dir=target_registry_dir) + + +@installer("Install CWB datafiles for a scrambled corpus on remote host") +def install_corpus_scrambled( + corpus: Corpus = Corpus(), + out: OutputCommonData = OutputCommonData("cwb.install_corpus_scrambled_marker"), + host: Optional[str] = Config("cwb.remote_host"), + registry_file: ExportInput = ExportInput("cwb.encoded_scrambled/registry/[metadata.id]"), + info_file: ExportInput = ExportInput("cwb.encoded_scrambled/data/.info"), + target_data_dir: str = Config("cwb.remote_data_dir"), + target_registry_dir: str = Config("cwb.remote_registry_dir"), + # The remaining arguments are needed by Snakemake + _scrambled_marker: ExportInput = ExportInput("cwb.encoded_scrambled/data/.scrambled_marker")): + """Install scrambled CWB datafiles on server, by rsyncing datadir and registry.""" + sync_cwb(corpus=corpus, out=out, 
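The zero-padding in `info_date` above exists because `strptime` needs a four-digit year; a short sketch with assumed date values, as they might occur in a historical corpus:

```python
from datetime import datetime

# zfill(8) pads "9840302" to "09840302" so %Y%m%d can parse the year.
first_date = (9840302, "000000")
parsed = datetime.strptime(f"{str(first_date[0]).zfill(8)} {first_date[1]}", "%Y%m%d %H%M%S")
print(parsed)  # 0984-03-02 00:00:00
```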
host=host, info_file=info_file, registry_file=registry_file, + target_data_dir=target_data_dir, target_registry_dir=target_registry_dir) + + +def sync_cwb(corpus, out, host, info_file, registry_file, target_data_dir, target_registry_dir): + """Install CWB datafiles on server, by rsyncing CWB datadir and registry.""" + if not corpus: + raise SparvErrorMessage("Missing corpus name. Corpus not installed.") + + if not target_data_dir: + raise SparvErrorMessage("Configuration variable cwb.remote_data_dir not set! Corpus not installed.") + + if not target_registry_dir: + raise SparvErrorMessage("Configuration variable cwb.remote_registry_dir not set! Corpus not installed.") + + source_data_dir = os.path.dirname(info_file) + source_registry_dir = os.path.dirname(registry_file) + + target = os.path.join(target_data_dir, corpus) + util.system.rsync(source_data_dir, host, target) + + target_registry_file = os.path.join(target_registry_dir, corpus) + source_registry_file = os.path.join(source_registry_dir, corpus + ".tmp") + + # Fix absolute paths in registry file + with open(registry_file, encoding="utf-8") as registry_in: + with open(source_registry_file, "w", encoding="utf-8") as registry_out: + for line in registry_in: + if line.startswith("HOME"): + line = f"HOME {target_data_dir}/{corpus}\n" + elif line.startswith("INFO"): + line = f"INFO {target_data_dir}/{corpus}/.info\n" + + registry_out.write(line) + + util.system.rsync(source_registry_file, host, target_registry_file) + os.remove(source_registry_file) + + # Write marker file + out.write("") diff --git a/sparv/modules/dateformat/dateformat.py b/sparv/modules/dateformat/dateformat.py index 2fe45b20..208154ca 100644 --- a/sparv/modules/dateformat/dateformat.py +++ b/sparv/modules/dateformat/dateformat.py @@ -1,15 +1,14 @@ """Formats dates and times.""" import datetime -import logging import re from typing import Optional from dateutil.relativedelta import relativedelta -from sparv import Annotation, Config, Output, OutputCommonData, annotator +from sparv.api import Annotation, Config, Output, OutputCommonData, SparvErrorMessage, annotator, get_logger -log = logging.getLogger(__name__) +logger = get_logger(__name__) @annotator("Convert existing dates to specified output format", config=[ @@ -193,6 +192,12 @@ def get_date_length(informat): return length + # Check that the input annotation matches the output + if (in_from.annotation_name != out_from.annotation_name) or ( + in_to.annotation_name != out_to.annotation_name): + raise SparvErrorMessage("The 'dateformat' attributes must be attached to the same annotation as the input" + f" (in this case the '{in_from.annotation_name}' annotation)") + if not in_to: in_to = in_from @@ -261,7 +266,7 @@ def get_date_length(informat): break except ValueError: if tries == len(informat): - log.error("Could not parse: %s", str(vals)) + logger.error("Could not parse: %s", str(vals)) raise continue @@ -335,7 +340,7 @@ def get_date_length(informat): break except ValueError: if tries == len(informat): - log.error("Could not parse: %s", str(vals)) + logger.error("Could not parse: %s", str(vals)) raise continue diff --git a/sparv/modules/docx_import/__init__.py b/sparv/modules/docx_import/__init__.py new file mode 100644 index 00000000..e6954749 --- /dev/null +++ b/sparv/modules/docx_import/__init__.py @@ -0,0 +1,3 @@ +"""Import of docx source files.""" + +from . 
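An illustration of the registry rewrite done in `sync_cwb` above, with assumed corpus name and remote paths: the HOME and INFO lines are repointed from the local export directory to the remote data directory.

```python
corpus = "mycorpus"
target_data_dir = "/corpora/cwb/data"


def fix_registry_line(line: str) -> str:
    """Rewrite the absolute paths in a CWB registry line for the remote host."""
    if line.startswith("HOME"):
        return f"HOME {target_data_dir}/{corpus}\n"
    if line.startswith("INFO"):
        return f"INFO {target_data_dir}/{corpus}/.info\n"
    return line


print(fix_registry_line("HOME /home/me/mycorpus/export/cwb.encoded/data\n"), end="")
# HOME /corpora/cwb/data/mycorpus
```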
import docx_import diff --git a/sparv/modules/docx_import/docx_import.py b/sparv/modules/docx_import/docx_import.py new file mode 100644 index 00000000..59bcefcc --- /dev/null +++ b/sparv/modules/docx_import/docx_import.py @@ -0,0 +1,51 @@ +"""Import module for docx source files.""" + +import unicodedata + +from docx2python import docx2python +from docx2python.iterators import iter_at_depth + +from sparv.api import Config, SourceFilename, Output, Source, SourceStructure, Text, importer, util + + + +@importer("docx import", file_extension="docx", outputs=["text"], text_annotation="text", config=[ + Config("docx_import.prefix", "", description="Optional prefix to add to annotation names."), + Config("docx_import.keep_control_chars", False, description="Set to True if control characters should not be " + "removed from the text."), + Config("docx_import.normalize", "NFC", description="Normalize input using any of the following forms: " + "'NFC', 'NFKC', 'NFD', and 'NFKD'.") +]) +def parse(source_file: SourceFilename = SourceFilename(), + source_dir: Source = Source(), + prefix: str = Config("docx_import.prefix"), + keep_control_chars: bool = Config("docx_import.keep_control_chars"), + normalize: str = Config("docx_import.normalize")) -> None: + """Parse docx file as input to the Sparv Pipeline. + + Args: + source_file: The source filename. + source_dir: The source directory. + prefix: Optional prefix for output annotation. + keep_control_chars: Set to True to keep control characters in the text. + normalize: Normalize input text using any of the following forms: 'NFC', 'NFKC', 'NFD', and 'NFKD'. + 'NFC' is used by default. + """ + source_file_path = source_dir.get_path(source_file, ".docx") + d = docx2python(source_file_path) + + # Extract all text from the body, ignoring headers and footers + text = "\n\n".join(iter_at_depth(d.body, 4)) + + if not keep_control_chars: + text = util.misc.remove_control_characters(text) + + if normalize: + text = unicodedata.normalize(normalize, text) + + Text(source_file).write(text) + + # Make up a text annotation surrounding the whole file + text_annotation = "{}.text".format(prefix) if prefix else "text" + Output(text_annotation, source_file=source_file).write([(0, len(text))]) + SourceStructure(source_file).write([text_annotation]) diff --git a/sparv/modules/geo/geo.py b/sparv/modules/geo/geo.py index bb62d2bb..e19537bd 100644 --- a/sparv/modules/geo/geo.py +++ b/sparv/modules/geo/geo.py @@ -1,32 +1,25 @@ """Annotate geographical features.""" -import logging import pickle from collections import defaultdict -import sparv.util as util -from sparv import Annotation, Config, Model, ModelOutput, Output, Wildcard, annotator, modelbuilder +from sparv.api import (Annotation, Config, Model, ModelOutput, Output, Wildcard, annotator, get_logger, modelbuilder, + util) -log = logging.getLogger(__name__) +logger = get_logger(__name__) @annotator("Annotate {chunk} with location data, based on locations contained within the text", language=["swe"], - config=[ - Config("geo.context_chunk", default="", - description="Text chunk (annotation) to use for disambiguating places") - ], wildcards=[Wildcard("chunk", Wildcard.ANNOTATION)]) + wildcards=[Wildcard("chunk", Wildcard.ANNOTATION)]) def contextual(out: Output = Output("{chunk}:geo.geo_context", description="Geographical places with coordinates"), chunk: Annotation = Annotation("{chunk}"), - context: Annotation = Annotation("[geo.context_chunk]"), ne_type: Annotation = Annotation("swener.ne:swener.type"), ne_subtype: 
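The text extraction step used by the docx importer above, shown in isolation with an assumed filename: `docx2python` returns the document body as nested lists, iterating at depth 4 yields the paragraphs, and these are joined with blank lines.

```python
from docx2python import docx2python
from docx2python.iterators import iter_at_depth

document = docx2python("example.docx")  # assumed sample file
text = "\n\n".join(iter_at_depth(document.body, 4))
```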
Annotation = Annotation("swener.ne:swener.subtype"), ne_name: Annotation = Annotation("swener.ne:swener.name"), model: Model = Model("[geo.model]"), - method: str = "populous", language: list = []): """Annotate chunks with location data, based on locations contained within the text. - context = text chunk to use for disambiguating places (when applicable). chunk = text chunk to which the annotation will be added. """ model = load_model(model, language=language) @@ -35,33 +28,23 @@ def contextual(out: Output = Output("{chunk}:geo.geo_context", description="Geog ne_subtype_annotation = list(ne_subtype.read()) ne_name_annotation = list(ne_name.read()) - children_context_chunk, _orphans = context.get_children(chunk) children_chunk_ne, _orphans = chunk.get_children(ne_type) - out_annotation = chunk.create_empty_attribute() - for chunks in children_context_chunk: - all_locations = [] # TODO: Maybe not needed for anything? - context_locations = [] - chunk_locations = defaultdict(list) - - for ch in chunks: - for n in children_chunk_ne[ch]: - if ne_type_annotation[n] == "LOC" and "PPL" in ne_subtype_annotation[n]: - location_text = ne_name_annotation[n].replace("\n", " ").replace(" ", " ") - location_data = model.get(location_text.lower()) - if location_data: - all_locations.append((location_text, list(location_data))) - context_locations.append((location_text, list(location_data))) - chunk_locations[ch].append((location_text, list(location_data))) - else: - pass - # log.info("No location found for %s" % ne_name_annotation[n].replace("%", "%%")) + for chunk_index, chunk_nes in enumerate(children_chunk_ne): + chunk_locations = [] + for n in chunk_nes: + if ne_type_annotation[n] == "LOC" and "PPL" in ne_subtype_annotation[n]: + location_text = ne_name_annotation[n].replace("\n", " ").replace(" ", " ") + location_data = model.get(location_text.lower()) + if location_data: + chunk_locations.append((location_text, list(location_data))) + else: + pass + # logger.info("No location found for %s" % ne_name_annotation[n].replace("%", "%%")) chunk_locations = most_populous(chunk_locations) - - for c in chunks: - out_annotation[c] = _format_location(chunk_locations.get(c, ())) + out_annotation[chunk_index] = _format_location(chunk_locations) out.write(out_annotation) @@ -74,7 +57,6 @@ def metadata(out: Output = Output("{chunk}:geo.geo_metadata", description="Geogr chunk: Annotation = Annotation("{chunk}"), source: Annotation = Annotation("[geo.metadata_source]"), model: Model = Model("[geo.model]"), - method: str = "populous", language: list = []): """Get location data based on metadata containing location names.""" geomodel = load_model(model, language=language) @@ -82,15 +64,15 @@ def metadata(out: Output = Output("{chunk}:geo.geo_metadata", description="Geogr same_target_source = chunk.split()[0] == source.split()[0] chunk_annotation = list(chunk.read()) source_annotation = list(source.read()) + out_annotation = chunk.create_empty_attribute() # If location source and target chunk are not the same, we need # to find the parent/child relations between them. 
if not same_target_source: target_source_parents = list(source.get_parents(chunk)) - chunk_locations = {} - for i, _ in enumerate(chunk_annotation): + chunk_locations = [] if same_target_source: location_source = source_annotation[i] else: @@ -100,15 +82,10 @@ def metadata(out: Output = Output("{chunk}:geo.geo_metadata", description="Geogr if location_source: location_data = geomodel.get(location_source.strip().lower()) if location_data: - chunk_locations[i] = [(location_source, list(location_data))] - else: - chunk_locations[i] = [] + chunk_locations = [(location_source, list(location_data))] - chunk_locations = most_populous(chunk_locations) - - out_annotation = chunk.create_empty_attribute() - for c in chunk_locations: - out_annotation[c] = _format_location(chunk_locations.get(c, ())) + chunk_locations = most_populous(chunk_locations) + out_annotation[i] = _format_location(chunk_locations) out.write(out_annotation) @@ -141,7 +118,7 @@ def pickle_model(geonames, alternative_names, out): Add alternative names for each city. """ - log.info("Reading geonames: %s", geonames.name) + logger.info("Reading geonames: %s", geonames.name) result = {} model_file = geonames.read() @@ -160,7 +137,7 @@ def pickle_model(geonames, alternative_names, out): } # Parse file with alternative names of locations, paired with language codes - log.info("Reading alternative names: %s", alternative_names.name) + logger.info("Reading alternative names: %s", alternative_names.name) model_file = alternative_names.read() for line in model_file.split("\n"): @@ -171,7 +148,7 @@ def pickle_model(geonames, alternative_names, out): result[geonameid]["alternative_names"].setdefault(isolanguage, []) result[geonameid]["alternative_names"][isolanguage].append(altname) - log.info("Saving geomodel in Pickle format") + logger.info("Saving geomodel in Pickle format") out.write_pickle(result) @@ -182,7 +159,7 @@ def pickle_model(geonames, alternative_names, out): def load_model(model: Model, language=()): """Load geo model and return as dict.""" - log.info("Reading geomodel: %s", model) + logger.info("Reading geomodel: %s", model) with open(model.path, "rb") as infile: m = pickle.load(infile) @@ -195,24 +172,21 @@ def load_model(model: Model, language=()): result[altname.lower()].add( (l["name"], l["latitude"], l["longitude"], l["country"], l["population"])) - log.info("Read %d geographical names", len(result)) + logger.info("Read %d geographical names", len(result)) return result def most_populous(locations): """Disambiguate locations by only keeping the most populous ones.""" - new_locations = {} + result = set() - for chunk in locations: - new_locations[chunk] = set() - - for loc in locations[chunk]: - biggest = (loc[0], sorted(loc[1], key=lambda x: -int(x[-1]))[0]) - new_locations[chunk].add(biggest) - return new_locations + for loc in locations: + biggest = (loc[0], sorted(loc[1], key=lambda x: -int(x[-1]))[0]) + result.add(biggest) + return result def _format_location(location_data): """Format location as city;country;latitude;longitude.""" - return util.cwbset(";".join((y[0], y[3], y[1], y[2])) for x, y in location_data) + return util.misc.cwbset(";".join((y[0], y[3], y[1], y[2])) for x, y in location_data) diff --git a/sparv/modules/hist/__init__.py b/sparv/modules/hist/__init__.py index 3f290c44..a9b5c656 100644 --- a/sparv/modules/hist/__init__.py +++ b/sparv/modules/hist/__init__.py @@ -1,3 +1,18 @@ """Annotations for historical Swedish texts.""" -# from . 
import diapivot, models +from sparv.api import Config, util + +from . import diapivot, hist, models + +__config__ = [ + Config("hist.dalin_model", default="hist/dalin.pickle", description="Path to Dalin model"), + Config("hist.swedberg_model", default="hist/swedberg.pickle", description="Path to Swedberg model"), + Config("hist.fsv_model", default="hist/fsvm.pickle", description="Path to model for fornsvenska morphology"), + Config("hist.fsv_spelling", default="hist/fsv-spelling-variants.txt", + description="Path to model for fornsvenska spelling variants"), + # Set max_mwe_gaps to 0 since many (most?) multi-word in the old lexicons are unseparable (half öre etc) + Config("hist.max_mwe_gaps", default=0, description="Max amount of gaps allowed within a multiword expression"), + Config("hist.delimiter", default=util.constants.DELIM, description="Character to put between ambiguous results"), + Config("hist.affix", default=util.constants.AFFIX, description="Character to put before and after sets of results"), + Config("hist.extralemgrams", default="", description="Additional lemgram annotation") +] diff --git a/sparv/modules/hist/diapivot.py b/sparv/modules/hist/diapivot.py index 6a371299..426cd8c7 100644 --- a/sparv/modules/hist/diapivot.py +++ b/sparv/modules/hist/diapivot.py @@ -1,20 +1,19 @@ """Create diapivot annotation.""" -import logging import pickle import xml.etree.ElementTree as etree -import sparv.util as util -from sparv import Annotation, Model, ModelOutput, Output, annotator, modelbuilder +from sparv.api import Annotation, Model, ModelOutput, Output, annotator, get_logger, modelbuilder, util -log = logging.getLogger(__name__) +logger = get_logger(__name__) PART_DELIM1 = "^1" -# @annotator("Diapivot annotation", language=["swe-1800"]) -def diapivot_annotate(out: Output = Output(":hist.diapivot", description="SALDO IDs corresponding to lemgrams"), - lemgram: Annotation = Annotation(":saldo.lemgram"), +@annotator("Diapivot annotation", language=["swe-1800", "swe-fsv"]) +def diapivot_annotate(out: Output = Output(":hist.diapivot", cls="token:lemgram", + description="SALDO lemgrams inferred from the diapivot model"), + lemgram: Annotation = Annotation(":hist.lemgram"), model: Model = Model("hist/diapivot.pickle")): """Annotate each lemgram with its corresponding saldo_id according to model. @@ -24,23 +23,34 @@ def diapivot_annotate(out: Output = Output(":hist.diapivot", description= lemgram (str, optional): Existing lemgram annotation. Defaults to Annotation(":saldo.lemgram"). model (str, optional): Crosslink model. Defaults to Model("hist/diapivot.pickle"). 
""" - lexicon = PivotLexicon(model) + lexicon = PivotLexicon(model.path) lemgram_annotation = list(lemgram.read()) out_annotation = [] for lemgrams in lemgram_annotation: saldo_ids = [] - for lemgram in lemgrams.split(util.DELIM): + for lemgram in lemgrams.split(util.constants.DELIM): s_i = lexicon.get_exactMatch(lemgram) if s_i: saldo_ids += [s_i] - out_annotation.append(util.AFFIX + util.DELIM.join(set(saldo_ids)) + util.AFFIX if saldo_ids else util.AFFIX) + + out_annotation.append(util.misc.cwbset(set(saldo_ids), sort=True)) out.write(out_annotation) -# @modelbuilder("Diapivot model", language=["swe"]) +@annotator("Combine lemgrams from SALDO, Dalin, Swedberg and the diapivot", language=["swe-1800", "swe-fsv"]) +def combine_lemgrams(out: Output = Output(":hist.combined_lemgrams", cls="token:lemgram", + description="SALDO lemgrams combined from SALDO, Dalin, Swedberg and the diapivot"), + diapivot: Annotation = Annotation(":hist.diapivot"), + lemgram: Annotation = Annotation(":hist.lemgram")): + """Combine lemgrams from SALDO, Dalin, Swedberg and the diapivot into a set of annotations.""" + from sparv.modules.misc import misc + misc.merge_to_set(out, left=diapivot, right=lemgram, unique=True, sort=True) + + +@modelbuilder("Diapivot model", language=["swe-1800", "swe-fsv"]) def build_diapivot(out: ModelOutput = ModelOutput("hist/diapivot.pickle")): """Download diapivot XML dictionary and save as a pickle file.""" # Download diapivot.xml @@ -49,7 +59,7 @@ def build_diapivot(out: ModelOutput = ModelOutput("hist/diapivot.pickle")): # Create pickle file xml_lexicon = read_xml(xml_model.path) - log.info("Saving cross lexicon in Pickle format") + logger.info("Saving cross lexicon in Pickle format") picklex = {} for lem in xml_lexicon: lemgrams = [] @@ -77,11 +87,11 @@ class PivotLexicon: def __init__(self, crossfile, verbose=True): """Read pickled lexicon.""" if verbose: - log.info("Reading cross lexicon: %s", crossfile) + logger.info("Reading cross lexicon: %s", crossfile) with open(crossfile, "rb") as F: self.lexicon = pickle.load(F) if verbose: - log.info("OK, read %d words", len(self.lexicon)) + logger.info("OK, read %d words", len(self.lexicon)) def lookup(self, lem): """Lookup a word in the lexicon.""" @@ -104,7 +114,7 @@ def _split_val(key_val): def read_xml(xml): """Read the XML version of crosslinked lexicon.""" - log.info("Reading XML lexicon") + logger.info("Reading XML lexicon") lexicon = {} context = etree.iterparse(xml, events=("start", "end")) # "start" needed to save reference to root element @@ -135,9 +145,9 @@ def read_xml(xml): testwords = ["tigerhjerta..nn.1", "lågland..nn.1", "gud..nn.1"] - util.test_lexicon(lexicon, testwords) + util.misc.test_lexicon(lexicon, testwords) - log.info("OK, read") + logger.info("OK, read") return lexicon diff --git a/sparv/modules/hist/hist.py b/sparv/modules/hist/hist.py index 2c20892a..2ea77bf5 100644 --- a/sparv/modules/hist/hist.py +++ b/sparv/modules/hist/hist.py @@ -1,26 +1,229 @@ -import sparv.modules.saldo.saldo as saldo -import sparv.util as util -import sparv.diapivot as diapivot +"""Annotators for historical Swedish.""" + import re -import itertools -import os +from typing import List, Optional + +import sparv.modules.saldo.saldo as saldo +from sparv.api import Annotation, Config, Model, Output, annotator, get_logger, util +from sparv.api.util.tagsets import tagmappings + +logger = get_logger(__name__) + + +@annotator("Annotations from SALDO, Dalin and Swedberg", language=["swe-1800"], order=1, preloader=saldo.preloader, + 
preloader_params=["models"], preloader_target="models_preloaded") +def annotate_saldo( + token: Annotation = Annotation(""), + word: Annotation = Annotation(""), + sentence: Annotation = Annotation(""), + reference: Annotation = Annotation(""), + out_sense: Output = Output(":hist.sense", cls="token:sense", + description="Sense identifiers from SALDO, Dalin and Swedberg"), + out_lemgram: Output = Output(":hist.lemgram", cls="token:lemgram", + description="Lemgrams from SALDO, Dalin and Swedberg"), + out_baseform: Output = Output(":hist.baseform", cls="token:baseform", + description="Baseforms from SALDO, Dalin and Swedberg"), + models: List[Model] = [Model("[saldo.model]"), Model("[hist.dalin_model]"), Model("[hist.swedberg_model]")], + msd: Optional[Annotation] = Annotation(""), + delimiter: str = Config("hist.delimiter"), + affix: str = Config("hist.affix"), + precision: str = Config("saldo.precision"), + precision_filter: str = Config("saldo.precision_filter"), + min_precision: float = Config("saldo.min_precision"), + skip_multiword: bool = Config("saldo.skip_multiword"), + max_gaps: int = Config("hist.max_mwe_gaps"), + allow_multiword_overlap: bool = Config("saldo.allow_multiword_overlap"), + word_separator: str = Config("saldo.word_separator"), + models_preloaded: Optional[dict] = None): + """Use lexicon models (SALDO, Dalin and Swedberg) to annotate (potentially msd-tagged) words.""" + saldo.main(token=token, word=word, sentence=sentence, reference=reference, out_sense=out_sense, + out_lemgram=out_lemgram, out_baseform=out_baseform, models=models, msd=msd, delimiter=delimiter, + affix=affix, precision=precision, precision_filter=precision_filter, min_precision=min_precision, + skip_multiword=skip_multiword, max_gaps=max_gaps, allow_multiword_overlap=allow_multiword_overlap, + word_separator=word_separator, models_preloaded=models_preloaded) + + +@annotator("Annotations from Schlyter and Söderwall", language=["swe-fsv"], + preloader=saldo.preloader, preloader_params=["models"], preloader_target="models_preloaded", + config=[Config("hist.fsv_min_precision", default=0.25, + description="Only use annotations with a probability score higher than this")]) +def annotate_saldo_fsv( + token: Annotation = Annotation(""), + word: Annotation = Annotation(":hist.all_spelling_variants"), + sentence: Annotation = Annotation(""), + reference: Annotation = Annotation(""), + out_sense: Output = Output(":hist.sense", cls="token:sense", + description="Sense identifiers from SALDO (empty dummy annotation)"), + out_lemgram: Output = Output(":hist.lemgram", cls="token:lemgram", + description="Lemgrams from Schlyter and Söderwall"), + out_baseform: Output = Output(":hist.baseform", cls="token:baseform", + description="Baseforms from Schlyter and Söderwall"), + models: List[Model] = [Model("[hist.fsv_model]")], + delimiter: str = Config("hist.delimiter"), + affix: str = Config("hist.affix"), + precision: str = Config("saldo.precision"), + precision_filter: str = Config("saldo.precision_filter"), + min_precision: float = Config("hist.fsv_min_precision"), + skip_multiword: bool = Config("saldo.skip_multiword"), + max_gaps: int = Config("hist.max_mwe_gaps"), + allow_multiword_overlap: bool = Config("saldo.allow_multiword_overlap"), + word_separator: str = "|", + models_preloaded: Optional[dict] = None): + """Use lexicon models (Schlyter and Söderwall) to annotate words.""" + saldo.main(token=token, word=word, sentence=sentence, reference=reference, out_sense=out_sense, + out_lemgram=out_lemgram, 
out_baseform=out_baseform, models=models, msd="", delimiter=delimiter, + affix=affix, precision=precision, precision_filter=precision_filter, min_precision=min_precision, + skip_multiword=skip_multiword, max_gaps=max_gaps, allow_multiword_overlap=allow_multiword_overlap, + word_separator=word_separator, models_preloaded=models_preloaded) + + +@annotator("Extract POS tags (homograph sets) from lemgrams", language=["swe-fsv"]) +def extract_pos(out: Output = Output(":hist.homograph_set", description="Sets of POS extracted from lemgrams"), + lemgrams: Annotation = Annotation(""), + extralemgrams: Optional[Annotation] = Annotation("[hist.extralemgrams]"), + delimiter: str = Config("hist.delimiter"), + affix: str = Config("hist.affix")): + """Extract POS tags from lemgrams. + + Args: + out (Output): The output annotation. Defaults to Output(":hist.homograph_set"). + lemgrams (Annotation): Input lemgram annotation. Defaults to Annotation(":saldo.lemgram"). + extralemgrams (Optional[Annotation], optional): Additional annotation from which more pos-tags can be extracted. + Defaults to Annotation("[hist.extralemgrams]"). + delimiter (str): Character to put between ambiguous results. Defaults to Config("hist.delimiter"). + affix (str): Character to put before and after sets of results. Defaults to Config("hist.affix"). + """ + def oktag(tag): + return tag is not None and tag.group(1) not in ["e", "sxc", "mxc"] -# The minimun precision difference for two annotations to be considered equal -PRECISION_DIFF = 0.01 + def mkpos(_, thelems): + pos = [re.search(r"\.\.(.*?)\.", lem) for lem in thelems] + mapping = tagmappings.mappings["saldo_pos_to_suc"] + pos_lists = [mapping.get(p.group(1), []) for p in pos if oktag(p)] + return sorted(list(set([y for x in pos_lists for y in x]))) + + _annotate_standard(out, lemgrams, mkpos, extralemgrams, delimiter=delimiter, affix=affix) + + +@annotator("Get fallback lemgrams from Dalin or Swedberg", language=["swe-1800"], order=2, config=[ + Config("hist.lemgram_key", default="lem", description="Key to lookup in the lexicon"), +], preloader=saldo.preloader, preloader_params=["models"], preloader_target="models_preloaded") +def lemgram_fallback( + out: Output = Output(":hist.lemgram", cls="token:lemgram", + description="Fallback lemgrams from Dalin or Swedberg"), + word: Annotation = Annotation(""), + msd: Annotation = Annotation(""), + lemgram: Annotation = Annotation(":saldo.lemgram"), + key: str = Config("hist.lemgram_key"), + models: List[Model] = [Model("[hist.dalin_model]"), Model("[hist.swedberg_model]")], + delimiter: str = Config("hist.delimiter"), + affix: str = Config("hist.affix"), + models_preloaded: Optional[dict] = None +): + """Lookup lemgrams in models for words that do not already have a lemgram. + + Args: + out (Output): The output annotation. Defaults to Output(":hist.lemgram"). + word (Annotation): Input annotation with token strings. Defaults to Annotation(""). + msd (Annotation): Input annotation with POS and morphosyntactic descriptions. Defaults to Annotation(""). + lemgram (Annotation): Input annotation with SALDO lemgrams. Defaults to Annotation(":saldo.lemgram"). + key (str): Key to lookup in the models. Defaults to Config("hist.lemgram_key"). + models (List[Model], optional): A list of lexicon models. Defaults to [Model("[hist.dalin_model]"), Model("[hist.swedberg_model]")]. + delimiter (str): Character to put between ambiguous results. Defaults to Config("hist.delimiter"). + affix (str): Character to put before and after sets of results. 
Defaults to Config("hist.affix"). + models_preloaded (dict, optional): Preloaded models. Defaults to None. + """ + _annotate_fallback(out=out, word=word, msd=msd, main_annotation=lemgram, key=key, models=models, delimiter=delimiter, + affix=affix, models_preloaded=models_preloaded) + + +@annotator("Get fallback baseforms from Dalin or Swedberg", language=["swe-1800"], order=2, config=[ + Config("hist.baseform_key", default="gf", description="Key to lookup in the lexicon"), +], preloader=saldo.preloader, preloader_params=["models"], preloader_target="models_preloaded") +def baseform_fallback( + out: Output = Output(":hist.baseform", cls="token:baseform", + description="Fallback baseforms from Dalin or Swedberg"), + word: Annotation = Annotation(""), + msd: Annotation = Annotation(""), + baseform: Annotation = Annotation(":saldo.baseform"), + key: str = Config("hist.baseform_key"), + models: List[Model] = [Model("[hist.dalin_model]"), Model("[hist.swedberg_model]")], + delimiter: str = Config("hist.delimiter"), + affix: str = Config("hist.affix"), + models_preloaded: Optional[dict] = None +): + """Lookup baseforms in models for words that do not already have a baseform. + + Args: + out (Output): The output annotation. Defaults to Output(":hist.baseform"). + word (Annotation): Input annotation with token strings. Defaults to Annotation(""). + msd (Annotation): Input annotation with POS and morphosyntactig desciptions. Defaults to Annotation(""). + baseform (Annotation): Input annotation with SALDO baseforms. Defaults to Annotation(":saldo.baseform"). + key (str): Key to lookup in the models. Defaults to Config("hist.baseform_key"). + models (List[Model], optional): A list of lexicon models. Defaults to [Model("[hist.dalin_model]"), Model("[hist.swedberg_model]")]. + delimiter (str): Character to put between ambiguous results. Defaults to Config("hist.delimiter"). + affix (str): Character to put before and after sets of results. Defaults to Config("hist.affix"). + models_preloaded (dict, optional): Preloaded models. Defaults to None. -def annotate_variants(word, out, spellmodel, delimiter="|", affix="|", model=None): + """ + _annotate_fallback(out=out, word=word, msd=msd, main_annotation=baseform, key=key, models=models, delimiter=delimiter, + affix=affix, models_preloaded=models_preloaded) + + +@annotator("Convert POS into sets", language=["swe-1800"]) +def posset(pos: Annotation = Annotation(""), + out: Output = Output(":hist.homograph_set", description="POS converted into sets"), + delimiter: str = Config("hist.delimiter"), + affix: str = Config("hist.affix")): + """Annotate with POS sets by converting a single POS into a set (mostly used to make corpora comparable). + + Args: + pos (Annotation, optional): Input annotation with part-of-speech tags. Defaults to Annotation(""). + out (Output, optional): Output annotation with sets of part-of-speech tags. + Defaults to Output(":hist.homograph_set"). + delimiter (str): Character to put between ambiguous results. Defaults to Config("hist.delimiter"). + affix (str): Character to put before and after sets of results. Defaults to Config("hist.affix"). 
+ """ + def makeset(_, thepos): + """Annotate thepos with separators (dummy function).""" + return [thepos] + + _annotate_standard(out, pos, makeset, delimiter=delimiter, affix=affix, split=False) + + +@annotator("Get spelling variants from spelling model for Old Swedish", language=["swe-fsv"], + preloader=saldo.preloader, preloader_params=["model"], preloader_target="model_preloaded") +def spelling_variants(word: Annotation = Annotation(""), + out: Output = Output(":hist.spelling_variants", description="token spelling variants"), + spellingmodel: Model = Model("[hist.fsv_spelling]"), + # model: Model = Model("[hist.fsv_model]"), + delimiter: str = Config("hist.delimiter"), + affix: str = Config("hist.affix"), + # model_preloaded: Optional[dict] = None + ): """Use a lexicon model and a spelling model to annotate words with their spelling variants. - - word is existing annotations for wordforms - - out is a string containing the resulting annotation file - - spellmodel is the spelling model - - model is the lexicon model - - delimiter is the delimiter character to put between ambiguous results - - affix is an optional character to put before and after results + Args: + word (Annotation, optional): Input annotation with token strings. Defaults to Annotation(""). + out (Output, optional): Output annotation with spelling variations. Defaults to Output(""). + spellingmodel (Model): The spelling model. Defaults to Model("[hist.fsv_spelling]") + model (Model): The lexicon model. Defaults to Model("[hist.fsv_model]") + delimiter (str): Character to put between ambiguous results. Defaults to Config("hist.delimiter"). + affix (str): Character to put before and after sets of results. Defaults to Config("hist.affix"). + model_preloaded (dict, optional): Preloaded morphology model. Defaults to None. """ - # model -> {word : [(variant, dist)]} + # # Load model + # model_name = model.path.stem + # if not model_preloaded: + # lexicon = (model_name, saldo.SaldoLexicon(model.path)) + # # Use pre-loaded lexicon + # else: + # assert model_preloaded.get(model_name, None) is not None, "Lexicon %s not found!" % model_name + # lexicon = (model_name, model_preloaded[model_name]) + def parsevariant(modelfile): + # spellingmodel -> {word : [(variant, dist)]} d = {} def addword(res, word, info): @@ -35,472 +238,100 @@ def addword(res, word, info): addword(d, wd, info) return d - if model is None: - lexicon = saldo.SaldoLexicon(model) - - variations = parsevariant(spellmodel) + variations = parsevariant(spellingmodel.path) - def findvariants(tokid, theword): + def findvariants(_, theword): variants = [x_d for x_d in variations.get(theword.lower(), []) if x_d[0] != theword] - # return set(_concat([get_single_annotation(lexicon, v, "lemgram") for v, d in variants])) - return set([v for v, d in variants]) + return sorted(list(set([v for v, d in variants]))) + # variants_lists = [_get_single_annotation([lexicon], v, "lem", "") for v, _d in variants] + # return set([y for x in variants_lists for y in x]) - annotate_standard(out, word, findvariants, split=False) + _annotate_standard(out, word, findvariants, delimiter=delimiter, affix=affix, split=False) -def extract_pos(out, lemgrams, extralemgrams="", delimiter="|", affix="|"): - """Annotate each lemgram with pos-tags, extracted from this. 
- - - out is the resulting annotation file - - lemgrams is the existing annotations for lemgram - - extralemgrams is an optional extra annotation from which more pos-tags can be extracted - - delimiter is the delimiter character to put between ambiguous results - - affix is an optional character to put before and after results - """ - def oktag(tag): - return tag is not None and tag.group(1) not in ["e", "sxc", "mxc"] - - def mkpos(tokid, thelems): - pos = [re.search(r"\.\.(.*?)\.", lem) for lem in thelems] - # The function lag18002pos has been moved into the corpus (SVN)! - return set(sum([util.tagsets.lag18002pos(p.group(1)) for p in pos if oktag(p)], [])) - - annotate_standard(out, lemgrams, mkpos, extralemgrams) - - -def annotate_fallback(out, word, msd, lemgram, models, key="lemgram", lexicons=None): - """Annotate the words that do not already have a lemgram, according to model(s). - - - out is the resulting annotation file - - word is the words to be annotated - - lemgram is the existing annotations for lemgram - - model is the crosslink model - """ - # catalaunch stuff - if lexicons is None: - models = models.split() - lexicons = [saldo.SaldoLexicon(lex) for lex in models] - - WORD = util.read_annotation(word) - MSD = util.read_annotation(msd) - - def annotate_empties(tokid, lemgrams): - fallbacks = [] - if not lemgrams: - word = WORD[tokid] - msdtag = MSD[tokid] - fallbacks.extend(get_single_annotation(lexicons, word, key, msdtag)) - - return fallbacks - - annotate_standard(out, lemgram, annotate_empties) - - -def annotate_diachron(out, lemgram, model, extralemgrams="", delimiter="|", affix="|"): - """Annotate each lemgram with its corresponding saldo_id, according to model (diapivot.pickle). - - - out is the resulting annotation file - - lemgram is the existing annotations for lemgram - - model is the diapivot model - - delimiter is the delimiter character to put between ambiguous results - - affix is an optional character to put before and after results - """ - lexicon = diapivot.PivotLexicon(model) - - def diachronlink(tokid, thelems): - all_lemgrams = thelems - for lemgram in thelems: - s_i = lexicon.get_exactMatch(lemgram) - if s_i: - all_lemgrams += [s_i] - return all_lemgrams - - annotate_standard(out, lemgram, diachronlink, extralemgrams) - - -def mergemany(out, annotations, separator="|"): - """Concatenate values from two or more annotations, with an optional separator. - - Remove superfluous separators. 
- """ - # annotations = [util.read_annotation(a) for a in annotations] - d = {} - OUT = {} +@annotator("Merge token and spelling variants into one annotation", language=["swe-fsv"]) +def all_spelling_variants( + out: Output = Output(":hist.all_spelling_variants", description="Original token and spelling variants"), + word: Annotation = Annotation(""), + variants: Annotation = Annotation(":hist.spelling_variants")): + """Merge token and spelling variants into one annotation.""" + from sparv.modules.misc import misc + misc.merge_to_set(out, left=word, right=variants, unique=False, sort=False) - if isinstance(annotations, str): - annotations = annotations.split() - for annotation in [util.read_annotation(a) for a in annotations]: - for key_a, val_a in list(annotation.items()): - if val_a: - d.setdefault(key_a, []).append(val_a) - for key, lst in list(d.items()): - OUT[key] = separator + separator.join(lst) + separator if lst else separator +################################################################################ +# Auxiliaries +################################################################################ - util.write_annotation(out, OUT) - - -def merge(out, left, right, separator=""): - """Concatenate values from two annotations, with an optional separator. - - Remove superfluous separators. - """ - b = util.read_annotation(right) - OUT = {} - - for key_a, val_a in util.read_annotation_iteritems(left): - val = [x for x in [val_a, b[key_a]] if x != separator] - OUT[key_a] = separator.join(list(val)) if val else separator - - util.write_annotation(out, OUT) - - -def posset(out, pos, separator="|"): - """Annotate with POS sets.""" - def makeset(tokid, thepos): - """Annotate thepos with separators (dummy function).""" - return [thepos] - - annotate_standard(out, pos, makeset, split=False) - - -def annotate_standard(out, input_annotation, annotator, extra_input="", delimiter="|", affix="|", split=True): +def _annotate_standard(out, input_annotation, annotator, extra_input="", delimiter: str = util.constants.DELIM, + affix: str = util.constants.AFFIX, split=True): """Apply the 'annotator' function to the annotations in 'input_annotation' and write the new output to 'out'. - The annotator function should have type :: token_id -> oldannotations -> newannotations - No support for multiword expressions - - out is the output file - - input_annotation is the given input annotation - - f is the function which is to be applied to the input annotation - - extra_input is an extra input annotation - - delimiter is the delimiter character to put between ambiguous results - - affix is an optional character to put before and after results - - split defines if the input annatoation is a set, with elements separated by delimiter - if so, return a list. Else, return one single element + Args: + out: The output annotation. + input_annotation: The input annotation. + annotator: function which is to be applied to the input annotation. + It should have type :: oldannotations -> newannotations + extra_input (str, optional): An additional input annotation. Defaults to "". + delimiter (str, optional): Delimiter character to put between ambiguous results. Defaults to + util.constants.DELIM. + affix (str, optional): Character to put before and after results. Defaults to util.constants.AFFIX. + split (bool, optional): Defines whether the input annatoation is a set, with elements separated by delimiter. + If so, return a list. Else, return one single element. Defaults to True. 
""" - def merge(d1, d2): - result = dict(d1) - for k, v in list(d2.items()): - if k in result: - result[k] = result[k] + delimiter + v - else: - result[k] = v - return result - - LEMS = util.read_annotation(input_annotation) + # Join input_annotation and extra_input with delimiter + annotations = input_annotation.read() if extra_input: - LEMS = merge(LEMS, util.read_annotation(extra_input)) - - clear_annotation(out) - OUT = {} + annotations = [delimiter.join([x, y]) for x, y in zip(annotations, extra_input.read())] - for tokid in LEMS: - thelems = LEMS[tokid] + out_annotation = [] + for token_index, annot in enumerate(annotations): if split: - thelems = [x for x in thelems.split(delimiter) if x != ""] - - output_annotation = set(annotator(tokid, thelems)) - OUT[tokid] = affix + delimiter.join(list(output_annotation)) + affix if output_annotation else affix - - util.write_annotation(out, OUT) - - -def annotate_full(word, sentence, reference, out, annotations, models, msd="", - delimiter="|", affix="|", precision=":%.3f", precision_filter=None, min_precision=0.0, - skip_multiword=False, lexicons=None): - # TODO almost the same as normal saldo.annotate, but doesn't use msd or saldo-specific stuff - """Use a lmf-lexicon model to annotate (pos-tagged) words. - - - word, msd are existing annotations for wordforms and part-of-speech - - sentence is an existing annotation for sentences and their children (words) - - reference is an existing annotation for word references, to be used when - annotating multi-word units - - out is a string containing a whitespace separated list of the resulting annotation files - - annotations is a string containing a whitespace separate list of annotations to be written. - Currently: gf (= baseform), lem (=lemgram) - Number of annotations and their order must correspond to the list in the 'out' argument. - - model is the Saldo model - - delimiter is the delimiter character to put between ambiguous results - - affix is an optional character to put before and after results - - precision is a format string for how to print the precision for each annotation - (use empty string for no precision) - - precision_filter is an optional filter, currently there are the following values: - max: only use the annotations that are most probable - first: only use the most probable annotation (or one of the most probable if more than one) - - min_precision: only use annotations with a probability score higher than this - - skip_multiword can be set to True to disable multi word annotations - - lexicon: this argument cannot be set from the command line, - but is used in the catapult. This argument must be last. - """ - # allow use of multiple lexicons - if not lexicons: - models = [(os.path.basename(m).rstrip(".pickle"), m) for m in models.split()] - lexicons = [(name, saldo.SaldoLexicon(lex)) for name, lex in models] - - max_gaps = 0 # Maximum number of gaps in multi-word units. - # Set to 0 since many (most?) 
multi-word in the old lexicons are unseparable (half öre etc) - - annotations = annotations.split() - out = out.split() - assert len(out) == len(annotations), "Number of target files and annotations must be the same" - - if isinstance(skip_multiword, str): - skip_multiword = (skip_multiword.lower() == "true") - if skip_multiword: - util.log.info("Skipping multi word annotations") - - min_precision = float(min_precision) - - WORD = util.read_annotation(word) - REF = util.read_annotation(reference) - if msd: - MSD = util.read_annotation(msd) - for out_file in out: - clear_annotation(out_file) - - sentences = [sent.split() for _, sent in util.read_annotation_iteritems(sentence)] - OUT = {} - - for sent in sentences: - incomplete_multis = [] # [{annotation, words, [ref], is_particle, lastwordWasGap, numberofgaps}] - complete_multis = [] # ([ref], annotation) - sentence_tokens = {} + annot = [x for x in annot.split(delimiter) if x != ""] - for tokid in sent: - thewords = [w for w in WORD[tokid].split("|") if w] - ref = REF[tokid] - if msd: - msdtag = MSD[tokid] - else: - msdtag = "" + # Pass annot to annotator and convert into cwbset + annots = list(dict.fromkeys(annotator(token_index, annot))) + out_annotation.append(util.misc.cwbset(annots, delimiter=delimiter, affix=affix)) - annotation_info = {} - sentence_tokens[ref] = {"tokid": tokid, "word": thewords, "msd": msdtag, "annotations": annotation_info} + out.write(out_annotation) - for theword in thewords: - # First use MSD tags to find the most probable single word annotations - ann_tags_words = saldo.find_single_word([theword], lexicons, msdtag, precision, min_precision, precision_filter, annotation_info) +def _annotate_fallback(out, word, msd, main_annotation, key, models, delimiter, affix, models_preloaded): + """Lookup 'key' in models for words that are lacking 'main_annotation'.""" + # Allow use of multiple lexicons + models_list = [(m.path.stem, m) for m in models] + if not models_preloaded: + lexicon_list = [(name, saldo.SaldoLexicon(lex.path)) for name, lex in models_list] + # Use pre-loaded lexicons + else: + lexicon_list = [] + for name, _lex in models_list: + assert models_preloaded.get(name, None) is not None, "Lexicon %s not found!" 
% name + lexicon_list.append((name, models_preloaded[name])) - # Find multi-word expressions - if not skip_multiword: - find_multiword_expressions(incomplete_multis, complete_multis, theword, ref, max_gaps, ann_tags_words) + word_annotation = list(word.read()) + msd_annotation = list(msd.read()) - # Loop to next token - - # Check that we don't have any unwanted overlaps - remove_unwanted_overlaps(complete_multis) - - # Then save the rest of the multi word expressions in sentence_tokens - saldo.save_multiwords(complete_multis, sentence_tokens) - - for token in list(sentence_tokens.values()): - OUT[token["tokid"]] = saldo._join_annotation(token["annotations"], delimiter, affix) - - # Loop to next sentence - - for out_file, annotation in zip(out, annotations): - util.write_annotation(out_file, [(tok, OUT[tok].get(annotation, affix)) for tok in OUT], append=True) - - -def find_multiword_expressions(incomplete_multis, complete_multis, theword, ref, max_gaps, ann_tags_words): - todelfromincomplete = [] # list to keep track of which expressions that have been completed - - for i, x in enumerate(incomplete_multis): - seeking_word = x["words"][0] # The next word we are looking for in this multi-word expression - - # TODO "*" only in saldo - if seeking_word == "*": - if x["words"][1].lower() == theword.lower(): - seeking_word = x["words"][1] - del x["words"][0] - - if x["numberofgaps"] > max_gaps: - todelfromincomplete.append(i) - - elif seeking_word.lower() == theword.lower(): - x["lastwordwasgap"] = False - del x["words"][0] - x["ref"].append(ref) - - # Is current word the last word we are looking for? - if len(x["words"]) == 0: - todelfromincomplete.append(i) - complete_multis.append((x["ref"], x["annotation"])) - else: - # Increment gap counter if previous word was not part of a gap - if not x["lastwordwasgap"]: - x["numberofgaps"] += 1 - x["lastwordwasgap"] = True # Marking that previous word was part of a gap - - # Remove found word from incompletes-list - for x in todelfromincomplete[::-1]: - del incomplete_multis[x] + def annotate_empties(token_index, annotation): + fallbacks = [] + if not annotation: + word = word_annotation[token_index] + msdtag = msd_annotation[token_index] + fallbacks.extend(_get_single_annotation(lexicon_list, word, key, msdtag)) + return fallbacks - # Is this word a possible start for multi-word units? - looking_for = [{"annotation": annotation, "words": words, "ref": [ref], - "is_particle": is_particle, "lastwordwasgap": False, "numberofgaps": 0} - for (annotation, _, wordslist, _, is_particle, _) in ann_tags_words if wordslist for words in wordslist] - if len(looking_for) > 0: - incomplete_multis.extend(looking_for) + _annotate_standard(out, main_annotation, annotate_empties, delimiter=delimiter, affix=affix) -def get_single_annotation(lexicons, word, key, msdtag): +def _get_single_annotation(lexicons, word, key, msdtag): + """Get 'key' from lexicon(s) for each token.""" annotation = [] - # TODO the translation of tags is not fully working yet. - # the precision must be set to 0.25 in order for the lemgrams to be kept. - - for lexicon in lexicons: - res = [(saldo.get_precision(msdtag, msdtags), ann) for (ann, msdtags, wordslist, _, _) in lexicon.lookup(word) if not wordslist] - res = [a for x, a in sorted(res, reverse=True) if x >= 0.25] # TODO use saldo.py for this!!! 
- if res: - annotation = res + for _, lexicon in lexicons: + # Get precision and 'key' annotation + annotation = [(saldo.get_precision(msdtag, msdtags), ann) for (ann, msdtags, wordslist, _, _) in lexicon.lookup(word) + if not wordslist] + if annotation: break - return _concat(a.get(key) for a in annotation) - - -def remove_unwanted_overlaps(complete_multis): - remove = set() - for ai, a in enumerate(complete_multis): - for b in complete_multis: - if re.search(r"(.*)--.*", a[1]["lemgram"][0]).groups()[0] != re.search(r"(.*)--.*", b[1]["lemgram"][0]).groups()[0]: - # Both are from the same lexicon - remove.add(ai) - elif len(set(a[0])) != len(a[0]): - # Since we allow many words for one token (when using spelling variation) - # we must make sure that two words of a mwe are not made up by two variants of one token - # that is, that the same reference-id is not used twice in a mwe - remove.add(ai) - elif re.search(r"\.\.(\w+)\.", a[1]["lemgram"][0]).groups()[0] == re.search(r"\.\.(\w+)\.", b[1]["lemgram"][0]).groups()[0]: - # Both are of same POS - if b[0][0] < a[0][0] and b[0][-1] > a[0][0] and b[0][-1] < a[0][-1]: - # A case of x1 y1 x2 y2. Remove y. - remove.add(ai) - elif a[0][0] < b[0][0] and b[0][-1] == a[0][-1]: - # A case of x1 y1 xy2. Remove x. - remove.add(ai) - - for a in sorted(remove, reverse=True): - del complete_multis[a] - - -def annotate_mwe(variants, word, reference, sentence, out, annotations, models, delimiter="|", affix="|", precision_filter=":%.3f", filter=None, lexicons=None): - """Annotate multi words only.""" - max_gaps = 0 # Maximum number of gaps in multi-word units. - - annotations = annotations.split() - out = out.split() - assert len(out) == len(annotations), "Number of target files and annotations must be the same" - - # we allow multiple lexicons, each word will get annotations from only one of the lexicons, starting the lookup in the first lexicon in the list - if lexicons is None: - models = models.split() - lexicons = [saldo.SaldoLexicon(lex) for lex in models] - WORD = util.read_annotation(variants) - REALWORD = util.read_annotation(word) - REF = util.read_annotation(reference) - - for out_file in out: - clear_annotation(out_file) - - sentences = [sent.split() for _, sent in util.read_annotation_iteritems(sentence)] - OUT = {} - - for sent in sentences: - incomplete_multis = [] # :: [{annotation, words, [ref], is_particle, lastwordWasGap, numberofgaps}] - complete_multis = [] # :: ([ref], annotation, [text]) - sentence_tokens = {} - - for tokid in sent: - thewords = [w for w in WORD[tokid].split("|") if w] - ref = REF[tokid] - word = REALWORD[tokid] - - annotation_info = {} - sentence_tokens[ref] = {"tokid": tokid, "word": word, "variant": thewords, "annotations": annotation_info} - - endword = len(thewords) - 1 - for i, theword in enumerate(thewords): - - ann_tags_words = saldo.find_single_word([theword], lexicons, "", annotation_info) # emtpy msd tag - # For multi-word expressions - find_multiword_expressions(incomplete_multis, complete_multis, theword, word, ref, max_gaps, ann_tags_words, i == endword) - - # Loop to next token - - # Check that we don't have any unwanted overlaps - remove_unwanted_overlaps(complete_multis) - - # Then save the rest of the multi word expressions in sentence_tokens - saldo.save_multiwords(complete_multis, sentence_tokens) - - for token in list(sentence_tokens.values()): - OUT[token["tokid"]] = saldo._join_annotation(token["annotations"], delimiter, affix) - - # Loop to next sentence - - for out_file, annotation in zip(out, 
annotations): - print("adding", [(tok, OUT[tok].get(annotation, affix)) for tok in OUT]) - util.write_annotation(out_file, [(tok, OUT[tok].get(annotation, affix)) for tok in OUT], append=True) - - -def find_multiword_expressions(incomplete_multis, complete_multis, theword, textword, ref, max_gaps, ann_tags_words, increase): - # use normal findvariant instead, only textword is different, but not used anyway - todelfromincomplete = [] # list to keep track of which expressions that have been completed - - for i, x in enumerate(incomplete_multis): - seeking_word = x["words"][0] # The next word we are looking for in this multi-word expression - - if x["numberofgaps"] > max_gaps: - todelfromincomplete.append(i) - - elif seeking_word.lower() == theword.lower(): - x["lastwordwasgap"] = False - del x["words"][0] - x["ref"].append(ref) - x["text"].append(textword) - - # Is current word the last word we are looking for? - if len(x["words"]) == 0: - todelfromincomplete.append(i) - complete_multis.append((x["ref"], x["annotation"], x["text"])) - elif increase and ref != x["ref"][-1]: - # Increment gap counter if previous word was not part of a gap - if not x["lastwordwasgap"]: - x["numberofgaps"] += 1 - x["lastwordwasgap"] = True # Marking that previous word was part of a gap - - # Remove found word from incompletes-list - for x in todelfromincomplete[::-1]: - del incomplete_multis[x] - - # Is this word a possible start for multi-word units? - looking_for = [{"annotation": annotation, "words": words, "ref": [ref], "text": [textword], - "is_particle": is_particle, "lastwordwasgap": False, "numberofgaps": 0} - for (annotation, _, wordslist, _, is_particle) in ann_tags_words if wordslist for words in wordslist] - if len(looking_for) > 0: - incomplete_multis.extend(looking_for) - - -def _concat(xs): - return sum(xs, []) - - -def clear_annotation(doc, annotation): - """Remove an annotation file if it exists.""" - annotation_path = util.get_annotation_path(doc, annotation) - if os.path.exists(annotation_path): - os.remove(annotation_path) - - -if __name__ == "__main__": - util.run.main(annotate_variants=annotate_variants, - extract_pos=extract_pos, - merge=merge, - mergemany=mergemany, - posset=posset, - annotate_full=annotate_full, - annotate_fallback=annotate_fallback, - annotate_mwe=annotate_mwe, - annotate_diachron=annotate_diachron - ) + # Sort by precision (descending) and remove precision values + annotation_lists = [a.get(key, []) for _, a in sorted(annotation, reverse=True, key=lambda x: x[0])] + return [y for x in annotation_lists for y in x] diff --git a/sparv/modules/hist/models.py b/sparv/modules/hist/models.py index 57897caf..c8803bc3 100644 --- a/sparv/modules/hist/models.py +++ b/sparv/modules/hist/models.py @@ -1,14 +1,16 @@ """Model builders for older Swedish lexicons.""" -import logging +import re +import xml.etree.ElementTree as etree -from sparv import Model, ModelOutput, modelbuilder -from sparv.util.lmflexicon import lmf_to_pickle +from sparv.api import Model, ModelOutput, get_logger, modelbuilder, util +from sparv.api.util.tagsets import tagmappings +from sparv.modules.saldo.saldo_model import HashableDict, SaldoLexicon -log = logging.getLogger(__name__) +logger = get_logger(__name__) -# @modelbuilder("Dalin morphology model", language=["swe"]) +@modelbuilder("Dalin morphology model", language=["swe-1800"]) def build_dalin(out: ModelOutput = ModelOutput("hist/dalin.pickle")): """Download Dalin morphology XML and save as a pickle file.""" # Download dalinm.xml @@ -22,10 +24,10 @@ def 
build_dalin(out: ModelOutput = ModelOutput("hist/dalin.pickle")): xml_model.remove() -# @modelbuilder("Swedberg morphology model", language=["swe"]) +@modelbuilder("Swedberg morphology model", language=["swe-1800"]) def build_swedberg(out: ModelOutput = ModelOutput("hist/swedberg.pickle")): """Download Swedberg morphology XML and save as a pickle file.""" - # Download diapivot.xml + # Download swedbergm.xml xml_model = Model("hist/swedbergm.xml") xml_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swedbergm/swedbergm.xml") @@ -34,3 +36,203 @@ def build_swedberg(out: ModelOutput = ModelOutput("hist/swedberg.pickle")): # Clean up xml_model.remove() + + +@modelbuilder("Morphology model for Old Swedish", language=["swe-fsv"]) +def build_fsvm(out: ModelOutput = ModelOutput("hist/fsvm.pickle")): + """Download pickled model for fornsvenska.""" + xml_model = Model("hist/fsvm.xml") + xml_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/fsvm/fsvm.xml") + + # Create pickle file + lmf_to_pickle(xml_model.path, out.path, use_fallback=True) + + # Clean up + xml_model.remove() + + +@modelbuilder("Spelling variants list for Old Swedish", language=["swe-fsv"]) +def build_fsv_spelling(out: ModelOutput = ModelOutput("hist/fsv-spelling-variants.txt")): + """Download spelling variants list for fornsvenska.""" + out.download("https://github.com/spraakbanken/sparv-models/raw/master/hist/fsv-spelling-variants.txt") + + +################################################################################ +# LMF CONVERSION +################################################################################ + + +def lmf_to_pickle(xml, filename, annotation_elements=("writtenForm", "lemgram"), skip_multiword=False, + translate_tags=True, use_fallback=False): + """Read an XML dictionary and save as a pickle file.""" + xml_lexicon = read_lmf(xml, annotation_elements=annotation_elements, skip_multiword=skip_multiword, + translate_tags=translate_tags, use_fallback=use_fallback) + SaldoLexicon.save_to_picklefile(filename, xml_lexicon) + + +def read_lmf(xml, annotation_elements=("writtenForm", "lemgram"), verbose=True, skip_multiword=False, + translate_tags=True, use_fallback=False): + """Parse a historical morphological LMF lexicon into the standard SALDO format. + + Does not handle msd-information well. + Does not mark particles. + Does handle multiword expressions with gaps. + + Args: + xml (str): Path to the input XML file. + annotation_elements (tuple, optional): XML element(s) for the annotation value, "writtenForm" for baseform, + "lemgram" for lemgram. "writtenForm" is translated to "gf" and "lemgram" to "lem" + (for compatibility with Saldo). Defaults to ("writtenForm", "lemgram"). + verbose (bool, optional): Whether to turn on verbose mode. Defaults to True. + skip_multiword (bool, optional): Whether to make special entries for multiword expressions. + Set this to False only if the tool used for text annotation cannot handle this at all. Defaults to False. + translate_tags (bool, optional): Whether to translate SALDO tags into SUC tags. Defaults to True. + use_fallback (bool, optional): Whether to fall back on POS tags extracted from the lemgram when tag translation fails. Defaults to False. 
+ + Returns: + A lexicon dict: + {wordform: {{annotation-type: annotation}: (set(possible tags), set(tuples with following words) )}} + """ + if verbose: + logger.info("Reading XML lexicon") + lexicon = {} + + context = etree.iterparse(xml, events=("start", "end")) # "start" needed to save reference to root element + context = iter(context) + event, root = next(context) + + for event, elem in context: + if event == "end": + if elem.tag == "LexicalEntry": + annotations = HashableDict() + + lem = elem.find("Lemma").find("FormRepresentation") + for a in annotation_elements: + if a == "writtenForm": + key = "gf" + elif a == "lemgram": + key = "lem" + annotations[key] = tuple([_findval(lem, a)]) + + pos = _findval(lem, "partOfSpeech") + inhs = _findval(lem, "inherent") + lemgram = _findval(lem, "lemgram") + if inhs == "-": + inhs = "" + inhs = inhs.split() + + # there may be several WordForms + for forms in elem.findall("WordForm"): + word = _findval(forms, "writtenForm") + param = _findval(forms, "msd") + + multiwords = [] + wordparts = word.split() + for i, word in enumerate(wordparts): + if (not skip_multiword) and len(wordparts) > 1: + + # Handle multi-word expressions + multiwords.append(word) + + # We don't use any particles or mwe:s with gaps since that information is not formally + # expressed in the historical lexicons. But keep the fields so that the file format matches + # the saldo-pickle format. + particle = False + mwe_gap = False + + # Is it the last word in the multi word expression? + if i == len(wordparts) - 1: + lexicon.setdefault(multiwords[0], {}).setdefault(annotations, (set(), set(), mwe_gap, particle))[1].add(tuple(multiwords[1:])) + multiwords = [] + else: + # Single word expressions + if translate_tags: + tags = _convert_default(pos, inhs, param) + if not tags and use_fallback: + tags = _pos_from_lemgram(lemgram) + if tags: + lexicon.setdefault(word, {}).setdefault(annotations, (set(), set(), False, False))[0].update(tags) + else: + saldotag = " ".join([pos, param]) # this tag is rather useless, but at least gives some information + tags = tuple([saldotag]) + lexicon.setdefault(word, {}).setdefault(annotations, (set(), set(), False, False))[0].update(tags) + + # Done parsing section. 
Clear tree to save memory + if elem.tag in ["LexicalEntry", "frame", "resFrame"]: + root.clear() + if verbose: + testwords = ["äplebuske", + "stöpljus", + "katt", + "doktor"] + util.misc.test_lexicon(lexicon, testwords) + logger.info(f"OK, read {len(lexicon)} entries") + return lexicon + + +################################################################################ +# Auxiliaries +################################################################################ + +def _findval(elems, key): + """Help function for looking up values in the lmf.""" + def iterfindval(): + for form in elems: + att = form.get("att", "") + if att == key: + yield form.get("val") + yield "" + + return next(iterfindval()) + + +def _convert_default(pos, inhs, param): + """Try to convert SALDO tags into SUC tags.""" + tagmap = tagmappings.mappings["saldo_to_suc"] + saldotag = " ".join(([pos] + inhs + [param])) + tags = tagmap.get(saldotag) + if tags: + return tags + tags = _try_translate(saldotag) + if tags: + tagmap[saldotag] = tags + return tags + tags = tagmap.get(pos) + if tags: + return tags + tags = [] + for t in list(tagmap.keys()): + if t.split()[0] == pos: + tags.extend(tagmap.get(t)) + return tags + + +def _try_translate(params): + """Do some basic translations.""" + params_list = [params] + if " m " in params: + # Masculine is translated into utrum + params_list.append(re.sub(" m ", " u ", params)) + if " f " in params: + # Feminine is translated into utrum + params_list.append(re.sub(" f ", " u ", params)) + for params in params_list: + params = params.split() + # Copied from tagmappings._make_saldo_to_suc(), try to convert the tag + # but allow m (the match) to be None if the tag still can't be translated + paramstr = " ".join(tagmappings.mappings["saldo_params_to_suc"].get(prm, prm.upper()) for prm in params) + for (pre, post) in tagmappings._suc_tag_replacements: + m = re.match(pre, paramstr) + if m: + break + if m is not None: + sucfilter = m.expand(post).replace(" ", r"\.").replace("+", r"\+") + return set(suctag for suctag in tagmappings.tags["suc_tags"] if re.match(sucfilter, suctag)) + return [] + + +def _pos_from_lemgram(lemgram): + """Get SUC POS tag from POS in lemgram.""" + pos = lemgram.split(".")[2] + tagmap = tagmappings.mappings["saldo_pos_to_suc"] + return tagmap.get(pos, []) diff --git a/sparv/modules/hunpos/__init__.py b/sparv/modules/hunpos/__init__.py index 96936894..a3dc85a4 100644 --- a/sparv/modules/hunpos/__init__.py +++ b/sparv/modules/hunpos/__init__.py @@ -1,3 +1,26 @@ """Part of Speech annotation using Hunpos.""" +from sparv.api import Config, util + from . 
import hunpos, morphtable, morphtable_hist + +__config__ = [ + Config("hunpos.binary", default="hunpos-tag", description="Hunpos executable"), + Config("hunpos.encoding", default=util.constants.UTF8, description="Encoding of the input text"), + + # Config for modern Swedish + Config("hunpos.model", default="hunpos/suc3_suc-tags_default-setting_utf8.model", + description="Path to Hunpos model"), + Config("hunpos.morphtable", default="hunpos/saldo_suc-tags.morphtable", + description="Path to optional Hunpos morphtable file"), + Config("hunpos.patterns", default="hunpos/suc.patterns", description="Path to optional patterns file"), + Config("hunpos.tag_mapping", default=None, description="Optional tag mapping for translating the output tags"), + + # Config for swe-1800 + Config("hunpos.model_hist", default="hunpos/suc3_suc-tags_default-setting_utf8.model", + description="Path to Hunpos model (older Swedish)"), + Config("hunpos.morphtable_hist", default="hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable", + description="Path to optional Hunpos morphtable file (older Swedish)"), + Config("hunpos.tag_mapping_hist", default=None, + description="Optional tag mapping for translating the output tags (older Swedish)") +] diff --git a/sparv/modules/hunpos/hunpos.py b/sparv/modules/hunpos/hunpos.py index b3a56657..d26c39ab 100644 --- a/sparv/modules/hunpos/hunpos.py +++ b/sparv/modules/hunpos/hunpos.py @@ -3,8 +3,11 @@ import re from typing import Optional -import sparv.util as util -from sparv import Annotation, Binary, Config, Model, ModelOutput, Output, annotator, modelbuilder +from sparv.api import (Annotation, Binary, Config, Model, ModelOutput, Output, SparvErrorMessage, annotator, get_logger, + modelbuilder, util) +from sparv.api.util.tagsets import tagmappings + +logger = get_logger(__name__) SENT_SEP = "\n\n" TOK_SEP = "\n" @@ -12,14 +15,7 @@ TAG_COLUMN = 1 -@annotator("Part-of-speech annotation with morphological descriptions", language=["swe"], config=[ - Config("hunpos.binary", default="hunpos-tag", description="Hunpos executable"), - Config("hunpos.model", default="hunpos/suc3_suc-tags_default-setting_utf8.model", - description="Path to Hunpos model"), - Config("hunpos.morphtable", default="hunpos/saldo_suc-tags.morphtable", - description="Path to optional Hunpos morphtable file"), - Config("hunpos.patterns", default="hunpos/suc.patterns", description="Path to optional patterns file") - ]) +@annotator("Part-of-speech annotation with morphological descriptions", language=["swe"]) def msdtag(out: Output = Output(":hunpos.msd", cls="token:msd", description="Part-of-speeches with morphological descriptions"), word: Annotation = Annotation(""), @@ -28,11 +24,33 @@ def msdtag(out: Output = Output(":hunpos.msd", cls="token:msd", model: Model = Model("[hunpos.model]"), morphtable: Optional[Model] = Model("[hunpos.morphtable]"), patterns: Optional[Model] = Model("[hunpos.patterns]"), - tag_mapping=None, - encoding: str = util.UTF8): + tag_mapping: Optional[str] = Config("hunpos.tag_mapping"), + encoding: str = Config("hunpos.encoding")): + """POS/MSD tag modern Swedish texts using the Hunpos tagger.""" + main(out, word, sentence, binary, model, morphtable=morphtable, patterns=patterns, tag_mapping=tag_mapping, + encoding=encoding) + + +@annotator("Part-of-speech annotation with morphological descriptions for older Swedish", language=["swe-1800"]) +def msdtag_hist(out: Output = Output(":hunpos.msd_hist", cls="token:msd", + description="Part-of-speeches with morphological descriptions"), + word: 
Annotation = Annotation(""), + sentence: Annotation = Annotation(""), + binary: Binary = Binary("[hunpos.binary]"), + model: Model = Model("[hunpos.model_hist]"), + morphtable: Optional[Model] = Model("[hunpos.morphtable_hist]"), + tag_mapping: Optional[str] = Config("hunpos.tag_mapping_hist"), + encoding: str = Config("hunpos.encoding")): + """POS/MSD tag older Swedish texts using the Hunpos tagger.""" + main(out, word, sentence, binary, model, morphtable=morphtable, patterns=None, tag_mapping=tag_mapping, + encoding=encoding) + + +def main(out, word, sentence, binary, model, morphtable=None, patterns=None, tag_mapping=None, + encoding=util.constants.UTF8): """POS/MSD tag using the Hunpos tagger.""" if isinstance(tag_mapping, str) and tag_mapping: - tag_mapping = util.tagsets.mappings[tag_mapping] + tag_mapping = tagmappings.mappings[tag_mapping] elif tag_mapping is None or tag_mapping == "": tag_mapping = {} @@ -71,16 +89,41 @@ def replace_word(w): out.write(out_annotation) -@annotator("Extract POS from MSD", language=["swe"]) +@annotator("Extract POS from MSD", language=["swe", "swe-1800"]) def postag(out: Output = Output(":hunpos.pos", cls="token:pos", description="Part-of-speech tags"), - msd: Annotation = Annotation(":hunpos.msd")): + msd: Annotation = Annotation("")): """Extract POS from MSD.""" from sparv.modules.misc import misc misc.select(out, msd, index=0, separator=".") @modelbuilder("Hunpos model", language=["swe"]) -def hunpos_model(model: ModelOutput = ModelOutput("hunpos/suc3_suc-tags_default-setting_utf8.model")): +def hunpos_model(model: ModelOutput = ModelOutput("hunpos/suc3_suc-tags_default-setting_utf8.model"), + binary: Binary = Binary("[hunpos.binary]")): """Download the Hunpos model.""" - model.download( + from sys import platform + + def test_hunpos(model): + stdin = TOK_SEP.join(["jag", "och", "du"]) + SENT_SEP + stdout, _ = util.system.call_binary(binary, [model.path], stdin, encoding="UTF-8") + logger.debug("Output from 'hunpos-tag' with test input:\n%s", stdout) + if stdout.split() != "jag PN.UTR.SIN.DEF.SUB och KN du PN.UTR.SIN.DEF.SUB".split(): + raise SparvErrorMessage("Hunpos model does not work correctly.") + + # Run "hunpos-tag -h" to check what version was installed + stdout, _ = util.system.call_binary(binary, ["-h"], allow_error=True) + logger.debug("Output from 'hunpos-tag -h': %s", stdout) + # Search for keyword "--verbose" in help message + if "--verbose" in stdout.decode(): + model.download( + "https://github.com/spraakbanken/sparv-models/raw/master/hunpos/suc3_suc-tags_default-setting_utf8-mivoq.model") + else: + model.download( "https://github.com/spraakbanken/sparv-models/raw/master/hunpos/suc3_suc-tags_default-setting_utf8.model") + + try: + logger.info("Testing Hunpos model") + test_hunpos(model) + except (RuntimeError, OSError): + model.remove() + raise SparvErrorMessage("Hunpos does not seem to be working on your system with any of the available models.") diff --git a/sparv/modules/hunpos/morphtable.py b/sparv/modules/hunpos/morphtable.py index 6fe17fdd..fa873c40 100644 --- a/sparv/modules/hunpos/morphtable.py +++ b/sparv/modules/hunpos/morphtable.py @@ -2,7 +2,7 @@ from collections import defaultdict -from sparv import Model, ModelOutput, modelbuilder +from sparv.api import Model, ModelOutput, modelbuilder from sparv.modules.saldo import saldo diff --git a/sparv/modules/hunpos/morphtable_hist.py b/sparv/modules/hunpos/morphtable_hist.py index 73f3c82a..a1d2ccd4 100644 --- a/sparv/modules/hunpos/morphtable_hist.py +++ 
b/sparv/modules/hunpos/morphtable_hist.py @@ -2,16 +2,16 @@ import re -import sparv.util as util -from sparv import Model, ModelOutput, modelbuilder +from sparv.api import Model, ModelOutput, modelbuilder +from sparv.api.util.tagsets import tagmappings # Constants -SALDO_TO_SUC = util.tagsets.mappings["saldo_to_suc"] +SALDO_TO_SUC = tagmappings.mappings["saldo_to_suc"] SALDO_TO_SUC["pm"] = {"PM.NOM"} SALDO_TO_SUC["nl invar"] = {"NL.NOM"} -@modelbuilder("Hunpos morphtable for Swedish historical resources", language=["swe"]) +@modelbuilder("Hunpos morphtable for Swedish historical resources", language=["swe-1800"]) def hist_morphtable(out: ModelOutput = ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable"), swedberg: Model = Model("hunpos/hist/swedberg-gender.hunpos"), dalin: Model = Model("hunpos/hist/dalinm.hunpos"), @@ -95,26 +95,26 @@ def _force_parse(msd): SALDO_TO_SUC[msd] = new_suc return new_suc - paramstr = " ".join(util.tagsets.mappings["saldo_params_to_suc"].get(prm, prm.upper()) for prm in params) - for (pre, post) in util.tagsets.tagmappings._suc_tag_replacements: + paramstr = " ".join(tagmappings.mappings["saldo_params_to_suc"].get(prm, prm.upper()) for prm in params) + for (pre, post) in tagmappings._suc_tag_replacements: m = re.match(pre, paramstr) if m: break if m is None: return set() sucfilter = m.expand(post).replace(" ", r"\.").replace("+", r"\+") - new_suc = set(suctag for suctag in util.tagsets.tags["suc_tags"] if re.match(sucfilter, suctag)) + new_suc = set(suctag for suctag in tagmappings.tags["suc_tags"] if re.match(sucfilter, suctag)) SALDO_TO_SUC[msd] = new_suc return new_suc -@modelbuilder("Swedberg wordlist", language=["swe"]) +@modelbuilder("Swedberg wordlist", language=["swe-1800"]) def download_swedberg_wordlist(out: ModelOutput = ModelOutput("hunpos/hist/swedberg-gender.hunpos")): """Download Swedberg wordlist.""" out.download("https://github.com/spraakbanken/sparv-models/raw/master/hunpos/hist/swedberg-gender.hunpos") -@modelbuilder("Dalin wordlist", language=["swe"]) +@modelbuilder("Dalin wordlist", language=["swe-1800"]) def download_dalin_wordlist(out: ModelOutput = ModelOutput("hunpos/hist/dalinm.hunpos")): """Download Dalin wordlist.""" out.download("https://github.com/spraakbanken/sparv-models/raw/master/hunpos/hist/dalinm.hunpos") diff --git a/sparv/modules/korp/__init__.py b/sparv/modules/korp/__init__.py index 84ca97e2..b1b2a5be 100644 --- a/sparv/modules/korp/__init__.py +++ b/sparv/modules/korp/__init__.py @@ -1,10 +1,11 @@ """Korp-related annotators, exporters and installers.""" -from sparv import Config -from . import install_corpus, lemgram_index, relations, timespan +from sparv.api import Config +from . import lemgram_index, relations, timespan __config__ = [ - Config("korp.remote_host", description="Remote host to install to"), + Config("korp.remote_host", description="Remote host to install to. 
Leave blank to install locally."), Config("korp.mysql_dbname", description="Name of database where Korp data will be stored"), - Config("korp.mode", default="modern", description="The Korp mode in which the corpus will be published") + Config("korp.modes", default=["default"], description="The Korp modes in which the corpus will be published"), + Config("korp.protected", False, description="Whether this corpus should have limited access or not") ] diff --git a/sparv/modules/korp/install_corpus.py b/sparv/modules/korp/install_corpus.py deleted file mode 100644 index b621fca1..00000000 --- a/sparv/modules/korp/install_corpus.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Module for installing Korp-related corpus files on remote host.""" - -import logging -import os -import re - -import sparv.util as util -from sparv import Config, Corpus, ExportInput, OutputCommonData, installer - -log = logging.getLogger(__name__) - - -@installer("Install CWB datafiles on remote host", config=[ - Config("korp.remote_cwb_registry", "", description="CWB registry path on remote host"), - Config("korp.remote_cwb_datadir", "", description="CWB datadir path on remote host"), - Config("korp.protected", False, description="Whether this corpus should have limited access or not") -]) -def install_corpus(corpus: Corpus = Corpus(), - info_file: ExportInput = ExportInput("[cwb.cwb_datadir]/[metadata.id]/.info", absolute_path=True), - cwb_file: ExportInput = ExportInput("[cwb.corpus_registry]/[metadata.id]", absolute_path=True), - out: OutputCommonData = OutputCommonData("korp.install_corpus_marker"), - host: str = Config("korp.remote_host"), - datadir: str = Config("cwb.cwb_datadir"), - registry: str = Config("cwb.corpus_registry"), - target_datadir: str = Config("korp.remote_cwb_datadir"), - target_registry: str = Config("korp.remote_cwb_registry")): - """Install CWB datafiles on server, by rsyncing datadir and registry.""" - if not corpus: - raise util.SparvErrorMessage("Missing corpus name. Corpus not installed.") - - if not host: - raise util.SparvErrorMessage("No host provided! Corpus not installed.") - - if not target_datadir: - raise util.SparvErrorMessage("Configuration variable korp.remote_cwb_datadir not set! Corpus not installed.") - - if not target_registry: - raise util.SparvErrorMessage("Configuration variable korp.remote_cwb_registry not set! 
Corpus not installed.") - - target = os.path.join(target_datadir, corpus) - util.system.rsync(os.path.join(datadir, corpus), host, target) - - target_registry_file = os.path.join(target_registry, corpus) - source_registry_file = os.path.join(registry, corpus + ".tmp") - - # Fix absolute paths in registry file - with open(os.path.join(registry, corpus)) as registry_in: - with open(source_registry_file, "w") as registry_out: - for line in registry_in: - if line.startswith("HOME"): - line = re.sub(r"HOME .*(/.+)", r"HOME " + target_datadir + r"\1", line) - elif line.startswith("INFO"): - line = re.sub(r"INFO .*(/.+)/\.info", r"INFO " + target_datadir + r"\1/.info", line) - - registry_out.write(line) - - util.system.rsync(source_registry_file, host, target_registry_file) - os.remove(source_registry_file) - - # Write marker file - out.write("") diff --git a/sparv/modules/korp/lemgram_index.py b/sparv/modules/korp/lemgram_index.py index febf7ba1..f8f4d834 100644 --- a/sparv/modules/korp/lemgram_index.py +++ b/sparv/modules/korp/lemgram_index.py @@ -1,21 +1,20 @@ """Create files needed for the lemgram search in Korp.""" -import logging from collections import defaultdict -import sparv.util as util -from sparv import (AllDocuments, AnnotationAllDocs, Config, Corpus, Export, ExportInput, OutputCommonData, exporter, - installer) -from sparv.util.mysql_wrapper import MySQL +from sparv.api import (AllSourceFilenames, AnnotationAllSourceFiles, Config, Corpus, Export, ExportInput, OutputCommonData, exporter, + get_logger, installer, util) +from sparv.api.util.mysql_wrapper import MySQL + +logger = get_logger(__name__) -log = logging.getLogger(__name__) # Path to the cwb-scan-corpus binary CWB_SCAN_EXECUTABLE = "cwb-scan-corpus" @installer("Install lemgram SQL on remote host", language=["swe"]) -def install_lemgrams(sqlfile: ExportInput = ExportInput("korp_lemgram_index/lemgram_index.sql"), +def install_lemgrams(sqlfile: ExportInput = ExportInput("korp.lemgram_index/lemgram_index.sql"), marker: OutputCommonData = OutputCommonData("korp.install_lemgram_marker"), db_name: str = Config("korp.mysql_dbname"), host: str = Config("korp.remote_host")): @@ -23,28 +22,27 @@ def install_lemgrams(sqlfile: ExportInput = ExportInput("korp_lemgram_index/lemg Args: sqlfile (str, optional): SQL file to be installed. - Defaults to ExportInput("korp_lemgram_index/lemgram_index.sql"). + Defaults to ExportInput("korp.lemgram_index/lemgram_index.sql"). marker (str, optional): Marker file to be written. Defaults to OutputCommonData("korp.install_lemgram_marker"). db_name (str, optional): Name of the data base. Defaults to Config("korp.mysql_dbname"). host (str, optional): Remote host to install to. Defaults to Config("korp.remote_host"). 
""" - util.install_mysql(host, db_name, sqlfile) + util.install.install_mysql(host, db_name, sqlfile) marker.write("") @exporter("Lemgram index SQL file for use in Korp", language=["swe"]) def lemgram_sql(corpus: Corpus = Corpus(), - docs: AllDocuments = AllDocuments(), - out: Export = Export("korp_lemgram_index/lemgram_index.sql"), - lemgram: AnnotationAllDocs = AnnotationAllDocs(":saldo.lemgram")): + source_files: AllSourceFilenames = AllSourceFilenames(), + out: Export = Export("korp.lemgram_index/lemgram_index.sql"), + lemgram: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":saldo.lemgram")): """Create lemgram index SQL file.""" - corpus = corpus.upper() result = defaultdict(int) - for doc in docs: - for lg in lemgram.read(doc): + for file in source_files: + for lg in lemgram.read(file): for value in lg.split("|"): if value and ":" not in value: result[value] += 1 @@ -60,9 +58,9 @@ def lemgram_sql(corpus: Corpus = Corpus(), "lemgram": lemgram, "corpus": corpus, "freq": freq - }) + }) - log.info("Creating SQL") + logger.info("Creating SQL") mysql.add_row(MYSQL_TABLE, rows) diff --git a/sparv/modules/korp/relations.py b/sparv/modules/korp/relations.py index 613c7e7e..3ba7e878 100644 --- a/sparv/modules/korp/relations.py +++ b/sparv/modules/korp/relations.py @@ -1,17 +1,16 @@ """Create files needed for the word picture in Korp.""" -import logging import math import re from collections import defaultdict from typing import Optional -import sparv.util as util -from sparv import (AllDocuments, Annotation, AnnotationDataAllDocs, Config, Corpus, Export, ExportInput, - OutputCommonData, OutputData, annotator, exporter, installer) -from sparv.util.mysql_wrapper import MySQL +from sparv.api import (AllSourceFilenames, Annotation, AnnotationDataAllSourceFiles, Config, Corpus, Export, ExportInput, + OutputCommonData, OutputData, annotator, exporter, get_logger, installer, util) +from sparv.api.util.mysql_wrapper import MySQL + +logger = get_logger(__name__) -log = logging.getLogger(__name__) MAX_STRING_LENGTH = 100 MAX_STRINGEXTRA_LENGTH = 32 @@ -19,19 +18,19 @@ @installer("Install Korp's Word Picture SQL on remote host", language=["swe"]) -def install_relations(sqlfile: ExportInput = ExportInput("korp_wordpicture/relations.sql"), +def install_relations(sqlfile: ExportInput = ExportInput("korp.wordpicture/relations.sql"), out: OutputCommonData = OutputCommonData("korp.install_relations_marker"), db_name: str = Config("korp.mysql_dbname"), host: str = Config("korp.remote_host")): """Install Korp's Word Picture SQL on remote host. Args: - sqlfile (str, optional): SQL file to be installed. Defaults to ExportInput("korp_wordpicture/relations.sql"). + sqlfile (str, optional): SQL file to be installed. Defaults to ExportInput("korp.wordpicture/relations.sql"). out (str, optional): Marker file to be written. db_name (str, optional): Name of the data base. Defaults to Config("korp.mysql_dbname"). host (str, optional): Remote host to install to. Defaults to Config("korp.remote_host"). 
""" - util.install_mysql(host, db_name, sqlfile) + util.install.install_mysql(host, db_name, sqlfile) out.write("") @@ -43,12 +42,14 @@ def relations(out: OutputData = OutputData("korp.relations"), dephead: Annotation = Annotation(""), deprel: Annotation = Annotation(""), sentence_id: Annotation = Annotation(":misc.id"), - ref: Annotation = Annotation(":misc.number_rel_"), + ref: Annotation = Annotation(""), baseform: Annotation = Annotation(":saldo.baseform")): """Find certain dependencies between words, to be used by the Word Picture feature in Korp.""" sentence_ids = sentence_id.read() sentence_tokens, _ = sentence_id.get_children(word) + logger.progress(total=len(sentence_tokens) + 1) + annotations = list(word.read_attributes((word, pos, lemgram, dephead, deprel, ref, baseform))) # http://stp.ling.uu.se/~nivre/swedish_treebank/dep.html @@ -183,6 +184,7 @@ def _findrel(head, rel, dep): (v["lemgram"], v["word"], v["pos"], v["ref"]), mrel, ("", "", "", v["ref"]), ("", None), sentid, v["ref"], v["ref"]) triples.extend(_mutate_triple(triple)) + logger.progress() triples = sorted(set(triples)) @@ -191,6 +193,7 @@ def _findrel(head, rel, dep): head, headpos, rel, dep, deppos, extra, sentid, refhead, refdep, bfhead, bfdep, wfhead, wfdep) in triples]) out.write(out_data) + logger.progress() def _mutate_triple(triple): @@ -265,20 +268,22 @@ def mi_lex(rel, x_rel_y, x_rel, rel_y): @exporter("Word Picture SQL for use in Korp", language=["swe"]) def relations_sql(corpus: Corpus = Corpus(), - out: Export = Export("korp_wordpicture/relations.sql"), - relations: AnnotationDataAllDocs = AnnotationDataAllDocs("korp.relations"), - docs: Optional[AllDocuments] = AllDocuments(), - doclist: str = "", + out: Export = Export("korp.wordpicture/relations.sql"), + relations: AnnotationDataAllSourceFiles = AnnotationDataAllSourceFiles("korp.relations"), + source_files: Optional[AllSourceFilenames] = AllSourceFilenames(), + source_files_list: str = "", split: bool = False): """Calculate statistics of the dependencies and saves to SQL files. - - corpus is the corpus name. - - out is the name for the SQL file which will contain the resulting SQL statements. - - relations is the name of the relations annotation. - - docs is a list of documents. - - doclist can be used instead of docs, and should be a file containing the name of docs, one per row. - - split set to true leads to SQL commands being split into several parts, requiring less memory during creation, - but installing the data will take much longer. 
+ Args: + corpus: the corpus name + out: the name for the SQL file which will contain the resulting SQL statements + relations: the name of the relations annotation + source_files: a list of source filenames + source_files_list: can be used instead of source_files, and should be a file containing the name of source + files, one per row + split: when set to true leads to SQL commands being split into several parts, requiring less memory during + creation, but installing the data will take much longer """ db_table = MYSQL_TABLE + "_" + corpus.upper() @@ -298,27 +303,29 @@ def relations_sql(corpus: Corpus = Corpus(), strings = {} # ID -> string table freq_index = {} sentence_count = defaultdict(int) - doc_count = 0 + file_count = 0 - assert (docs or doclist), "Missing source" + assert (source_files or source_files_list), "Missing source" - if doclist: - with open(doclist) as insource: - docs = [line.strip() for line in insource] + if source_files_list: + with open(source_files_list, encoding="utf-8") as insource: + source_files = [line.strip() for line in insource] - if len(docs) == 1: + if len(source_files) == 1: split = False - for doc in docs: - doc_count += 1 + logger.progress(total=len(source_files) + 1) + + for file in source_files: + file_count += 1 sentences = {} - if doc_count == 1 or split: + if file_count == 1 or split: freq = {} # Frequency of (head, rel, dep) rel_count = defaultdict(int) # Frequency of (rel) head_rel_count = defaultdict(int) # Frequency of (head, rel) dep_rel_count = defaultdict(int) # Frequency of (rel, dep) - relations_data = relations.read(doc) + relations_data = relations.read(file) for triple in relations_data.splitlines(): head, headpos, rel, dep, deppos, extra, sid, refh, refd, bfhead, bfdep, wfhead, wfdep = triple.split(u"\t") @@ -362,20 +369,23 @@ def relations_sql(corpus: Corpus = Corpus(), dep_rel_count[(dep, rel)] += 1 # If not the last file - if not doc_count == len(docs): + if not file_count == len(source_files): if split: # Don't print string table until the last file _write_sql({}, sentences, freq, rel_count, head_rel_count, dep_rel_count, out, db_table, split, - first=(doc_count == 1)) + first=(file_count == 1)) else: # Only save sentences data, save the rest for the last file - _write_sql({}, sentences, {}, {}, {}, {}, out, db_table, split, first=(doc_count == 1)) + _write_sql({}, sentences, {}, {}, {}, {}, out, db_table, split, first=(file_count == 1)) + + logger.progress() # Create the final file, including the string table _write_sql(strings, sentences, freq, rel_count, head_rel_count, dep_rel_count, out, db_table, split, - first=(doc_count == 1), last=True) + first=(file_count == 1), last=True) - log.info("Done creating SQL files") + logger.progress() + logger.info("Done creating SQL files") def _write_sql(strings, sentences, freq, rel_count, head_rel_count, dep_rel_count, sql_file, db_table, @@ -501,7 +511,7 @@ def _write_sql(strings, sentences, freq, rel_count, head_rel_count, dep_rel_coun mysql.enable_checks() - log.info("%s written", sql_file) + logger.info("%s written", sql_file) ################################################################################ diff --git a/sparv/modules/korp/timespan.py b/sparv/modules/korp/timespan.py index 6e4af69d..25342467 100644 --- a/sparv/modules/korp/timespan.py +++ b/sparv/modules/korp/timespan.py @@ -1,59 +1,57 @@ """Create timespan SQL data for use in Korp.""" -import logging from collections import defaultdict -import sparv.util as util -from sparv import (AllDocuments, Annotation, 
AnnotationAllDocs, Config, Corpus, Export, ExportInput, OutputCommonData, - annotator, exporter, installer) -from sparv.util.mysql_wrapper import MySQL +from sparv.api import (AllSourceFilenames, Annotation, AnnotationAllSourceFiles, Config, Corpus, Export, ExportInput, + OutputCommonData, annotator, exporter, get_logger, installer, util) +from sparv.api.util.mysql_wrapper import MySQL -log = logging.getLogger(__name__) +logger = get_logger(__name__) @installer("Install timespan SQL on remote host") -def install_timespan(sqlfile: ExportInput = ExportInput("korp_timespan/timespan.sql"), +def install_timespan(sqlfile: ExportInput = ExportInput("korp.timespan/timespan.sql"), out: OutputCommonData = OutputCommonData("korp.install_timespan_marker"), db_name: str = Config("korp.mysql_dbname"), host: str = Config("korp.remote_host")): """Install timespan SQL on remote host. Args: - sqlfile (str, optional): SQL file to be installed. Defaults to ExportInput("korp_timespan/timespan.sql"). + sqlfile (str, optional): SQL file to be installed. Defaults to ExportInput("korp.timespan/timespan.sql"). out (str, optional): Marker file to be written. db_name (str, optional): Name of the data base. Defaults to Config("korp.mysql_dbname"). host (str, optional): Remote host to install to. Defaults to Config("korp.remote_host"). """ - util.install_mysql(host, db_name, sqlfile) + util.install.install_mysql(host, db_name, sqlfile) out.write("") @exporter("Timespan SQL data for use in Korp", abstract=True) -def timespan_sql(_sql: ExportInput = ExportInput("korp_timespan/timespan.sql")): +def timespan_sql(_sql: ExportInput = ExportInput("korp.timespan/timespan.sql")): """Create timespan SQL data for use in Korp.""" pass @annotator("Timespan SQL data for use in Korp", order=1) def timespan_sql_with_dateinfo(corpus: Corpus = Corpus(), - out: Export = Export("korp_timespan/timespan.sql"), - docs: AllDocuments = AllDocuments(), - token: AnnotationAllDocs = AnnotationAllDocs(""), - datefrom: AnnotationAllDocs = AnnotationAllDocs(":dateformat.datefrom"), - dateto: AnnotationAllDocs = AnnotationAllDocs(":dateformat.dateto"), - timefrom: AnnotationAllDocs = AnnotationAllDocs(":dateformat.timefrom"), - timeto: AnnotationAllDocs = AnnotationAllDocs(":dateformat.timeto")): + out: Export = Export("korp.timespan/timespan.sql"), + source_files: AllSourceFilenames = AllSourceFilenames(), + token: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + datefrom: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":dateformat.datefrom"), + dateto: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":dateformat.dateto"), + timefrom: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":dateformat.timefrom"), + timeto: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":dateformat.timeto")): """Create timespan SQL data for use in Korp.""" corpus_name = corpus.upper() datespans = defaultdict(int) datetimespans = defaultdict(int) - for doc in docs: - text_tokens, orphans = Annotation(datefrom.name, doc=doc).get_children(token) + for file in source_files: + text_tokens, orphans = Annotation(datefrom.name, source_file=file).get_children(token) if orphans: datespans[("0" * 8, "0" * 8)] += len(orphans) datetimespans[("0" * 14, "0" * 14)] += len(orphans) - dateinfo = datefrom.read_attributes(doc, (datefrom, dateto, timefrom, timeto)) + dateinfo = datefrom.read_attributes(file, (datefrom, dateto, timefrom, timeto)) for text in text_tokens: d = next(dateinfo) datespans[(d[0].zfill(8), d[1].zfill(8))] += len(text) @@ -83,16 +81,15 @@ def 
timespan_sql_with_dateinfo(corpus: Corpus = Corpus(), @annotator("Timespan SQL data for use in Korp, for when the corpus has no date metadata.", order=2) def timespan_sql_no_dateinfo(corpus: Corpus = Corpus(), - out: Export = Export("korp_timespan/timespan.sql"), - docs: AllDocuments = AllDocuments(), - token: AnnotationAllDocs = AnnotationAllDocs("")): + out: Export = Export("korp.timespan/timespan.sql"), + source_files: AllSourceFilenames = AllSourceFilenames(), + token: AnnotationAllSourceFiles = AnnotationAllSourceFiles("")): """Create timespan SQL data for use in Korp.""" corpus_name = corpus.upper() token_count = 0 - for doc in docs: - tokens = token.read_spans(doc) - token_count += len(list(tokens)) + for file in source_files: + token_count += token.get_size(file) rows_date = [{ "corpus": corpus_name, @@ -112,7 +109,7 @@ def timespan_sql_no_dateinfo(corpus: Corpus = Corpus(), def create_sql(corpus_name: str, out: Export, rows_date, rows_datetime): """Create timespans SQL file.""" - log.info("Creating SQL") + logger.info("Creating SQL") mysql = MySQL(output=out) mysql.create_table(MYSQL_TABLE, drop=False, **MYSQL_TIMESPAN) mysql.create_table(MYSQL_TABLE_DATE, drop=False, **MYSQL_TIMESPAN_DATE) diff --git a/sparv/modules/lexical_classes/models.py b/sparv/modules/lexical_classes/models.py index 281e1b78..de6a272b 100644 --- a/sparv/modules/lexical_classes/models.py +++ b/sparv/modules/lexical_classes/models.py @@ -1,16 +1,14 @@ """Handle models for lexical classes.""" -import logging import os import subprocess import sys import xml.etree.ElementTree as etree from collections import defaultdict -import sparv.util as util -from sparv import Model, ModelOutput, modelbuilder +from sparv.api import Model, ModelOutput, get_logger, modelbuilder, util -log = logging.getLogger(__name__) +logger = get_logger(__name__) # Path to the cwb binaries CWB_SCAN_EXECUTABLE = "cwb-scan-corpus" @@ -77,11 +75,11 @@ def read_blingbring(tsv, classmap, verbose=True): import csv if verbose: - log.info("Reading tsv lexicon") + logger.info("Reading tsv lexicon") lexicon = {} classmapping = {} - with open(tsv) as f: + with open(tsv, encoding="utf-8") as f: for line in csv.reader(f, delimiter="\t"): if line[0].startswith("#"): continue @@ -115,17 +113,17 @@ def read_blingbring(tsv, classmap, verbose=True): "behjälplig..1", "köra_ner..1" ] - util.test_lexicon(lexicon, testwords) + util.misc.test_lexicon(lexicon, testwords) if verbose: - log.info("OK, read") + logger.info("OK, read") return lexicon def read_rogetmap(xml, verbose=True): """Parse Roget map (Roget hierarchy) into a dictionary with Roget head words as keys.""" if verbose: - log.info("Reading XML lexicon") + logger.info("Reading XML lexicon") lexicon = {} context = etree.iterparse(xml, events=("start", "end")) context = iter(context) @@ -145,10 +143,10 @@ def read_rogetmap(xml, verbose=True): "Health", "Amusement", "Marriage"] - util.test_lexicon(lexicon, testwords) + util.misc.test_lexicon(lexicon, testwords) if verbose: - log.info("OK, read.") + logger.info("OK, read.") return lexicon @@ -158,7 +156,7 @@ def read_swefn(xml, verbose=True): Return a lexicon dictionary, {saldoID: {swefnID}}. 
""" if verbose: - log.info("Reading XML lexicon") + logger.info("Reading XML lexicon") lexicon = {} context = etree.iterparse(xml, events=("start", "end")) # "start" needed to save reference to root element @@ -183,16 +181,16 @@ def read_swefn(xml, verbose=True): "granne..1", "sisådär..1", "mjölkcentral..1"] - util.test_lexicon(lexicon, testwords) + util.misc.test_lexicon(lexicon, testwords) if verbose: - log.info("OK, read.") + logger.info("OK, read.") return lexicon -def create_freq_pickle(corpus, annotation, model, class_set=None, score_separator=util.SCORESEP): +def create_freq_pickle(corpus, annotation, model, class_set=None, score_separator=util.constants.SCORESEP): """Build pickle with relative frequency for a given annotation in one or more reference corpora.""" - lexicon = util.PickledLexicon(model) + lexicon = util.misc.PickledLexicon(model) # Create a set of all possible classes if class_set: all_classes = set(cc for c in lexicon.lexicon.values() for cc in c[class_set]) @@ -216,7 +214,7 @@ def create_freq_pickle(corpus, annotation, model, class_set=None, score_separato if error: error = error.decode() - log.error(error) + logger.error(error) sys.exit(1) for line in reply.splitlines(): @@ -225,14 +223,14 @@ def create_freq_pickle(corpus, annotation, model, class_set=None, score_separato corpus_size += int(size.strip()) # Get frequency of annotation - log.info("Getting frequencies from %s", c) + logger.info("Getting frequencies from %s", c) process = subprocess.Popen([CWB_SCAN_EXECUTABLE, "-q", "-r", CORPUS_REGISTRY, c] + [annotation], stdout=subprocess.PIPE, stderr=subprocess.PIPE) reply, error = process.communicate() reply = reply.decode() if error: if "Error: can't open attribute" in error.decode(): - log.error("Annotation '%s' not found", annotation) + logger.error("Annotation '%s' not found", annotation) sys.exit(1) for line in reply.splitlines(): diff --git a/sparv/modules/lexical_classes/text.py b/sparv/modules/lexical_classes/text.py index 48589896..bb876edc 100644 --- a/sparv/modules/lexical_classes/text.py +++ b/sparv/modules/lexical_classes/text.py @@ -1,13 +1,12 @@ """Annotate text chunks with lexical classes from Blingbring or SweFN.""" -import logging from collections import defaultdict from typing import Optional -import sparv.util as util -from sparv import Annotation, Config, Model, Output, annotator +from sparv.api import Annotation, Config, Model, Output, annotator, get_logger, util +from sparv.api.util.constants import AFFIX, DELIM, SCORESEP -log = logging.getLogger(__name__) +logger = get_logger(__name__) @annotator("Annotate text chunks with Blingbring classes", language=["swe"], config=[ @@ -22,8 +21,8 @@ def blingbring_text(out: Output = Output(":lexical_classes.blingbring", saldoids: Optional[Annotation] = Annotation(""), cutoff: int = 3, types: bool = False, - delimiter: str = util.DELIM, - affix: str = util.AFFIX, + delimiter: str = DELIM, + affix: str = AFFIX, freq_model: Model = Model("[lexical_classes.bb_freq_model]"), decimals: int = 3): """Annotate text chunks with Blingbring classes.""" @@ -44,8 +43,8 @@ def swefn_text(out: Output = Output(":lexical_classes.swefn", saldoids: Optional[Annotation] = Annotation(""), cutoff: int = 3, types: bool = False, - delimiter: str = util.DELIM, - affix: str = util.AFFIX, + delimiter: str = DELIM, + affix: str = AFFIX, freq_model: Model = Model("[lexical_classes.swefn_freq_model]"), decimals: int = 3): """Annotate text chunks with SweFN classes.""" @@ -78,7 +77,7 @@ def annotate_text(out: Output, 
lexical_classes_token: Annotation, text: Annotati sense = list(saldoids.read()) if types else None if freq_model: - freq_model = util.PickledLexicon(freq_model.path) + freq_model = util.misc.PickledLexicon(freq_model.path) out_annotation = text.create_empty_attribute() @@ -89,13 +88,13 @@ def annotate_text(out: Output, lexical_classes_token: Annotation, text: Annotati for token_index in words: # Count only sense types if types: - senses = str(sorted([s.split(util.SCORESEP)[0] for s in sense[token_index].strip(util.AFFIX).split(util.DELIM)])) + senses = str(sorted([s.split(SCORESEP)[0] for s in sense[token_index].strip(AFFIX).split(DELIM)])) if senses in seen_types: continue else: seen_types.add(senses) - rogwords = classes[token_index].strip(util.AFFIX).split(util.DELIM) if classes[token_index] != util.AFFIX else [] + rogwords = classes[token_index].strip(AFFIX).split(DELIM) if classes[token_index] != AFFIX else [] for w in rogwords: class_freqs[w] += 1 @@ -106,7 +105,7 @@ def annotate_text(out: Output, lexical_classes_token: Annotation, text: Annotati # Calculate class dominance ref_freq = freq_model.lookup(c.replace("_", " "), 0) if not ref_freq: - log.error("Class '%s' is missing" % ref_freq) + logger.error("Class '%s' is missing" % ref_freq) class_freqs[c] = (rel / ref_freq) # Sort words according to frequency/dominance @@ -123,7 +122,7 @@ def annotate_text(out: Output, lexical_classes_token: Annotation, text: Annotati ordered_words = [w for w in ordered_words if w[1] >= cutoff_freq] # Join words and frequencies/dominances - ordered_words = [util.SCORESEP.join([word, str(round(freq, decimals))]) for word, freq in ordered_words] - out_annotation[text_index] = util.cwbset(ordered_words, delimiter, affix) if ordered_words else affix + ordered_words = [SCORESEP.join([word, str(round(freq, decimals))]) for word, freq in ordered_words] + out_annotation[text_index] = util.misc.cwbset(ordered_words, delimiter, affix) if ordered_words else affix out.write(out_annotation) diff --git a/sparv/modules/lexical_classes/token.py b/sparv/modules/lexical_classes/token.py index 5ecb912a..e0f36fd7 100644 --- a/sparv/modules/lexical_classes/token.py +++ b/sparv/modules/lexical_classes/token.py @@ -1,12 +1,11 @@ """Annotate words with lexical classes from Blingbring or SweFN.""" -import logging from typing import List -import sparv.util as util -from sparv import Annotation, Config, Model, Output, annotator +from sparv.api import Annotation, Config, Model, Output, annotator, get_logger, util +from sparv.api.util.constants import AFFIX, DELIM, SCORESEP -log = logging.getLogger(__name__) +logger = get_logger(__name__) @annotator("Annotate tokens with Blingbring classes", language=["swe"], config=[ @@ -22,19 +21,19 @@ def blingbring_words(out: Output = Output(":lexical_classes.blingbring", class_set: str = "bring", disambiguate: bool = True, connect_ids: bool = False, - delimiter: str = util.DELIM, - affix: str = util.AFFIX, - scoresep: str = util.SCORESEP, + delimiter: str = DELIM, + affix: str = AFFIX, + scoresep: str = SCORESEP, lexicon=None): """Blingbring specific wrapper for annotate_words. See annotate_words for more info.""" # pos_limit="NN VB JJ AB" | None if class_set not in ["bring", "roget_head", "roget_subsection", "roget_section", "roget_class"]: - log.warning("Class '%s' not available. Fallback to 'bring'.") + logger.warning("Class '%s' not available. 
Fallback to 'bring'.", class_set) class_set = "bring" # Blingbring annotation function - def annotate_bring(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP): + def annotate_bring(saldo_ids, lexicon, connect_IDs=False, scoresep=SCORESEP): rogetid = set() if saldo_ids: for sid in saldo_ids: @@ -61,14 +60,14 @@ def swefn_words(out: Output = Output(":lexical_classes.swefn", pos_limit: List[str] = ["NN", "VB", "JJ", "AB"], disambiguate: bool = True, connect_ids: bool = False, - delimiter: str = util.DELIM, - affix: str = util.AFFIX, - scoresep: str = util.SCORESEP, + delimiter: str = DELIM, + affix: str = AFFIX, + scoresep: str = SCORESEP, lexicon=None): """Swefn specific wrapper for annotate_words. See annotate_words for more info.""" # SweFN annotation function - def annotate_swefn(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP): + def annotate_swefn(saldo_ids, lexicon, connect_IDs=False, scoresep=SCORESEP): swefnid = set() if saldo_ids: for sid in saldo_ids: @@ -83,8 +82,8 @@ def annotate_swefn(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP def annotate_words(out: Output, model: Model, saldoids: Annotation, pos: Annotation, annotate, pos_limit: List[str], - class_set=None, disambiguate=True, connect_ids=False, delimiter=util.DELIM, affix=util.AFFIX, - scoresep=util.SCORESEP, lexicon=None): + class_set=None, disambiguate=True, connect_ids=False, delimiter=DELIM, affix=AFFIX, + scoresep=SCORESEP, lexicon=None): """ Annotate words with blingbring classes (rogetID). @@ -106,7 +105,7 @@ def annotate_words(out: Output, model: Model, saldoids: Annotation, pos: Annotat but is used in the catapult. This argument must be last. """ if not lexicon: - lexicon = util.PickledLexicon(model.path) + lexicon = util.misc.PickledLexicon(model.path) # Otherwise use pre-loaded lexicon (from catapult) sense = saldoids.read() @@ -124,10 +123,10 @@ def annotate_words(out: Output, model: Model, saldoids: Annotation, pos: Annotat out_annotation[token_index] = affix continue - if wsd and util.SCORESEP in token_sense: - ranked_saldo = token_sense.strip(util.AFFIX).split(util.DELIM) \ - if token_sense != util.AFFIX else None - saldo_tuples = [(i.split(util.SCORESEP)[0], i.split(util.SCORESEP)[1]) for i in ranked_saldo] + if wsd and SCORESEP in token_sense: + ranked_saldo = token_sense.strip(AFFIX).split(DELIM) \ + if token_sense != AFFIX else None + saldo_tuples = [(i.split(SCORESEP)[0], i.split(SCORESEP)[1]) for i in ranked_saldo] if not disambiguate: saldo_ids = [i[0] for i in saldo_tuples] @@ -144,11 +143,11 @@ def annotate_words(out: Output, model: Model, saldoids: Annotation, pos: Annotat saldo_ids = [i[0] for i in saldo_ids] else: # No WSD - saldo_ids = token_sense.strip(util.AFFIX).split(util.DELIM) \ - if token_sense != util.AFFIX else None + saldo_ids = token_sense.strip(AFFIX).split(DELIM) \ + if token_sense != AFFIX else None result = annotate(saldo_ids, lexicon, connect_ids, scoresep) - out_annotation[token_index] = util.cwbset(result, delimiter, affix) if result else affix + out_annotation[token_index] = util.misc.cwbset(result, delimiter, affix) if result else affix out.write(out_annotation) diff --git a/sparv/modules/malt/malt.py b/sparv/modules/malt/malt.py index 08345199..87d8d8d3 100644 --- a/sparv/modules/malt/malt.py +++ b/sparv/modules/malt/malt.py @@ -1,12 +1,10 @@ """Dependency parsing using MaltParser.""" -import logging import re -import sparv.util as util -from sparv import Annotation, Binary, Config, Model, ModelOutput, Output, annotator, modelbuilder +from 
sparv.api import Annotation, Binary, Config, Model, ModelOutput, Output, annotator, get_logger, modelbuilder, util -log = logging.getLogger(__name__) +logger = get_logger(__name__) # Running Malt processes are only kept if the input is small: otherwise @@ -37,7 +35,7 @@ def cleanup(maltjar, model, encoding, process_dict): """Cleanup function used by preloader to restart Malt.""" if process_dict["restart"]: util.system.kill_process(process_dict["process"]) - log.info("Restarting MaltParser process") + logger.info("Restarting MaltParser process") process_dict = preloader(maltjar, model, encoding) return process_dict @@ -45,10 +43,9 @@ def cleanup(maltjar, model, encoding, process_dict): @annotator("Dependency parsing using MaltParser", language=["swe"], config=[ Config("malt.jar", default="maltparser-1.7.2/maltparser-1.7.2.jar", description="Path name of the executable .jar file"), - Config("malt.model", default="malt/swemalt-1.7.2.mco", description="Path to Malt model") - ], - preloader=preloader, preloader_params=["maltjar", "model", "encoding"], preloader_target="process_dict", - preloader_cleanup=cleanup, preloader_shared=False) + Config("malt.model", default="malt/swemalt-1.7.2.mco", description="Path to Malt model")], + preloader=preloader, preloader_params=["maltjar", "model", "encoding"], preloader_target="process_dict", + preloader_cleanup=cleanup, preloader_shared=False) def annotate(maltjar: Binary = Binary("[malt.jar]"), model: Model = Model("[malt.model]"), out_dephead: Output = Output(":malt.dephead", cls="token:dephead", @@ -60,10 +57,10 @@ def annotate(maltjar: Binary = Binary("[malt.jar]"), word: Annotation = Annotation(""), pos: Annotation = Annotation(""), msd: Annotation = Annotation(""), - ref: Annotation = Annotation(":misc.number_rel_"), + ref: Annotation = Annotation(":malt.ref"), sentence: Annotation = Annotation(""), token: Annotation = Annotation(""), - encoding: str = util.UTF8, + encoding: str = util.constants.UTF8, process_dict=None): """ Run the malt parser, in an already started process defined in process_dict, or start a new process (default). @@ -82,8 +79,8 @@ def annotate(maltjar: Binary = Binary("[malt.jar]"), sentences, orphans = sentence.get_children(token) if orphans: - log.warning(f"Found {len(orphans)} tokens not belonging to any sentence. These will not be annotated with " - f"dependency relations.") + logger.warning(f"Found {len(orphans)} tokens not belonging to any sentence. 
These will not be annotated with " + f"dependency relations.") word_annotation = list(word.read()) pos_annotation = list(pos.read()) @@ -104,7 +101,7 @@ def conll_token(nr, token_index): stdin = stdin.encode(encoding) keep_process = len(stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None - log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process) + logger.info("Stdin length: %s, keep process: %s", len(stdin), keep_process) if process_dict is not None: process_dict["restart"] = not keep_process @@ -112,7 +109,7 @@ def conll_token(nr, token_index): if keep_process: # Chatting with malt: send a SENT_SEP and read correct number of lines stdin_fd, stdout_fd = process.stdin, process.stdout - stdin_fd.write(stdin + SENT_SEP.encode(util.UTF8)) + stdin_fd.write(stdin + SENT_SEP.encode(util.constants.UTF8)) stdin_fd.flush() malt_sentences = [] @@ -150,13 +147,23 @@ def conll_token(nr, token_index): out_deprel.write(out_deprel_annotation) +@annotator("Annotate tokens with IDs relative to their sentences", language=["swe"]) +def make_ref(out: Output = Output(":malt.ref", cls="token:ref", + description="Token IDs relative to their sentences"), + sentence: Annotation = Annotation(""), + token: Annotation = Annotation("")): + """Annotate tokens with IDs relative to their sentences.""" + from sparv.modules.misc import number + number.number_relative(out, sentence, token) + + def maltstart(maltjar, model, encoding, send_empty_sentence=False): """Start a malt process and return it.""" java_opts = ["-Xmx1024m"] malt_args = ["-ic", encoding, "-oc", encoding, "-m", "parse"] if str(model).startswith("http://") or str(model).startswith("https://"): malt_args += ["-u", str(model)] - log.info("Using Malt model from URL: %s", model) + logger.info("Using Malt model from URL: %s", model) else: model_dir = model.path.parent model_file = model.path.name @@ -165,7 +172,7 @@ def maltstart(maltjar, model, encoding, send_empty_sentence=False): if model_dir: malt_args += ["-w", model_dir] malt_args += ["-c", model_file] - log.info("Using local Malt model: %s (in directory %s)", model_file, model_dir or ".") + logger.info("Using local Malt model: %s (in directory %s)", model_file, model_dir or ".") process = util.system.call_java(maltjar, malt_args, options=java_opts, encoding=encoding, return_command=True) @@ -173,8 +180,8 @@ def maltstart(maltjar, model, encoding, send_empty_sentence=False): # Send a simple sentence to malt, this greatly enhances performance # for subsequent requests. 
stdin_fd, stdout_fd = process.stdin, process.stdout - log.info("Sending empty sentence to malt") - stdin_fd.write("1\t.\t_\tMAD\tMAD\tMAD\n\n\n".encode(util.UTF8)) + logger.info("Sending empty sentence to malt") + stdin_fd.write("1\t.\t_\tMAD\tMAD\tMAD\n\n\n".encode(util.constants.UTF8)) stdin_fd.flush() stdout_fd.readline() stdout_fd.readline() diff --git a/sparv/modules/misc/ids.py b/sparv/modules/misc/ids.py index a675c7e9..ff83992d 100644 --- a/sparv/modules/misc/ids.py +++ b/sparv/modules/misc/ids.py @@ -5,67 +5,75 @@ from binascii import hexlify from typing import Optional -from sparv import AllDocuments, Annotation, AnnotationData, Document, Output, Wildcard, OutputDataAllDocs, annotator +from sparv.api import (AllSourceFilenames, Annotation, AnnotationData, SourceFilename, Output, Wildcard, + OutputDataAllSourceFiles, annotator, get_logger) +logger = get_logger(__name__) _ID_LENGTH = 10 -@annotator("Give every document a unique ID") -def doc_id(out: OutputDataAllDocs = OutputDataAllDocs("misc.docid", cls="docid"), - docs: Optional[AllDocuments] = AllDocuments(), - doclist: Optional[str] = None, - prefix: str = "", - add: bool = False): - """Create unique IDs for every document in a list, using the document names as seed. +@annotator("Give every source file a unique ID") +def file_id(out: OutputDataAllSourceFiles = OutputDataAllSourceFiles("misc.fileid", cls="fileid"), + source_files: Optional[AllSourceFilenames] = AllSourceFilenames(), + source_files_list: Optional[str] = None, + prefix: str = "", + add: bool = False): + """Create unique IDs for every source file in a list, using the source filenames as seed. The resulting IDs are written to the annotation specified by 'out'. If 'add' is True, existing IDs will not be overwritten. """ - assert docs or doclist, "docs or doclist must be specified" + assert source_files or source_files_list, "source_files or source_files_list must be specified" - if doclist: - with open(doclist) as f: - docs = f.read().strip().splitlines() + if source_files_list: + with open(source_files_list, encoding="utf-8") as f: + source_files = f.read().strip().splitlines() - docs.sort() + source_files.sort() + logger.progress(total=len(source_files)) - numdocs = len(docs) * 2 + numfiles = len(source_files) * 2 used_ids = set() - docs_with_ids = set() + files_with_ids = set() if add: - for doc in docs: - if out.exists(doc): - used_ids.add(out.read(doc)) - docs_with_ids.add(doc) + for file in source_files: + if out.exists(file): + used_ids.add(out.read(file)) + files_with_ids.add(file) - for doc in docs: - if add and doc in docs_with_ids: + for file in source_files: + if add and file in files_with_ids: continue - _reset_id(doc, numdocs) + _reset_id(file, numfiles) new_id = _make_id(prefix, used_ids) used_ids.add(new_id) - out.write(new_id, doc) + out.write(new_id, file) + logger.progress() @annotator("Unique IDs for {annotation}", wildcards=[Wildcard("annotation", Wildcard.ANNOTATION)]) -def ids(doc: Document = Document(), +def ids(source_file: SourceFilename = SourceFilename(), annotation: Annotation = Annotation("{annotation}"), out: Output = Output("{annotation}:misc.id", description="Unique ID for {annotation}"), - docid: AnnotationData = AnnotationData(""), + fileid: AnnotationData = AnnotationData(""), prefix: str = ""): """Create unique IDs for every span of an existing annotation.""" - docid = docid.read() - prefix = prefix + docid + logger.progress() + fileid = fileid.read() + prefix = prefix + fileid ann = list(annotation.read()) out_annotation = [] - # 
Use doc name and annotation name as seed for the IDs - _reset_id("{}/{}".format(doc, annotation), len(ann)) + logger.progress(total=len(ann) + 1) + # Use source filename and annotation name as seed for the IDs + _reset_id("{}/{}".format(source_file, annotation), len(ann)) for _ in ann: new_id = _make_id(prefix, out_annotation) out_annotation.append(new_id) + logger.progress() out.write(out_annotation) + logger.progress() def _reset_id(seed, max_ids=None): diff --git a/sparv/modules/misc/misc.py b/sparv/modules/misc/misc.py index 4fa37784..62186927 100644 --- a/sparv/modules/misc/misc.py +++ b/sparv/modules/misc/misc.py @@ -3,7 +3,8 @@ import re from typing import List, Optional -from sparv import Annotation, Config, Output, Text, Wildcard, annotator, util +from sparv.api import Annotation, Config, SourceFilename, Output, SparvErrorMessage, Text, Wildcard, annotator, util +from sparv.api.util.tagsets import tagmappings, pos_to_upos, suc_to_feats @annotator("Text value of a span (usually a token)", config=[ @@ -22,7 +23,7 @@ def text_spans(text: Text = Text(), for span in chunk: token = corpus_text[span[0]:span[1]] if not keep_formatting_chars: - new_token = util.remove_formatting_characters(token) + new_token = util.misc.remove_formatting_characters(token) # If this token consists entirely of formatting characters, don't remove them. Empty tokens are bad! if new_token: token = new_token @@ -84,7 +85,7 @@ def translate_tag(out: Output, Example mappings: parole_to_suc, suc_to_simple, ... """ if isinstance(mapping, str): - mapping = util.tagsets.mappings[mapping] + mapping = tagmappings.mappings[mapping] out.write((mapping.get(t, t) for t in tag.read())) @@ -96,7 +97,7 @@ def upostag(out: Output = Output(":misc.upos", cls="token:upos", descript out_annotation = [] for tag in pos_tags: - out_annotation.append(util.tagsets.pos_to_upos(tag, "swe", "SUC")) + out_annotation.append(pos_to_upos(tag, "swe", "SUC")) out.write(out_annotation) @@ -112,8 +113,8 @@ def ufeatstag(out: Output = Output(":misc.ufeats", cls="token:ufeats", out_annotation = [] for pos_tag, msd_tag in zip(pos_tags, msd_tags): - feats = util.tagsets.suc_to_feats(pos_tag, msd_tag) - out_annotation.append(util.cwbset(feats)) + feats = suc_to_feats(pos_tag, msd_tag) + out_annotation.append(util.misc.cwbset(feats)) out.write(out_annotation) @@ -141,7 +142,7 @@ def chain(out, annotations, default=None): if isinstance(annotations, str): annotations = annotations.split() annotations = [a.read() for a in annotations] - out.write(util.chain(annotations, default)) + out.write(util.misc.chain(annotations, default)) @annotator("Create new annotation, with spans as values") @@ -154,8 +155,8 @@ def span_as_value(chunk: Annotation, @annotator("Select a specific index from the values of an annotation") def select(out: Output, annotation: Annotation, - index: Optional[int] = 0, - separator: Optional[str] = " "): + index: int = 0, + separator: str = " "): """Select a specific index from the values of an annotation. 
The given annotation values are separated by 'separator', @@ -177,8 +178,8 @@ def constant(chunk: Annotation, @annotator("Add prefix and/or suffix to an annotation") def affix(chunk: Annotation, out: Output, - prefix: Optional[str] = "", - suffix: Optional[str] = ""): + prefix: str = "", + suffix: str = ""): """Add prefix and/or suffix to annotation.""" out.write([(prefix + val + suffix) for val in chunk.read()]) @@ -186,7 +187,7 @@ def affix(chunk: Annotation, @annotator("Replace every character in an annotation with an anonymous character") def anonymise(chunk: Annotation, out: Output, - anonym_char: Optional[str] = "*"): + anonym_char: str = "*"): """Replace every character in an annotation with an anonymous character (* per default).""" out.write([(anonym_char * len(val)) for val in chunk.read()]) @@ -195,7 +196,7 @@ def anonymise(chunk: Annotation, def replace(chunk: Annotation, out: Output, find: str = "", - sub: Optional[str] = ""): + sub: str = ""): """Find and replace whole annotation. Find string must match whole annotation.""" out.write((sub if val == find else val for val in chunk.read())) @@ -213,7 +214,7 @@ def replace_list(chunk: Annotation, find = find.split() sub = sub.split() if len(find) != len(sub): - raise util.SparvErrorMessage("Find and sub must have the same number of words.") + raise SparvErrorMessage("Find and sub must have the same number of words.") translate = dict((f, s) for (f, s) in zip(find, sub)) out.write((translate.get(val, val) for val in chunk.read())) @@ -274,6 +275,35 @@ def backoff(chunk: Annotation, out.write((val if val else backoff[n] for (n, val) in enumerate(chunk.read()))) +@annotator("Replace empty values in 'chunk' with values from 'backoff' and output info about which annotator each " + "annotation was produced with.") +def backoff_with_info( + chunk: Annotation, + backoff: Annotation, + out: Output, + out_info: Output, + chunk_name: str = "", + backoff_name: str = ""): + """Replace empty values in 'chunk' with values from 'backoff'.""" + backoffs = list(backoff.read()) + out_annotation = [] + out_info_annotation = [] + if not chunk_name: + chunk_name = chunk.name + if not backoff_name: + backoff_name = backoff.name + + for n, val in enumerate(chunk.read()): + if val: + out_annotation.append(val) + out_info_annotation.append(chunk_name) + else: + out_annotation.append(backoffs[n]) + out_info_annotation.append(backoff_name) + out.write(out_annotation) + out_info.write(out_info_annotation) + + @annotator("Replace values in 'chunk' with non empty values from 'repl'") def override(chunk: Annotation, repl: Annotation, @@ -297,3 +327,63 @@ def roundfloat(chunk: Annotation, decimals = int(decimals) strformat = "%." + str(decimals) + "f" out.write((strformat % round(float(val), decimals) for val in chunk.read())) + + +@annotator("Merge two annotations (which may be sets) into one set") +def merge_to_set(out: Output, + left: Annotation, + right: Annotation, + unique: bool = True, + sort: bool = True): + """Merge two sets of annotations (which may be sets) into one set. + + Setting unique to True will remove duplicate values. + Setting sort to True will sort the values within the new set. 
+ """ + le = left.read() + ri = right.read() + out_annotation = [] + for left_annot, right_annot in zip(le, ri): + annots = util.misc.set_to_list(left_annot) + util.misc.set_to_list(right_annot) + if unique: + annots = list(dict.fromkeys(annots)) + out_annotation.append(util.misc.cwbset(annots, sort=sort)) + out.write(out_annotation) + + +@annotator("Source filename as attribute on text annotation") +def source(out: Output = Output(":misc.source"), + name: SourceFilename = SourceFilename(), + text: Annotation = Annotation("")): + """Create a text attribute based on the filename of the source file.""" + out.write(name for _ in text.read()) + + +@annotator("Get the first annotation from a cwb set") +def first_from_set(out: Output, + chunk: Annotation,): + """"Get the first annotation from a set.""" + out_annotation = [] + for val in chunk.read(): + out_annotation.append(util.misc.set_to_list(val)[0] if util.misc.set_to_list(val) else "") + out.write(out_annotation) + + +@annotator("Get the best annotation from a cwb set with scores") +def best_from_set(out: Output, + chunk: Annotation, + is_sorted: bool = False, + score_sep = ":"): + """Get the best annotation from a set with scores. + + If 'is_sorted = True' the input is already sorted. In this case the first value is taken and its score is removed. + """ + out_annotation = [] + for val in chunk.read(): + if is_sorted: + values = [(v.split(score_sep)[1], v.split(score_sep)[0]) for v in util.misc.set_to_list(val)] + else: + values = sorted([(v.split(score_sep)[1], v.split(score_sep)[0]) for v in util.misc.set_to_list(val)], + key=lambda x:x[0], reverse=True) + out_annotation.append(values[0][1] if values else "") + out.write(out_annotation) diff --git a/sparv/modules/misc/number.py b/sparv/modules/misc/number.py index 333e84d6..540ed5db 100644 --- a/sparv/modules/misc/number.py +++ b/sparv/modules/misc/number.py @@ -5,11 +5,12 @@ from binascii import hexlify from collections import defaultdict -from sparv import AllDocuments, Annotation, AnnotationAllDocs, Output, OutputCommonData, Wildcard, annotator, util +from sparv.api import (AllSourceFilenames, Annotation, AnnotationAllSourceFiles, Output, OutputCommonData, Wildcard, annotator, + get_logger) START_DEFAULT = 1 -logger = util.get_logger(__name__) +logger = get_logger(__name__) @annotator("Number {annotation} by position", wildcards=[Wildcard("annotation", Wildcard.ANNOTATION)]) @@ -127,17 +128,26 @@ def number_relative(out: Output = Output("{annotation}:misc.number_rel_{parent}" out.write(result) +@annotator("Annotate tokens with IDs relative to their sentences") +def make_ref(out: Output = Output(":misc.ref", cls="token:ref", + description="Token IDs relative to their sentences"), + sentence: Annotation = Annotation(""), + token: Annotation = Annotation("")): + """Annotate tokens with IDs relative to their sentences.""" + number_relative(out, sentence, token) + + @annotator("Chunk count file with number of {annotation} chunks in corpus", order=1, wildcards=[ Wildcard("annotation", Wildcard.ANNOTATION)]) def count_chunks(out: OutputCommonData = OutputCommonData("misc.{annotation}_count"), - chunk: AnnotationAllDocs = AnnotationAllDocs("{annotation}"), - docs: AllDocuments = AllDocuments()): + chunk: AnnotationAllSourceFiles = AnnotationAllSourceFiles("{annotation}"), + files: AllSourceFilenames = AllSourceFilenames()): """Count the number of occurrences of 'chunk' in the corpus.""" # Read 'chunk' annotations and count the number of chunks chunk_count = 0 - for doc in docs: + for file in 
files: try: - chunk_count += len(list(chunk.read_spans(doc))) + chunk_count += chunk.get_size(file) except FileNotFoundError: pass @@ -151,7 +161,7 @@ def count_chunks(out: OutputCommonData = OutputCommonData("misc.{annotation}_cou @annotator("Create chunk count file for non-existent {annotation} chunks", order=2, wildcards=[ Wildcard("annotation", Wildcard.ANNOTATION)]) def count_zero_chunks(out: OutputCommonData = OutputCommonData("misc.{annotation}_count"), - docs: AllDocuments = AllDocuments()): + _files: AllSourceFilenames = AllSourceFilenames()): """Create chunk count file for non-existent 'annotation' chunks.""" logger.info(f"No {out.name[5:-6]} chunks found in corpus") out.write("0") diff --git a/sparv/modules/odt_import/__init__.py b/sparv/modules/odt_import/__init__.py new file mode 100644 index 00000000..5ba7c907 --- /dev/null +++ b/sparv/modules/odt_import/__init__.py @@ -0,0 +1,3 @@ +"""Import of odt source files.""" + +from . import odt_import diff --git a/sparv/modules/odt_import/odt_import.py b/sparv/modules/odt_import/odt_import.py new file mode 100644 index 00000000..c614bc07 --- /dev/null +++ b/sparv/modules/odt_import/odt_import.py @@ -0,0 +1,114 @@ +"""Import module for odt source files.""" + +import unicodedata +import xml.etree.ElementTree as etree +import zipfile + +from sparv.api import Config, SourceFilename, Output, Source, SourceStructure, Text, get_logger, importer, util + +logger = get_logger(__name__) + + +@importer("odt import", file_extension="odt", outputs=["text"], text_annotation="text", config=[ + Config("odt_import.prefix", "", description="Optional prefix to add to annotation names."), + Config("odt_import.keep_control_chars", False, description="Set to True if control characters should not be " + "removed from the text."), + Config("odt_import.normalize", "NFC", description="Normalize input using any of the following forms: " + "'NFC', 'NFKC', 'NFD', and 'NFKD'.") +]) +def parse(source_file: SourceFilename = SourceFilename(), + source_dir: Source = Source(), + prefix: str = Config("odt_import.prefix"), + keep_control_chars: bool = Config("odt_import.keep_control_chars"), + normalize: str = Config("odt_import.normalize")) -> None: + """Parse odt file as input to the Sparv Pipeline. + + Args: + source_file: The source filename. + source_dir: The source directory. + prefix: Optional prefix for output annotation. + keep_control_chars: Set to True to keep control characters in the text. + normalize: Normalize input text using any of the following forms: 'NFC', 'NFKC', 'NFD', and 'NFKD'. + 'NFC' is used by default. + """ + source_file_path = str(source_dir.get_path(source_file, ".odt")) + + # Parse odt and extract all text content + text = OdtParser(source_file_path).text + + if not keep_control_chars: + text = util.misc.remove_control_characters(text) + + if normalize: + text = unicodedata.normalize(normalize, text) + + Text(source_file).write(text) + + # Make up a text annotation surrounding the whole file + text_annotation = "{}.text".format(prefix) if prefix else "text" + Output(text_annotation, source_file=source_file).write([(0, len(text))]) + SourceStructure(source_file).write([text_annotation]) + + +class OdtParser(): + """ + Parse an odt file and extract its text content. 
+ + Inspired by https://github.com/deanmalmgren/textract + """ + + def __init__(self, filename): + self.filename = filename + self.extract() + + def extract(self): + """Extract text content from odt file.""" + # Get content XML file from ODT zip archive + with open(self.filename, "rb") as stream: + zip_stream = zipfile.ZipFile(stream) + content = etree.fromstring(zip_stream.read("content.xml")) + # Iterate the XML and extract all strings + self.text = "" + for child in content.iter(): + if child.tag in [self.ns("text:p"), self.ns("text:h")]: + self.text += self.get_text(child) + "\n\n" + # Remove the final two linebreaks + if self.text: + self.text = self.text[:-2] + + def get_text(self, element): + """Recursively extract all text from element.""" + buffer = "" + if element.text is not None: + buffer += element.text + for child in element: + if child.tag == self.ns("text:tab"): + buffer += "\t" + if child.tail is not None: + buffer += child.tail + elif child.tag == self.ns("text:s"): + buffer += " " + if child.get(self.ns("text:c")) is not None: + buffer += " " * (int(child.get(self.ns("text:c"))) - 1) + if child.tail is not None: + buffer += child.tail + # Add placeholders for images + elif child.tag == self.ns("drawing:image"): + image = child.get(self.ns("xmlns:href")) + if image: + buffer += f"----{image}----" + else: + buffer += self.get_text(child) + if element.tail is not None: + buffer += element.tail + return buffer + + def ns(self, tag): + """Get the name for 'tag' including its namespace.""" + nsmap = { + "text": "urn:oasis:names:tc:opendocument:xmlns:text:1.0", + "drawing": "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0", + "xmlns": "http://www.w3.org/1999/xlink" + } + domain, tagname = tag.split(":") + return f"{{{nsmap[domain]}}}{tagname}" diff --git a/sparv/modules/phrase_structure/phrase_structure.py b/sparv/modules/phrase_structure/phrase_structure.py index 5e2122ab..21f31781 100644 --- a/sparv/modules/phrase_structure/phrase_structure.py +++ b/sparv/modules/phrase_structure/phrase_structure.py @@ -1,12 +1,12 @@ """Module for converting Mamba-Dep dependencies to phrase structure trees.""" -import logging import pprint from collections import defaultdict -from sparv import Annotation, Output, annotator +from sparv.api import Annotation, Output, annotator, get_logger + +logger = get_logger(__name__) -log = logging.getLogger(__name__) @annotator("Convert Mamba-Dep dependencies into phrase structure", language=["swe"]) @@ -20,7 +20,7 @@ def annotate(out_phrase: Output = Output("phrase_structure.phrase", description= sentence: Annotation = Annotation(""), pos: Annotation = Annotation(""), msd: Annotation = Annotation(""), - ref: Annotation = Annotation(":misc.number_rel_"), + ref: Annotation = Annotation(""), dephead_ref: Annotation = Annotation(""), deprel: Annotation = Annotation("")): """Annotate sentence with phrase structures.""" @@ -47,31 +47,31 @@ def get_token_span(index): # Make nodes children = flatten_tree(tree[1], []) - log.debug("\n\nSENTENCE:") + logger.debug("\n\nSENTENCE:") position = 0 open_elem_stack = [] for child in children: if not child[0].startswith("WORD:"): start_pos = get_token_span(s[position])[0] open_elem_stack.append(child + (start_pos,)) - log.debug(f" {s[position]}") + logger.debug(f" {s[position]}") else: # Close nodes while open_elem_stack[-1][2] == child[2]: start_pos = open_elem_stack[-1][3] end_pos = get_token_span(s[position - 1])[1] nodes.append(((start_pos, end_pos), open_elem_stack[-1][0], open_elem_stack[-1][1])) - log.debug(f" 
{start_pos}-{end_pos}") + logger.debug(f" {start_pos}-{end_pos}") open_elem_stack.pop() position += 1 - log.debug(f" {child[0][5:]}") + logger.debug(f" {child[0][5:]}") # Close remaining open nodes end_pos = get_token_span(s[-1])[1] for elem in reversed(open_elem_stack): start_pos = elem[3] nodes.append(((start_pos, end_pos), elem[0], elem[1])) - log.debug(f" {start_pos}-{end_pos}") + logger.debug(f" {start_pos}-{end_pos}") # Sort nodes sorted_nodes = sorted(nodes) diff --git a/sparv/modules/readability/readability.py b/sparv/modules/readability/readability.py index 6c055a7b..c7da9711 100644 --- a/sparv/modules/readability/readability.py +++ b/sparv/modules/readability/readability.py @@ -3,7 +3,9 @@ from math import log from typing import List -from sparv import Annotation, Output, annotator +from sparv.api import Annotation, Output, annotator, get_logger + +logger = get_logger(__name__) @annotator("Annotate text chunks with LIX values") @@ -17,6 +19,7 @@ def lix(text: Annotation = Annotation(""), """Create LIX annotation for text.""" # Read annotation files and get parent_children relations text_children, _orphans = text.get_children(sentence) + logger.progress(total=len(text_children) + 1) word_pos = list(word.read_attributes((word, pos))) sentence_children, _orphans = sentence.get_children(word) sentence_children = list(sentence_children) @@ -29,8 +32,10 @@ def lix(text: Annotation = Annotation(""), s = sentence_children[sentence_index] in_sentences.append(list(actual_words([word_pos[token_index] for token_index in s], skip_pos))) lix_annotation.append(fmt % lix_calc(in_sentences)) + logger.progress() out.write(lix_annotation) + logger.progress() def lix_calc(sentences): @@ -65,6 +70,7 @@ def ovix(text: Annotation = Annotation(""), fmt: str = "%.2f"): """Create OVIX annotation for text.""" text_children, _orphans = text.get_children(word) + logger.progress(total=len(text_children) + 1) word_pos = list(word.read_attributes((word, pos))) # Calculate OVIX for every text element @@ -72,8 +78,10 @@ def ovix(text: Annotation = Annotation(""), for text in text_children: in_words = list(actual_words([word_pos[token_index] for token_index in text], skip_pos)) ovix_annotation.append(fmt % ovix_calc(in_words)) + logger.progress() out.write(ovix_annotation) + logger.progress() def ovix_calc(words): @@ -116,6 +124,7 @@ def nominal_ratio(text: Annotation = Annotation(""), fmt: str = "%.2f"): """Create nominal ratio annotation for text.""" text_children, _orphans = text.get_children(pos) + logger.progress(total=len(text_children) + 1) pos_annotation = list(pos.read()) # Calculate OVIX for every text element @@ -123,7 +132,9 @@ def nominal_ratio(text: Annotation = Annotation(""), for text in text_children: in_pos = [pos_annotation[token_index] for token_index in text] nk_annotation.append(fmt % nominal_ratio_calc(in_pos, noun_pos, verb_pos)) + logger.progress() out.write(nk_annotation) + logger.progress() def nominal_ratio_calc(pos: List[str], noun_pos: List[str], verb_pos: List[str]): diff --git a/sparv/modules/saldo/compound.py b/sparv/modules/saldo/compound.py index 76db3925..f3ffdffe 100644 --- a/sparv/modules/saldo/compound.py +++ b/sparv/modules/saldo/compound.py @@ -1,7 +1,6 @@ """Compound analysis.""" import itertools -import logging import pathlib import pickle import re @@ -9,13 +8,16 @@ import xml.etree.ElementTree as etree from functools import reduce -import sparv.util as util -from sparv import Annotation, Config, Model, ModelOutput, Output, annotator, modelbuilder +from sparv.api import 
Annotation, Config, Model, ModelOutput, Output, annotator, get_logger, modelbuilder, util +from sparv.api.util.tagsets import tagmappings -log = logging.getLogger(__name__) +logger = get_logger(__name__) +MAX_WORD_LEN = 75 SPLIT_LIMIT = 200 COMP_LIMIT = 100 +INVALID_PREFIXES = ("http:", "https:", "www.") +INVALID_REGEX = re.compile(r"(..?)\1{3}") # SALDO: Delimiters that hopefully are never found in an annotation or in a POS tag: PART_DELIM = "^" @@ -33,7 +35,8 @@ def preloader(saldo_comp_model, stats_model): Config("saldo.comp_model", default="saldo/saldo.compound.pickle", description="Path to SALDO compound model"), Config("saldo.comp_nst_model", default="saldo/nst_comp_pos.pickle", description="Path to NST part of speech compound model"), - Config("saldo.comp_stats_model", default="saldo/stats.pickle", description="Path to statistics model") + Config("saldo.comp_stats_model", default="saldo/stats.pickle", description="Path to statistics model"), + Config("saldo.comp_use_source", default=True, description="Also use source text as lexicon for compound analysis") ], preloader=preloader, preloader_params=["saldo_comp_model", "stats_model"], preloader_target="preloaded_models") def annotate(out_complemgrams: Output = Output(":saldo.complemgram", description="Compound analysis using lemgrams"), @@ -46,10 +49,11 @@ def annotate(out_complemgrams: Output = Output(":saldo.complemgram", saldo_comp_model: Model = Model("[saldo.comp_model]"), nst_model: Model = Model("[saldo.comp_nst_model]"), stats_model: Model = Model("[saldo.comp_stats_model]"), - complemgramfmt: str = util.SCORESEP + "%.3e", - delimiter: str = util.DELIM, - compdelim: str = util.COMPSEP, - affix: str = util.AFFIX, + comp_use_source: bool = Config("saldo.comp_use_source"), + complemgramfmt: str = util.constants.SCORESEP + "%.3e", + delimiter: str = util.constants.DELIM, + compdelim: str = util.constants.COMPSEP, + affix: str = util.constants.AFFIX, cutoff: bool = True, preloaded_models=None): """Divide compound words into prefix(es) and suffix. 
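The new module-level constants above (`MAX_WORD_LEN`, `INVALID_PREFIXES`, `INVALID_REGEX`) are used further down in `compound()` to skip word forms that are pointless to analyse. A standalone sketch of that guard, reusing the values from this patch (`should_analyse` is a made-up name for illustration):

```python
import re

# Values mirrored from the patch above.
MAX_WORD_LEN = 75
INVALID_PREFIXES = ("http:", "https:", "www.")
INVALID_REGEX = re.compile(r"(..?)\1{3}")  # the same one- or two-character sequence four times in a row


def should_analyse(word: str) -> bool:
    """Return True if the word is worth sending to compound analysis (illustrative helper)."""
    if len(word) > MAX_WORD_LEN:
        return False  # extremely long tokens are unlikely to be real compounds
    if any(word.startswith(prefix) for prefix in INVALID_PREFIXES):
        return False  # URLs and web addresses
    if INVALID_REGEX.search(word):
        return False  # degenerate repetitions such as "hahahahaha"
    return True


for w in ("sommarkväll", "https://example.com", "hahahahaha", "x" * 80):
    print(w[:30], should_analyse(w))
```

In the patch itself the same check appears at the top of `compound()` as `if len(w) > MAX_WORD_LEN or INVALID_REGEX.search(w) or any(w.startswith(p) for p in INVALID_PREFIXES): return []`.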
@@ -67,6 +71,7 @@ def annotate(out_complemgrams: Output = Output(":saldo.complemgram", (use empty string to omit probablility) - preloaded_models: Preloaded models if using preloader """ + logger.progress() ################## # Load models ################## @@ -80,9 +85,10 @@ def annotate(out_complemgrams: Output = Output(":saldo.complemgram", nst_model = pickle.load(f) word_msd_baseform_annotations = list(word.read_attributes((word, msd, baseform_tmp))) + logger.progress(total=len(word_msd_baseform_annotations) + 3) - # Create alternative lexicon (for words within the file) - altlexicon = InFileLexicon(word_msd_baseform_annotations) + # Create alternative lexicon (for words within the source file) + altlexicon = InFileLexicon(word_msd_baseform_annotations if comp_use_source else []) ################## # Do annotation @@ -126,9 +132,14 @@ def annotate(out_complemgrams: Output = Output(":saldo.complemgram", else: make_new_baseforms(baseform_annotation, msd, compounds, stats_lexicon, altlexicon, delimiter, affix) + logger.progress() + out_complemgrams.write(complem_annotation) + logger.progress() out_compwf.write(compwf_annotation) + logger.progress() out_baseform.write(baseform_annotation) + logger.progress() @modelbuilder("SALDO compound model", language=["swe"]) @@ -148,11 +159,11 @@ class SaldoCompLexicon: def __init__(self, saldofile: pathlib.Path, verbose=True): """Load lexicon.""" if verbose: - log.info("Reading Saldo lexicon: %s", saldofile) + logger.info("Reading Saldo lexicon: %s", saldofile) with open(saldofile, "rb") as F: self.lexicon = pickle.load(F) if verbose: - log.info("OK, read %d words", len(self.lexicon)) + logger.info("OK, read %d words", len(self.lexicon)) def lookup(self, word): """Lookup a word in the lexicon.""" @@ -197,11 +208,11 @@ class StatsLexicon: def __init__(self, stats_model: pathlib.Path, verbose=True): """Load lexicon.""" if verbose: - log.info("Reading statistics model: %s", stats_model) + logger.info("Reading statistics model: %s", stats_model) with open(stats_model, "rb") as s: self.lexicon = pickle.load(s) if verbose: - log.info("Done") + logger.info("Done") def lookup_prob(self, word): """Look up the probability of the word.""" @@ -213,13 +224,13 @@ def lookup_word_tag_freq(self, word, tag): class InFileLexicon: - """A dictionary of all words occuring in the input file. + """A dictionary of all words occurring in the input file. 
keys = words, values = MSD tags """ - def __init__(self, annotations): - """Create a lexicon for the words occuring in this file.""" + def __init__(self, annotations: list): + """Create a lexicon for the words occurring in this file.""" lex = {} for word, msd, _ in annotations: w = word.lower() @@ -254,7 +265,7 @@ def get_suffixes(self, suffix, msd=None): def split_word(saldo_lexicon, altlexicon, w, msd): """Split word w into every possible combination of substrings.""" - MAX_ITERATIONS = 500000 + MAX_ITERATIONS = 250000 MAX_TIME = 20 # Seconds invalid_spans = set() valid_spans = set() @@ -275,11 +286,11 @@ def split_word(saldo_lexicon, altlexicon, w, msd): iterations += 1 if iterations > MAX_ITERATIONS: giveup = True - log.info("Too many iterations for word '%s'", w) + logger.info("Too many iterations for word '%s'", w) break if time.time() - start_time > MAX_TIME: giveup = True - log.info("Compound analysis took to long for word '%s'", w) + logger.info("Compound analysis took to long for word '%s'", w) break if first: @@ -377,7 +388,7 @@ def split_word(saldo_lexicon, altlexicon, w, msd): counter += 1 if counter > SPLIT_LIMIT: giveup = True - log.info("Too many possible compounds for word '%s'" % w) + logger.info("Too many possible compounds for word '%s'" % w) break yield comp @@ -449,7 +460,7 @@ def deep_len(lst): def compound(saldo_lexicon, altlexicon, w, msd=None): """Create a list of compound analyses for word w.""" - if len(w) > 75 or re.search(r"(.)\1{4,}", w): + if len(w) > MAX_WORD_LEN or INVALID_REGEX.search(w) or any(w.startswith(p) for p in INVALID_PREFIXES): return [] in_compounds = list(split_word(saldo_lexicon, altlexicon, w, msd)) @@ -529,8 +540,8 @@ def make_complem_and_compwf(out_complem, out_compwf, complemgramfmt, compounds, compwf_list.append(wf) # Add to annotations - out_complem.append(util.cwbset(complem_list, delimiter, affix) if compounds and complem_list else affix) - out_compwf.append(util.cwbset(compwf_list, delimiter, affix) if compounds else affix) + out_complem.append(util.misc.cwbset(complem_list, delimiter, affix) if compounds and complem_list else affix) + out_compwf.append(util.misc.cwbset(compwf_list, delimiter, affix) if compounds else affix) def make_new_baseforms(out_baseform, msd_tag, compounds, stats_lexicon, altlexicon, delimiter, affix): @@ -560,13 +571,13 @@ def make_new_baseforms(out_baseform, msd_tag, compounds, stats_lexicon, altlexic baseform_list.append(baseform) # Add to annotation - out_baseform.append(util.cwbset(baseform_list, delimiter, affix) if (compounds and baseform_list) else affix) + out_baseform.append(util.misc.cwbset(baseform_list, delimiter, affix) if (compounds and baseform_list) else affix) def read_lmf(xml: pathlib.Path, tagset: str = "SUC"): """Read the XML version of SALDO's morphological lexicon (saldom.xml).""" - tagmap = util.tagsets.mappings["saldo_to_" + tagset.lower() + "_compound"] - log.info("Reading XML lexicon") + tagmap = tagmappings.mappings["saldo_to_" + tagset.lower() + "_compound"] + logger.info("Reading XML lexicon") lexicon = {} context = etree.iterparse(xml, events=("start", "end")) # "start" needed to save reference to root element @@ -602,7 +613,7 @@ def read_lmf(xml: pathlib.Path, tagset: str = "SUC"): if elem.tag in ["LexicalEntry", "frame", "resFrame"]: root.clear() - log.info("OK, read") + logger.info("OK, read") return lexicon @@ -613,7 +624,7 @@ def save_to_picklefile(saldofile, lexicon, protocol=-1, verbose=True): - lexicon = {wordform: {lemgram: {"msd": set(), "pos": str}}} """ if verbose: - 
log.info("Saving Saldo lexicon in Pickle format") + logger.info("Saving Saldo lexicon in Pickle format") picklex = {} for word in lexicon: @@ -629,4 +640,4 @@ def save_to_picklefile(saldofile, lexicon, protocol=-1, verbose=True): with open(saldofile, "wb") as F: pickle.dump(picklex, F, protocol=protocol) if verbose: - log.info("OK, saved") + logger.info("OK, saved") diff --git a/sparv/modules/saldo/nst_comp_model.py b/sparv/modules/saldo/nst_comp_model.py index cfae0223..5f34c89b 100644 --- a/sparv/modules/saldo/nst_comp_model.py +++ b/sparv/modules/saldo/nst_comp_model.py @@ -1,14 +1,13 @@ """Train a POS probability model on the NST lexicon.""" -import logging import pickle import re from nltk import FreqDist, LidstoneProbDist -from sparv import Model, ModelOutput, modelbuilder +from sparv.api import Model, ModelOutput, get_logger, modelbuilder -log = logging.getLogger(__name__) +logger = get_logger(__name__) @modelbuilder("Compound POS model", language=["swe"], order=1) @@ -25,7 +24,7 @@ def build_nst_comp(out: ModelOutput = ModelOutput("saldo/nst_comp_pos.pickle"), The NST lexicon can be retrieved from SVN with credentials: svn export https://svn.spraakdata.gu.se/sb-arkiv/lexikon/NST_svensk_leksikon/nst_utf8.txt saldo/nst_utf8.txt """ - log.info("Building compound POS probability model...") + logger.info("Building compound POS probability model...") make_model(nst_lexicon, out) diff --git a/sparv/modules/saldo/saldo.py b/sparv/modules/saldo/saldo.py index c71ea33e..3573b0e2 100644 --- a/sparv/modules/saldo/saldo.py +++ b/sparv/modules/saldo/saldo.py @@ -1,15 +1,14 @@ """Create annotations from SALDO.""" import itertools -import logging import re from typing import List, Optional -import sparv.util as util -from sparv import Annotation, Config, Model, Output, annotator -from sparv.modules.saldo.saldo_model import SaldoLexicon +from sparv.api import Annotation, Config, Model, Output, annotator, get_logger, util -log = logging.getLogger(__name__) +from .saldo_model import SaldoLexicon + +logger = get_logger(__name__) # The minimum precision difference for two annotations to be considered equal PRECISION_DIFF = 0.01 @@ -24,65 +23,101 @@ def preloader(models): @annotator("SALDO annotations", language=["swe"], config=[ Config("saldo.model", default="saldo/saldo.pickle", description="Path to SALDO model"), + Config("saldo.delimiter", default=util.constants.DELIM, description="Character to put between ambiguous results"), + Config("saldo.affix", default=util.constants.AFFIX, description="Character to put before and after sets of results"), Config("saldo.precision", "", - description="Format string for appending precision to each value (e.g. ':%.3f')") + description="Format string for appending precision to each value (e.g. ':%.3f')"), + Config("saldo.precision_filter", default="max", + description="Precision filter with values 'max' (only use the annotations that are most probable), " + "'first' (only use the most probable annotation(s)), 'none' (use all annotations)"), + Config("saldo.min_precision", default=0.66, + description="Only use annotations with a probability score higher than this"), + Config("saldo.skip_multiword", default=False, description="Whether to disable annotation of multiword expressions"), + Config("saldo.max_mwe_gaps", default=1, description="Max amount of gaps allowed within a multiword expression"), + Config("saldo.allow_multiword_overlap", default=False, + description="Whether all multiword expressions may overlap with each other. 
" + "If set to False, some cleanup is done."), + Config("saldo.word_separator", default="", + description="Character used to split the values of 'word' into several word variations"), ], preloader=preloader, preloader_params=["models"], preloader_target="models_preloaded") def annotate(token: Annotation = Annotation(""), word: Annotation = Annotation(""), sentence: Annotation = Annotation(""), - reference: Annotation = Annotation(":misc.number_rel_"), - out_sense: Output = Output(":saldo.sense", cls="token:sense", description="SALDO identifier"), - out_lemgram: Output = Output(":saldo.lemgram", description="SALDO lemgram"), + reference: Annotation = Annotation(""), + out_sense: Output = Output(":saldo.sense", cls="token:sense", description="SALDO identifiers"), + out_lemgram: Output = Output(":saldo.lemgram", cls="token:lemgram", description="SALDO lemgrams"), out_baseform: Output = Output(":saldo.baseform", cls="token:baseform", - description="Baseform from SALDO"), + description="Baseforms from SALDO"), models: List[Model] = [Model("[saldo.model]")], msd: Optional[Annotation] = Annotation(""), - delimiter: str = util.DELIM, - affix: str = util.AFFIX, + delimiter: str = Config("saldo.delimiter"), + affix: str = Config("saldo.affix"), precision: str = Config("saldo.precision"), - precision_filter: str = "max", - min_precision: float = 0.66, - skip_multiword: bool = False, - allow_multiword_overlap: bool = False, - word_separator: str = "", + precision_filter: str = Config("saldo.precision_filter"), + min_precision: float = Config("saldo.min_precision"), + skip_multiword: bool = Config("saldo.skip_multiword"), + max_gaps: int = Config("saldo.max_mwe_gaps"), + allow_multiword_overlap: bool = Config("saldo.allow_multiword_overlap"), + word_separator: str = Config("saldo.word_separator"), models_preloaded: Optional[dict] = None): - """Use the Saldo lexicon model (and optionally other older lexicons) to annotate pos-tagged words. - - - token, word, msd, sentence, reference: existing annotations - - out_baseform, out_lemgram, out_sense: resulting annotations to be written - - models: a list of pickled lexica, typically the Saldo model (saldo.pickle) - and optional lexicons for older Swedish. - - delimiter: delimiter character to put between ambiguous results - - affix: an optional character to put before and after results - - precision: a format string for how to print the precision for each annotation, e.g. ":%.3f" - (use empty string for no precision) - - precision_filter: an optional filter, currently there are the following values: - max: only use the annotations that are most probable - first: only use the most probable annotation (or one of the most probable if more than one) - none: use all annotations - - min_precision: only use annotations with a probability score higher than this - - skip_multiword: set to True to disable multi word annotations - - allow_multiword_overlap: by default we do some cleanup among overlapping multi word annotations. - By setting this to True, all overlaps will be allowed. - - word_separator: an optional character used to split the values of "word" into several word variations - - models_preloaded: Preloaded models. + """Use the Saldo lexicon model to annotate msd-tagged words. + + Args: + token (Annotation): Input annotation with token spans. Defaults to Annotation(""). + word (Annotation): Input annotation with token strings. Defaults to Annotation(""). + sentence (Annotation): Input annotation with sentence spans. Defaults to Annotation(""). 
+        reference (Annotation): Input annotation with token indices for each sentence. +            Defaults to Annotation(""). +        out_sense (Output): Output annotation with senses from SALDO. Defaults to Output(":saldo.sense"). +        out_lemgram (Output): Output annotation with lemgrams from SALDO. Defaults to Output(":saldo.lemgram"). +        out_baseform (Output): Output annotation with baseforms from SALDO. +            Defaults to Output(":saldo.baseform"). +        models (List[Model]): A list of pickled lexicons, typically the SALDO model (saldo.pickle) +            and optional lexicons for older Swedish. Defaults to [Model("[saldo.model]")]. +        msd (Annotation, optional): Input annotation with POS and morphological descriptions. +            Defaults to Annotation(""). +        delimiter (str): Character to put between ambiguous results. Defaults to Config("saldo.delimiter"). +        affix (str): Character to put before and after sets of results. Defaults to Config("saldo.affix"). +        precision (str): Format string for appending precision to each value (e.g. ':%.3f'). Use empty string for no +            precision. Defaults to Config("saldo.precision"). +        precision_filter (str): Precision filter with values 'max' (only use the annotations that are most probable), +            'first' (only use the most probable annotation(s)), 'none' (use all annotations). +            Defaults to Config("saldo.precision_filter"). +        min_precision (float): Only use annotations with a probability score higher than this. +            Defaults to Config("saldo.min_precision"). +        skip_multiword (bool): Whether to disable annotation of multiword expressions. +            Defaults to Config("saldo.skip_multiword"). +        max_gaps (int): Maximum number of gaps allowed within a multiword expression. Defaults to Config("saldo.max_mwe_gaps"). +        allow_multiword_overlap (bool): Whether all multiword expressions may overlap with each other. If set to False, +            some cleanup is done. Defaults to Config("saldo.allow_multiword_overlap"). +        word_separator (str): Character used to split the values of 'word' into several word variations. +            Defaults to Config("saldo.word_separator"). +        models_preloaded (dict, optional): Preloaded models. Defaults to None. """ +    main(token=token, word=word, sentence=sentence, reference=reference, out_sense=out_sense, out_lemgram=out_lemgram, +         out_baseform=out_baseform, models=models, msd=msd, delimiter=delimiter, affix=affix, precision=precision, +         precision_filter=precision_filter, min_precision=min_precision, skip_multiword=skip_multiword, +         max_gaps=max_gaps, allow_multiword_overlap=allow_multiword_overlap, word_separator=word_separator, +         models_preloaded=models_preloaded) + + +def main(token, word, sentence, reference, out_sense, out_lemgram, out_baseform, models, msd, delimiter, affix, +         precision, precision_filter, min_precision, skip_multiword, max_gaps, allow_multiword_overlap, word_separator, +         models_preloaded): +    """Do SALDO annotations with models.""" # Allow use of multiple lexicons +    logger.progress() models_list = [(m.path.stem, m) for m in models] if not models_preloaded: lexicon_list = [(name, SaldoLexicon(lex.path)) for name, lex in models_list] -    # Use pre-loaded lexicons (from catapult) +    # Use pre-loaded lexicons else: lexicon_list = [] for name, _lex in models_list: assert models_preloaded.get(name, None) is not None, "Lexicon %s not found!" % name lexicon_list.append((name, models_preloaded[name])) -    # Maximum number of gaps in multi-word units. -    # TODO: Set to 0 for hist-mode? since many (most?)
multi-word in the old lexicons are inseparable (half öre etc) - max_gaps = 1 - - # Combine annotation names i SALDO lexicon with out annotations + # Combine annotation names in SALDO lexicon with out annotations annotations = [] if out_baseform: annotations.append((out_baseform, "gf")) @@ -92,7 +127,7 @@ def annotate(token: Annotation = Annotation(""), annotations.append((out_sense, "saldo")) if skip_multiword: - log.info("Skipping multi word annotations") + logger.info("Skipping multi word annotations") min_precision = float(min_precision) @@ -104,14 +139,17 @@ def annotate(token: Annotation = Annotation(""), ref_annotation = list(reference.read()) if msd: msd_annotation = list(msd.read()) + else: + msd_annotation = word.create_empty_attribute() sentences, orphans = sentence.get_children(token) sentences.append(orphans) if orphans: - log.warning(f"Found {len(orphans)} tokens not belonging to any sentence. These will not be annotated.") + logger.warning(f"Found {len(orphans)} tokens not belonging to any sentence. These will not be annotated.") out_annotation = word.create_empty_attribute() + logger.progress(total=len(sentences) + 1) for sent in sentences: incomplete_multis = [] # [{annotation, words, [ref], is_particle, lastwordWasGap, numberofgaps}] @@ -133,22 +171,23 @@ def annotate(token: Annotation = Annotation(""), thewords = [theword] # First use MSD tags to find the most probable single word annotations - ann_tags_words = find_single_word(thewords, lexicon_list, msdtag, precision, min_precision, - precision_filter, annotation_info) + ann_tags_words = _find_single_word(thewords, lexicon_list, msdtag, precision, min_precision, + precision_filter, annotation_info) # Find multi-word expressions if not skip_multiword: - find_multiword_expressions(incomplete_multis, complete_multis, thewords, ref, msdtag, max_gaps, - ann_tags_words, msd_annotation, sent, skip_pos_check) + _find_multiword_expressions(incomplete_multis, complete_multis, thewords, ref, msdtag, max_gaps, + ann_tags_words, msd_annotation, sent, skip_pos_check) # Loop to next token + logger.progress() if not allow_multiword_overlap: # Check that we don't have any unwanted overlaps - remove_unwanted_overlaps(complete_multis) + _remove_unwanted_overlaps(complete_multis) # Then save the rest of the multi word expressions in sentence_tokens - save_multiwords(complete_multis, sentence_tokens) + _save_multiwords(complete_multis, sentence_tokens) for tok in list(sentence_tokens.values()): out_annotation[tok["token_index"]] = _join_annotation(tok["annotations"], delimiter, affix) @@ -157,14 +196,14 @@ def annotate(token: Annotation = Annotation(""), for out_annotation_obj, annotation_name in annotations: out_annotation_obj.write([v.get(annotation_name, delimiter) if v is not None else None for v in out_annotation]) + logger.progress() ################################################################################ # Auxiliaries ################################################################################ - -def find_single_word(thewords, lexicon_list, msdtag, precision, min_precision, precision_filter, annotation_info): +def _find_single_word(thewords, lexicon_list, msdtag, precision, min_precision, precision_filter, annotation_info): ann_tags_words = [] for w in thewords: @@ -177,7 +216,7 @@ def find_single_word(thewords, lexicon_list, msdtag, precision, min_precision, p for a in lexicon.lookup(w): annotation.append(a + (prefix,)) ann_tags_words += annotation - # Set break if each word only gets annotations from first lexicon that 
has entry for word + # # Set break if each word only gets annotations from first lexicon that has entry for word # break annotation_precisions = [(get_precision(msdtag, msdtags), annotation, prefix) @@ -185,7 +224,7 @@ def find_single_word(thewords, lexicon_list, msdtag, precision, min_precision, p if min_precision > 0: annotation_precisions = [x for x in annotation_precisions if x[0] >= min_precision] - annotation_precisions = normalize_precision(annotation_precisions) + annotation_precisions = _normalize_precision(annotation_precisions) annotation_precisions.sort(reverse=True, key=lambda x: x[0]) if precision_filter and annotation_precisions: @@ -223,8 +262,8 @@ def ismax(lemprec): return ann_tags_words -def find_multiword_expressions(incomplete_multis, complete_multis, thewords, ref, msdtag, max_gaps, ann_tags_words, - msd_annotation, sent, skip_pos_check): +def _find_multiword_expressions(incomplete_multis, complete_multis, thewords, ref, msdtag, max_gaps, ann_tags_words, + msd_annotation, sent, skip_pos_check): todelfromincomplete = [] # list to keep track of which expressions that have been completed for i, x in enumerate(incomplete_multis): @@ -300,12 +339,20 @@ def find_multiword_expressions(incomplete_multis, complete_multis, thewords, ref incomplete_multis.extend(looking_for) -def remove_unwanted_overlaps(complete_multis): +def _remove_unwanted_overlaps(complete_multis): + """Remove certain overlapping MWEs if they have identical POS (remove 'a' if 'b1 a1 b2 a2' or 'a1 b1 ab2').""" remove = set() for ai, a in enumerate(complete_multis): + # For historical texts: Since we allow many words for one token (spelling variations) we must make sure that + # two words of an MWE are not made up by two variants of one token. That is, that the same ref ID is not + # used twice in an MWE. + if len(set(a[0])) != len(a[0]): + remove.add(ai) + continue for b in complete_multis: # Check if both are of same POS - if not a == b and re.search(r"\.(\w\w?)m?\.", a[1]["lem"][0]).groups()[0] == re.search(r"\.(\w\w?)m?\.", b[1]["lem"][0]).groups()[0]: + if not a == b and re.search(r"\.(\w\w?)m?\.", a[1]["lem"][0]).groups()[0] == re.search( + r"\.(\w\w?)m?\.", b[1]["lem"][0]).groups()[0]: if b[0][0] < a[0][0] < b[0][-1] < a[0][-1]: # A case of b1 a1 b2 a2. Remove a. 
remove.add(ai) @@ -317,7 +364,7 @@ def remove_unwanted_overlaps(complete_multis): del complete_multis[a] -def save_multiwords(complete_multis, sentence_tokens): +def _save_multiwords(complete_multis, sentence_tokens): for c in complete_multis: first = True first_ref = "" @@ -332,8 +379,9 @@ def save_multiwords(complete_multis, sentence_tokens): def _join_annotation(annotation, delimiter, affix): - seen = set() - return dict([(a, affix + delimiter.join(b for b in annotation[a] if b not in seen and not seen.add(b)) + affix) for a in annotation]) + """Convert annotations into cwb sets with unique values.""" + return dict([(a, util.misc.cwbset(list(dict.fromkeys(annotation[a])), delimiter=delimiter, affix=affix)) + for a in annotation]) def get_precision(msd, msdtags): @@ -350,7 +398,7 @@ def get_precision(msd, msdtags): 0.25) -def normalize_precision(annotations): +def _normalize_precision(annotations): """Normalize the rankings in the annotation list so that the sum is 1.""" total_precision = sum(prec for (prec, _annotation, prefix) in annotations) return [(prec / total_precision, annotation, prefix) for (prec, annotation, prefix) in annotations] diff --git a/sparv/modules/saldo/saldo_model.py b/sparv/modules/saldo/saldo_model.py index a4272471..7d3c6ea7 100644 --- a/sparv/modules/saldo/saldo_model.py +++ b/sparv/modules/saldo/saldo_model.py @@ -1,15 +1,15 @@ """SALDO Model builders.""" -import logging import pathlib import pickle import re import xml.etree.ElementTree as etree -import sparv.util as util -from sparv import Model, ModelOutput, modelbuilder +from sparv.api import Model, ModelOutput, get_logger, modelbuilder, util +from sparv.api.util.tagsets import tagmappings + +logger = get_logger(__name__) -log = logging.getLogger(__name__) # SALDO: Delimiters that hopefully are never found in an annotation or in a POS tag: PART_DELIM = "^" @@ -28,7 +28,8 @@ def download_saldo(out: ModelOutput = ModelOutput("saldo/saldom.xml")): def build_saldo(out: ModelOutput = ModelOutput("saldo/saldo.pickle"), saldom: Model = Model("saldo/saldom.xml")): """Save SALDO morphology as a pickle file.""" - lmf_to_pickle(saldom.path, out.path) + tagmap = tagmappings.mappings["saldo_to_suc"] + lmf_to_pickle(saldom.path, out.path, tagmap) class SaldoLexicon: @@ -40,7 +41,7 @@ class SaldoLexicon: def __init__(self, saldofile: pathlib.Path, verbose=True): """Read lexicon.""" if verbose: - log.info("Reading Saldo lexicon: %s", saldofile) + logger.info("Reading Saldo lexicon: %s", saldofile) if saldofile.suffix == ".pickle": with open(saldofile, "rb") as F: self.lexicon = pickle.load(F) @@ -48,11 +49,11 @@ def __init__(self, saldofile: pathlib.Path, verbose=True): lexicon = self.lexicon = {} with open(saldofile, "rb") as F: for line in F: - row = line.decode(util.UTF8).split() + row = line.decode(util.constants.UTF8).split() word = row.pop(0) lexicon[word] = row if verbose: - log.info("OK, read %d words", len(self.lexicon)) + logger.info("OK, read %d words", len(self.lexicon)) def lookup(self, word): """Lookup a word in the lexicon. 
@@ -73,7 +74,7 @@ def save_to_picklefile(saldofile, lexicon, protocol=-1, verbose=True): - lexicon = {wordform: {{annotation-type: annotation}: (set(possible tags), set(tuples with following words), gap-allowed-boolean, is-particle-verb-boolean)}} """ if verbose: - log.info("Saving LMF lexicon in Pickle format") + logger.info("Saving LMF lexicon in Pickle format") picklex = {} for word in lexicon: @@ -92,7 +93,7 @@ def save_to_picklefile(saldofile, lexicon, protocol=-1, verbose=True): with open(saldofile, "wb") as F: pickle.dump(picklex, F, protocol=protocol) if verbose: - log.info("OK, saved") + logger.info("OK, saved") @staticmethod def save_to_textfile(saldofile, lexicon, verbose=True): @@ -103,14 +104,14 @@ def save_to_textfile(saldofile, lexicon, verbose=True): NOT UP TO DATE """ if verbose: - log.info("Saving LMF lexicon in text format") + logger.info("Saving LMF lexicon in text format") with open(saldofile, "w") as F: for word in sorted(lexicon): annotations = [PART_DELIM.join([annotation] + sorted(postags)) for annotation, postags in list(lexicon[word].items())] - print(" ".join([word] + annotations).encode(util.UTF8), file=F) + print(" ".join([word] + annotations).encode(util.constants.UTF8), file=F) if verbose: - log.info("OK, saved") + logger.info("OK, saved") def split_triple(annotation_tag_words): @@ -133,23 +134,21 @@ def split_triple(annotation_tag_words): ################################################################################ -def lmf_to_pickle(xml, filename, annotation_elements=("gf", "lem", "saldo")): +def lmf_to_pickle(xml, filename, tagmap, annotation_elements=("gf", "lem", "saldo")): """Read an XML dictionary and save as a pickle file.""" - xml_lexicon = read_lmf(xml, annotation_elements) + xml_lexicon = read_lmf(xml, tagmap, annotation_elements) SaldoLexicon.save_to_picklefile(filename, xml_lexicon) -def read_lmf(xml, annotation_elements=("gf", "lem", "saldo"), tagset="SUC", verbose=True): +def read_lmf(xml, tagmap, annotation_elements=("gf", "lem", "saldo"), verbose=True): """Read the XML version of SALDO's morphological lexicon (saldom.xml). Return a lexicon dictionary, {wordform: {{annotation-type: annotation}: ( set(possible tags), set(tuples with following words) )}} - annotation_element is the XML element for the annotation value (currently: 'gf' for baseform, 'lem' for lemgram or 'saldo' for SALDO id) - tagset is the tagset for the possible tags (currently: 'SUC', 'Parole', 'Saldo') """ - # assert annotation_element in ("gf", "lem", "saldo"), "Invalid annotation element" - tagmap = util.tagsets.mappings["saldo_to_" + tagset.lower()] if verbose: - log.info("Reading XML lexicon") + logger.info("Reading XML lexicon") lexicon = {} context = etree.iterparse(xml, events=("start", "end")) # "start" needed to save reference to root element @@ -225,10 +224,10 @@ def read_lmf(xml, annotation_elements=("gf", "lem", "saldo"), tagset="SUC", verb "formar", "in", "datorrelaterade"] - util.test_lexicon(lexicon, testwords) + util.misc.test_lexicon(lexicon, testwords) if verbose: - log.info("OK, read") + logger.info("OK, read") return lexicon @@ -250,25 +249,6 @@ def __eq__(self, other): ################################################################################ -def save_to_cstlemmatizer(cstfile, lexicon, encoding="latin-1", verbose=True): - """Save a JSON lexicon as an external file that can be used for training the CST lemmatizer. - - The default encoding of the resulting file is ISO-8859-1 (Latin-1). 
- """ - if verbose: - log.info("Saving CST lexicon") - with open(cstfile, "w") as F: - for word in sorted(lexicon): - for lemma in sorted(lexicon[word]): - for postag in sorted(lexicon[word][lemma]): - # the order between word, lemma, postag depends on - # the argument -c to cstlemma, this order is -cBFT: - line = "%s\t%s\t%s" % (word, lemma, postag) - print(line.encode(encoding), file=F) - if verbose: - log.info("OK, saved") - - def extract_tags(lexicon): """Extract the set of all tags that are used in a lexicon. diff --git a/sparv/modules/saldo/stats_model.py b/sparv/modules/saldo/stats_model.py index ecdf5b2d..72a792d4 100644 --- a/sparv/modules/saldo/stats_model.py +++ b/sparv/modules/saldo/stats_model.py @@ -1,15 +1,14 @@ """Train a probability model on a Korp statistics file.""" -import logging import os import pickle import urllib.request from nltk import FreqDist, LidstoneProbDist -from sparv import Model, ModelOutput, modelbuilder +from sparv.api import Model, ModelOutput, get_logger, modelbuilder -log = logging.getLogger(__name__) +logger = get_logger(__name__) MIN_FREQ = 4 @@ -26,10 +25,10 @@ def build_korp_stats(out: ModelOutput = ModelOutput("saldo/stats.pickle"), """Download Korp's word frequency file and convert it to a model.""" txt_file = Model("saldo/stats_all.txt") try: - log.info("Downloading Korp stats file...") + logger.info("Downloading Korp stats file...") download_stats_file("https://svn.spraakdata.gu.se/sb-arkiv/pub/frekvens/stats_all.txt", txt_file.path) - log.info("Building frequency model...") + logger.info("Building frequency model...") make_model(txt_file.path, out.path) finally: # Clean up diff --git a/sparv/modules/segment/crf.py b/sparv/modules/segment/crf.py deleted file mode 100644 index bc6143ed..00000000 --- a/sparv/modules/segment/crf.py +++ /dev/null @@ -1,129 +0,0 @@ -""" -Tokenization based on Conditional Random Fields, implemented for Old Swedish. - -Used by segment.CRFTokenizer. -Requires installation of crfpp (https://taku910.github.io/crfpp/). -""" - -import CRFPP -from .flat_txt2crf import normalize, features - -""" Expects a model that operating on the tags - SNG | LF0 (LF1 MID*)? 
RHT - SNG = single word - LF0 = first word - LF1 = second word - MID = middle word - RHT = right most word - """ - - -def segment(sentence, model): - try: - tagger = CRFPP.Tagger("-m " + model) - - # clear internal context - tagger.clear() - - l_features = features - - splitted = split_enumerate(sentence, '.') - raws = [word for word, span in splitted] - words = [(normalize(word), span) for word, span in splitted] - words_length = len(words) - - raws = iter(raws) - - if words_length == 0: - return [(0, 0)] - else: - lastword, last_span = words.pop() - words = iter(words) - last_span = str(last_span[0]), str(last_span[0]) - - # add context - for i, (w, _span) in enumerate(words): - nextline = '\t'.join((next(raws),) + l_features(w, u'LF%s' % (i,))).encode('utf-8') - tagger.add(nextline) - - if i >= 1: - break - - for w, _span in words: - # s_span = (str(span[0]), str(span[1])) - nextline = '\t'.join((next(raws),) + l_features(w, u'MID')).encode('utf-8') - tagger.add(nextline) - - nextline = '\t'.join((next(raws),) + l_features(lastword, u'RHT')).encode('utf-8') - tagger.add(nextline) - - # Parse and change internal stated as 'parsed' - tagger.parse() - anchors = crf_anchors(tagger, splitted) - # print "Done tagging crf" - return anchors - - except RuntimeError as e: - print("RuntimeError: ", e, end=' ') - - -def crf_anchors(tagger, enumerated_sent): - anchors = [] - last_start, last_stop = 0, -1 - size = tagger.size() - # xsize = tagger.xsize() - # ysize = tagger.ysize() - - # print enumerated_sent[25:] - words = iter(enumerated_sent) - - for i in range(0, size): - label = tagger.y2(i) - _w, span = next(words) - # print w,span - - if label == 'SNG': - # SNG (singleton) tag - anchors.append((span[0], span[1])) - last_stop = -1 - - elif label == 'LF0': - # Start new sentence on LF0 (first word in sentenceq) - if last_stop: - anchors.append((last_start, last_stop)) - last_start = span[0] - last_stop = span[1] - - else: - # Otherwise add token to current sentence - last_stop = span[1] - - if last_stop != -1: - anchors.append((int(last_start), int(span[1]))) - # print anchors - return anchors - - -def split_enumerate(words, delimiters=[]): - res = [] - tmp, tmp_i = '', -1 - - for i, w in enumerate(words + ' '): - if w in delimiters: - if tmp: - res.append((tmp, (tmp_i, i))) - res.append((w, (i, i + 1))) - tmp, tmp_i = '', -1 - - elif not w.isspace(): - if tmp_i == -1: - tmp_i = i - tmp += w - - elif tmp: - res.append((tmp, (tmp_i, i))) - tmp, tmp_i = '', -1 - - if tmp: - res.append((tmp, (tmp_i, i))) - return res diff --git a/sparv/modules/segment/flat_txt2crf.py b/sparv/modules/segment/flat_txt2crf.py deleted file mode 100644 index c9f0fcd8..00000000 --- a/sparv/modules/segment/flat_txt2crf.py +++ /dev/null @@ -1,166 +0,0 @@ -"""Util function used by crf.py.""" - -punctuation = frozenset([u',', u':', u'/', u'.', u'·', u'¶', u';', '°', '-', '—']) -vowels = frozenset(u'aeiouvöäåy') - - -def features(xxx_todo_changeme, tag): - (word, lookslikeanumber) = xxx_todo_changeme - return (word.lower(), - 'CAP' if word[0].isupper() else 'NOCAP', - word.lower()[-2:], - 'NUMLIKE' if lookslikeanumber else 'PNCLIKE' if word in punctuation else 'WRDLIKE', - tag) - - -def thousands(w): - return w.lstrip('Mm') - - -def hundreds(w): - if w.lower()[0:5] == u'dcccc': - return w[5:] - elif w.lower()[0:4] in (u'cccc', u'dccc'): - return w[4:] - elif w.lower()[0:3] in (u'ccc', u'dcc'): - return w[3:] - elif w.lower()[0:2] in (u'cc', 'cd', 'dc', 'cm'): - return w[2:] - elif w.lower()[0:1] in (u'c', 'd'): - return w[1:] 
- else: - return w - - -def tens(w): - if w.lower()[0:5] == u'lxxxx': - return w[5:] - elif w.lower()[0:4] in (u'xxxx', u'lxxx'): - return w[4:] - elif w.lower()[0:3] in (u'xxx', u'lxx'): - return w[3:] - elif w.lower()[0:2] in (u'xx', 'xl', 'lx', 'xc'): - return w[2:] - elif w.lower()[0:1] in (u'x', 'l'): - return w[1:] - else: - return w - - -def ones(w): - if w.lower()[0:5] == u'viiii': - return w[5:] - elif w.lower()[0:4] in (u'iiii', u'viii'): - return w[4:] - elif w.lower()[0:3] in (u'iii', u'vii'): - return w[3:] - elif w.lower()[0:2] in (u'ii', 'iv', 'vi', 'ix'): - return w[2:] - elif w.lower()[0:1] in (u'i', 'v'): - return w[1:] - else: - return w - - -def lookslikearomananumber(w): - return not ones(tens(hundreds(thousands(w)))) - - -def lookslikeanarabicnumber(w): - return any(c in '0123456789' for c in w) - - -def lookslikeanumber(w): - return lookslikearomananumber(w) or lookslikeanarabicnumber(w) - - -# NB! normalize does a [jJ] -> [iI] conversion first... -twonormdict = dict([(u'AA', u'A'), (u'Aa', u'A'), (u'aa', u'a'), - (u'EE', u'E'), (u'Ee', u'E'), (u'ee', u'e'), - (u'II', u'I'), (u'Ii', u'I'), (u'ii', u'i'), - (u'OO', u'O'), (u'Oo', u'O'), (u'oo', u'o'), - (u'UU', u'V'), (u'Uu', u'V'), (u'uu', u'v'), - (u'WW', u'V'), (u'Ww', u'V'), (u'ww', u'v'), - (u'ÖÖ', u'Ö'), (u'Öö', u'Ö'), (u'öö', u'ö'), - (u'ÄÄ', u'Ä'), (u'Ää', u'Ä'), (u'ää', u'ä'), - (u'ÅÅ', u'a'), (u'Åå', u'a'), (u'åå', u'a'), - (u'YY', u'Y'), (u'Yy', u'Y'), (u'yy', u'y'), - (u'ØØ', u'Ö'), (u'Øø', u'Ö'), (u'øø', u'ö'), - (u'ÆÆ', u'Ä'), (u'Ææ', u'Ä'), (u'ææ', u'ä'), - (u'TH', u'T'), (u'Th', u'T'), (u'th', u't'), - (u'DH', u'D'), (u'Dh', u'D'), (u'dh', u'd'), - (u'GH', u'G'), (u'Gh', u'G'), (u'gh', u'g'), - (u'FF', u'F'), (u'Ff', u'F'), (u'ff', u'f'), - (u'ch', u'k')]) - -onenormdict = dict([(u'Ø', u'Ö'), (u'ø', u'ö'), - (u'Æ', u'Ä'), (u'æ', u'ä'), - (u'Å', u'a'), (u'å', u'a'), - (u'W', u'V'), (u'w', u'v'), - (u'U', u'V'), (u'u', u'v'), - (u'C', u'K'), (u'c', u'k'), - (u'Q', u'K'), (u'q', u'k'), - (u'Þ', u'D'), (u'þ', u'd'), - (u'Ð', u'D'), (u'ð', u'd')]) - - -def normalize(word): - word = word.replace(u'j', 'i').replace(u'J', u'I') - if lookslikeanumber(word): - return word, 1 - else: - normword = [] - i = 0 - while i < len(word): - if word[i:i + 2] in twonormdict: - normword.append(twonormdict[word[i:i + 2]]) - i += 2 - elif word[i] in onenormdict: - normword.append(onenormdict[word[i]]) - i += 1 - else: - normword.append(word[i]) - i += 1 - - return ''.join(normword), 0 - - -def main(stream): - newdiv = 1 - - l_features = features - - for line in stream: - raws = line.strip().split() - words = [normalize(word) for word in raws] - words_length = len(words) - - raws = iter(raws) - - print('words length %d' % words_length) - if words_length == 0: - if not newdiv: - print() - newdiv = 1 - else: - pass - elif words_length == 1: - print('\t'.join((next(raws),) + l_features(words[0], u'SNG') + ('id', '52'))) - newdiv = 0 - else: - lastword = words.pop() - words = iter(words) - for i, w in enumerate(words): - print('\t'.join((next(raws), ) + l_features(w, u'LF%s' % (i,)) + ('id', '53'))) - if i >= 1: - break - - for w in words: - print('\t'.join((next(raws),) + l_features(w, u'MID') + ('id', '54'))) - - print('\t'.join((next(raws),) + l_features(lastword, u'RHT') + ('id', '54'))) - - newdiv = 0 - - if not newdiv: - print() diff --git a/sparv/modules/segment/segment.py b/sparv/modules/segment/segment.py index c189603b..f7ce3614 100644 --- a/sparv/modules/segment/segment.py +++ b/sparv/modules/segment/segment.py @@ -1,15 
+1,13 @@ """Segmentation mostly based on NLTK.""" import inspect -import logging import pickle import re from typing import Optional import nltk -import sparv.util as util -from sparv import Annotation, Config, Model, ModelOutput, Output, Text, annotator, modelbuilder +from sparv.api import Annotation, Config, Model, ModelOutput, Output, Text, annotator, get_logger, modelbuilder, util from sparv.modules.saldo.saldo_model import split_triple try: @@ -17,7 +15,7 @@ except ImportError: pass -log = logging.getLogger(__name__) +logger = get_logger(__name__) @annotator("Automatic tokenization", config=[ @@ -127,10 +125,12 @@ def do_segmentation(text: Text, out: Output, segmenter, chunk: Optional[Annotati chunk_start = segment_end chunk_spans[n] = (chunk_start, chunk_end) chunk_spans.sort() - log.info("Reorganized into %d chunks" % len(chunk_spans)) + logger.info("Reorganized into %d chunks" % len(chunk_spans)) else: segments = [] + logger.progress(total=len(chunk_spans) + 1) + # Now we can segment each chunk span into tokens for start, end in chunk_spans: for spanstart, spanend in segmenter.span_tokenize(corpus_text[start:end]): @@ -139,9 +139,11 @@ def do_segmentation(text: Text, out: Output, segmenter, chunk: Optional[Annotati if corpus_text[spanstart:spanend].strip(): span = (spanstart, spanend) segments.append(span) + logger.progress() segments.sort() out.write(segments) + logger.progress() @modelbuilder("Model for PunktSentenceTokenizer", language=["swe"]) @@ -197,23 +199,23 @@ def build_tokenlist(saldo_model: Model = Model("saldo/saldo.pickle"), ###################################################################### -def train_punkt_segmenter(textfiles, modelfile, encoding=util.UTF8, protocol=-1): +def train_punkt_segmenter(textfiles, modelfile, encoding=util.constants.UTF8, protocol=-1): """Train a Punkt sentence tokenizer.""" if isinstance(textfiles, str): textfiles = textfiles.split() - log.info("Reading files") + logger.info("Reading files") text = u"" for filename in textfiles: with open(filename, encoding=encoding) as stream: text += stream.read() - log.info("Training model") + logger.info("Training model") trainer = nltk.tokenize.PunktTrainer(text, verbose=True) - log.info("Saving pickled model") + logger.info("Saving pickled model") params = trainer.get_params() with open(modelfile, "wb") as stream: pickle.dump(params, stream, protocol=protocol) - log.info("OK") + logger.info("OK") ###################################################################### @@ -240,15 +242,12 @@ def span_tokenize(self, s): """Tokenize s and return list with tokens.""" result = [] spans = nltk.RegexpTokenizer.span_tokenize(self, s) - first = True temp = [0, 0] for start, _ in spans: - if not first: - temp[1] = start - result.append(tuple(temp)) + temp[1] = start + result.append(tuple(temp)) temp[0] = start - first = False temp[1] = len(s) result.append(tuple(temp)) @@ -311,7 +310,7 @@ def __init__(self, model, token_list=None): try: key, val = line.strip().split(None, 1) except ValueError as e: - log.error("Error parsing configuration file: %s", line) + logger.error("Error parsing configuration file: %s", line) raise e key = key[:-1] diff --git a/sparv/modules/sensaldo/sensaldo.py b/sparv/modules/sensaldo/sensaldo.py index 80eb0cd7..3ef80ff8 100644 --- a/sparv/modules/sensaldo/sensaldo.py +++ b/sparv/modules/sensaldo/sensaldo.py @@ -1,11 +1,9 @@ """Sentiment annotation per token using SenSALDO.""" -import logging -import sparv.util as util -from sparv import Annotation, Config, Model, ModelOutput, Output, 
annotator, modelbuilder +from sparv.api import Annotation, Config, Model, ModelOutput, Output, annotator, get_logger, modelbuilder, util -log = logging.getLogger(__name__) +logger = get_logger(__name__) SENTIMENT_LABLES = { -1: "negative", @@ -17,7 +15,7 @@ @annotator("Sentiment annotation per token using SenSALDO", language=["swe"], config=[ Config("sensaldo.model", default="sensaldo/sensaldo.pickle", description="Path to SenSALDO model") ]) -def annotate(sense: Annotation = Annotation(":saldo.sense"), +def annotate(sense: Annotation = Annotation(""), out_scores: Output = Output(":sensaldo.sentiment_score", description="SenSALDO sentiment score"), out_labels: Output = Output(":sensaldo.sentiment_label", description="SenSALDO sentiment label"), model: Model = Model("[sensaldo.model]"), @@ -32,17 +30,16 @@ def annotate(sense: Annotation = Annotation(":saldo.sense"), but is used in the catapult. This argument must be last. """ if not lexicon: - lexicon = util.PickledLexicon(model.path) + lexicon = util.misc.PickledLexicon(model.path) # Otherwise use pre-loaded lexicon (from catapult) - sense = sense.read() result_scores = [] result_labels = [] - for token in sense: + for token in sense.read(): # Get set of senses for each token and sort them according to their probabilities - token_senses = [tuple(s.rsplit(util.SCORESEP, 1)) if util.SCORESEP in s else (s, -1.0) - for s in token.split(util.DELIM) if s] + token_senses = [tuple(s.rsplit(util.constants.SCORESEP, 1)) if util.constants.SCORESEP in s else (s, -1.0) + for s in token.split(util.constants.DELIM) if s] token_senses.sort(key=lambda x: float(x[1]), reverse=True) # Lookup the sentiment score for the most probable sense and assign a sentiment label @@ -88,7 +85,7 @@ def read_sensaldo(tsv, verbose=True): Return a lexicon dictionary: {senseid: (class, ranking)} """ if verbose: - log.info("Reading TSV lexicon") + logger.info("Reading TSV lexicon") lexicon = {} f = tsv.read() @@ -104,8 +101,8 @@ def read_sensaldo(tsv, verbose=True): "ödmjukhet..1", "handla..1" ] - util.test_lexicon(lexicon, testwords) + util.misc.test_lexicon(lexicon, testwords) if verbose: - log.info("OK, read") + logger.info("OK, read") return lexicon diff --git a/sparv/modules/sent_align/sent_align.py b/sparv/modules/sent_align/sent_align.py index 19f8a349..e07dc882 100644 --- a/sparv/modules/sent_align/sent_align.py +++ b/sparv/modules/sent_align/sent_align.py @@ -1,8 +1,7 @@ """NB: Not adapted to Sparv v4 yet!""" -# -*- coding: utf-8 -*- import math -import sparv.util as util +from sparv.api import util def align_texts(sentence1, sentence2, link1, link2, sent_parents1, sent_parents2, out_sentlink1, out_sentlink2): diff --git a/sparv/modules/stanford/stanford.py b/sparv/modules/stanford/stanford.py index cce5c09d..acbe9e39 100644 --- a/sparv/modules/stanford/stanford.py +++ b/sparv/modules/stanford/stanford.py @@ -9,11 +9,10 @@ import tempfile from pathlib import Path -import sparv.util as util -from sparv import Annotation, BinaryDir, Config, Language, Output, Text, annotator +from sparv.api import Annotation, BinaryDir, Config, Language, Output, Text, annotator, get_logger, util +from sparv.api.util.tagsets import pos_to_upos -import logging -log = logging.getLogger(__name__) +logger = get_logger(__name__) @annotator("Parse and annotate with Stanford Parser", language=["eng"], config=[ @@ -24,8 +23,6 @@ def annotate(corpus_text: Text = Text(), text: Annotation = Annotation(""), out_sentence: Output = Output("stanford.sentence", cls="sentence", description="Sentence 
segments"), out_token: Output = Output("stanford.token", cls="token", description="Token segments"), - out_word: Output = Output(":stanford.word", cls="token:word", description="Token strings"), - out_ref: Output = Output(":stanford.ref", description="Token ID relative to sentence"), out_baseform: Output = Output(":stanford.baseform", description="Baseforms from Stanford Parser"), out_upos: Output = Output(":stanford.upos", cls="token:upos", description="Part-of-speeches in UD"), out_pos: Output = Output(":stanford.pos", cls="token:pos", @@ -51,20 +48,20 @@ def annotate(corpus_text: Text = Text(), sentence_segments = [] all_tokens = [] - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - log.debug("Creating temporary directoty: %s", tmpdir) + with tempfile.TemporaryDirectory() as tmpdirstr: + tmpdir = Path(tmpdirstr) + logger.debug("Creating temporary directoty: %s", tmpdir) # Write all texts to temporary files filelist = tmpdir / "filelist.txt" - with open(filelist, "w") as LIST: + with open(filelist, "w", encoding="utf-8") as LIST: for nr, (start, end) in enumerate(text_spans): filename = tmpdir / f"text-{nr}.txt" print(filename, file=LIST) - with open(filename, "w") as F: + with open(filename, "w", encoding="utf-8") as F: print(text_data[start:end], file=F) - log.debug("Writing text %d (%d-%d): %r...%r --> %s", nr, start, end, - text_data[start:start + 20], text_data[end - 20:end], filename.name) + logger.debug("Writing text %d (%d-%d): %r...%r --> %s", nr, start, end, + text_data[start:start + 20], text_data[end - 20:end], filename.name) # Call the Stanford parser with all the text files args += ["-filelist", filelist] @@ -74,26 +71,24 @@ def annotate(corpus_text: Text = Text(), # Read and parse each of the output files for nr, (start, end) in enumerate(text_spans): filename = tmpdir / f"text-{nr}.txt.conll" - with open(filename) as F: + with open(filename, encoding="utf-8") as F: output = F.read() - log.debug("Reading text %d (%d-%d): %s --> %r...%r", nr, start, end, - filename.name, output[:20], output[-20:]) + logger.debug("Reading text %d (%d-%d): %s --> %r...%r", nr, start, end, + filename.name, output[:20], output[-20:]) processed_sentences = _parse_output(output, lang, start) for sentence in processed_sentences: - log.debug("Parsed: %s", " ".join(f"{tok.baseform}/{tok.pos}" for tok in sentence)) + logger.debug("Parsed: %s", " ".join(f"{tok.baseform}/{tok.pos}" for tok in sentence)) for token in sentence: all_tokens.append(token) if token.word != text_data[token.start:token.end]: - log.warning("Stanford word (%r) differs from surface word (%r), using the Stanford word", - token.word, text_data[token.start:token.end]) + logger.warning("Stanford word (%r) differs from surface word (%r), using the Stanford word", + token.word, text_data[token.start:token.end]) sentence_segments.append((sentence[0].start, sentence[-1].end)) # Write annotations out_sentence.write(sentence_segments) out_token.write([(t.start, t.end) for t in all_tokens]) - out_ref.write([t.ref for t in all_tokens]) - out_word.write([t.word for t in all_tokens]) out_baseform.write([t.baseform for t in all_tokens]) out_upos.write([t.upos for t in all_tokens]) out_pos.write([t.pos for t in all_tokens]) @@ -102,6 +97,16 @@ def annotate(corpus_text: Text = Text(), out_deprel.write([t.deprel for t in all_tokens]) +@annotator("Annotate tokens with IDs relative to their sentences", language=["eng"]) +def make_ref(out: Output = Output(":stanford.ref", cls="token:ref", + description="Token IDs relative to 
their sentences"), + sentence: Annotation = Annotation(""), + token: Annotation = Annotation("")): + """Annotate tokens with IDs relative to their sentences.""" + from sparv.modules.misc import number + number.number_relative(out, sentence, token) + + def _parse_output(stdout, lang, add_to_index): """Parse the CoNLL format output from the Stanford Parser.""" sentences = [] @@ -117,7 +122,7 @@ def _parse_output(stdout, lang, add_to_index): # -output.columns from the parser (see the args to the parser, in annotate() above): # idx, current, lemma, pos, ner, headidx, deprel, BEGIN_POS, END_POS ref, word, lemma, pos, named_entity, dephead_ref, deprel, start, end = line.split("\t") - upos = util.tagsets.pos_to_upos(pos, lang, "Penn") + upos = pos_to_upos(pos, lang, "Penn") if named_entity == "O": # O = empty name tag named_entity = "" if dephead_ref == "0": # 0 = empty dephead diff --git a/sparv/modules/stanza/__init__.py b/sparv/modules/stanza/__init__.py index fcea1bea..6f2903df 100644 --- a/sparv/modules/stanza/__init__.py +++ b/sparv/modules/stanza/__init__.py @@ -1,23 +1,41 @@ """POS tagging, lemmatisation and dependency parsing with Stanza.""" -from sparv import Config +from sparv.api import Config -from . import stanza, models +from . import models, stanza, stanza_swe __config__ = [ - Config("stanza.resources_file", default="stanza/resources.json", description="Stanza resources file"), - Config("stanza.lem_model", default="stanza/lem/sv_suc_lemmatizer.pt", description="Stanza lemmatisation model"), - Config("stanza.pos_model", default="stanza/pos/sv_talbanken_tagger.pt", description="Stanza POS model"), - Config("stanza.pretrain_pos_model", default="stanza/sv_talbanken.pretrain.pt", - description="Stanza pretrain POS model"), - Config("stanza.dep_model", default="stanza/dep/sv_talbanken_parser.pt", description="Stanza dependency model"), - Config("stanza.pretrain_dep_model", default="stanza/sv_talbanken.pretrain.pt", - description="Stanza pretrain dependency model"), + Config("stanza.resources_file", default="stanza/[metadata.language]/resources.json", + description="Stanza resources file"), Config("stanza.use_gpu", default=True, description="Use GPU instead of CPU if available"), - Config("stanza.batch_size", default=5000, description="Limit Stanza batch size"), + Config("stanza.batch_size", default=5000, + description="Limit Stanza batch size. Sentences with a token count exceeding this value will be excluded " + "from analysis."), Config("stanza.max_sentence_length", default=250, - description="Max length of sentences that will get dependence annotations (set to 0 for no limit)"), + description="Max length (in number of tokens) of sentences that will get dependence annotations (set to 0 " + "for no limit)"), Config("stanza.cpu_fallback", default=False, description="Fall back to CPU for sentences exceeding the max_sentence_length, instead of " - "excluding them from dependence parsing. Only usable with use_gpu enabled.") + "excluding them from dependence parsing. Only usable with use_gpu enabled."), + Config("stanza.max_token_length", default=0, + description="Max number of characters per token. Any sentence containing a token exceeding this limit will " + "be excluded from analysis. 
Disabled by default."), + Config("stanza.sentence_chunk", default="", + description="Text chunk (annotation) to use as input when segmenting sentences (not used for Swedish)"), + Config("stanza.sentence_annotation", default="", + description="Optional existing sentence segmentation annotation (not used for Swedish)"), + Config("stanza.token_annotation", default="", + description="Optional existing token annotation (not used for Swedish)"), + + # Config for Swedish + Config("stanza.swe_lem_model", default="stanza/swe/lem/sv_suc_lemmatizer.pt", + description="Stanza lemmatisation model for Swedish"), + Config("stanza.swe_pos_model", default="stanza/swe/pos/sv_talbanken_tagger.pt", + description="Stanza POS model for Swedish"), + Config("stanza.swe_pretrain_pos_model", default="stanza/swe/sv_talbanken.pretrain.pt", + description="Stanza pretrain POS model for Swedish"), + Config("stanza.swe_dep_model", default="stanza/swe/dep/sv_talbanken_parser.pt", + description="Stanza dependency model for Swedish"), + Config("stanza.swe_pretrain_dep_model", default="stanza/swe/sv_talbanken.pretrain.pt", + description="Stanza pretrain dependency model for Swedish"), ] diff --git a/sparv/modules/stanza/models.py b/sparv/modules/stanza/models.py index 5b838180..fdc4285a 100644 --- a/sparv/modules/stanza/models.py +++ b/sparv/modules/stanza/models.py @@ -1,12 +1,17 @@ """Download models for Stanza.""" import json +import logging -from sparv import Model, ModelOutput, modelbuilder +import iso639 +from sparv.api import Language, Model, ModelOutput, modelbuilder, get_logger -@modelbuilder("Stanza resources file", language=["swe"]) -def stanza_resources_file(resources_file: ModelOutput = ModelOutput("stanza/resources.json")): +logger = get_logger(__name__) + + +@modelbuilder("Stanza resources file for Swedish", language=["swe"]) +def stanza_resources_file(resources_file: ModelOutput = ModelOutput("stanza/swe/resources.json")): """Download and unzip the Stanza dependency model.""" # Write resources.json file to keep Stanza from complaining res = json.dumps({ @@ -24,37 +29,51 @@ def stanza_resources_file(resources_file: ModelOutput = ModelOutput("stanza/reso resources_file.write(res) -@modelbuilder("Stanza pretrain (embeddings) model", language=["swe"]) -def stanza_pretrain_model(model: ModelOutput = ModelOutput("stanza/sv_talbanken.pretrain.pt")): +@modelbuilder("Stanza pretrain (embeddings) model for Swedish", language=["swe"]) +def stanza_pretrain_model(model: ModelOutput = ModelOutput("stanza/swe/sv_talbanken.pretrain.pt")): """Download and unzip the Stanza pretrain (embeddings) model.""" - zip_model = Model("stanza/stanza_pretrain.zip") + zip_model = Model("stanza/swe/stanza_pretrain.zip") zip_model.download("https://svn.spraakdata.gu.se/sb-arkiv/!svn/bc/230835/pub/stanza/stanza_pretrain.zip") zip_model.unzip() zip_model.remove() -@modelbuilder("Stanza POS-tagging model", language=["swe"]) -def stanza_pos_model(model: ModelOutput = ModelOutput("stanza/pos/sv_talbanken_tagger.pt")): +@modelbuilder("Stanza POS-tagging model for Swedish", language=["swe"]) +def stanza_pos_model(model: ModelOutput = ModelOutput("stanza/swe/pos/sv_talbanken_tagger.pt")): """Download and unzip the Stanza POS-tagging model.""" - zip_model = Model("stanza/pos/synt_stanza_full2.zip") + zip_model = Model("stanza/swe/pos/synt_stanza_full2.zip") zip_model.download("https://svn.spraakdata.gu.se/sb-arkiv/!svn/bc/230835/pub/stanza/morph_stanza_full2.zip") zip_model.unzip() zip_model.remove() -@modelbuilder("Stanza lemmatisation model", 
language=["swe"]) -def stanza_lem_model(model: ModelOutput = ModelOutput("stanza/lem/sv_suc_lemmatizer.pt")): +@modelbuilder("Stanza lemmatisation model for Swedish", language=["swe"]) +def stanza_lem_model(model: ModelOutput = ModelOutput("stanza/swe/lem/sv_suc_lemmatizer.pt")): """Download and unzip the Stanza POS-tagging model.""" - zip_model = Model("stanza/lem/lem_stanza.zip") + zip_model = Model("stanza/swe/lem/lem_stanza.zip") zip_model.download("https://svn.spraakdata.gu.se/sb-arkiv/!svn/bc/230835/pub/stanza/lem_stanza.zip") zip_model.unzip() zip_model.remove() -@modelbuilder("Stanza dependency model", language=["swe"]) -def stanza_dep_model(model: ModelOutput = ModelOutput("stanza/dep/sv_talbanken_parser.pt")): +@modelbuilder("Stanza dependency model for Swedish", language=["swe"]) +def stanza_dep_model(model: ModelOutput = ModelOutput("stanza/swe/dep/sv_talbanken_parser.pt")): """Download and unzip the Stanza dependency model.""" - zip_model = Model("stanza/dep/synt_stanza_full2.zip") + zip_model = Model("stanza/swe/dep/synt_stanza_full2.zip") zip_model.download("https://svn.spraakdata.gu.se/sb-arkiv/!svn/bc/230835/pub/stanza/synt_stanza_full2.zip") zip_model.unzip() zip_model.remove() + + +@modelbuilder("Stanza models for other languages than Swedish", language=["eng"]) +def get_model(lang: Language = Language(), + resources_file: ModelOutput = ModelOutput("stanza/[metadata.language]/resources.json")): + """Download Stanza language models.""" + import stanza + lang_name = iso639.languages.get(part3=lang).name if lang in iso639.languages.part3 else lang + stanza_lang = iso639.languages.get(part3=lang).part1 + logger.info(f"Downloading Stanza language model for {lang_name}") + stanza.download(lang=stanza_lang, model_dir=str(resources_file.path.parent), verbose=False, + logging_level=logging.WARNING) + zip_file = Model(f"stanza/{lang}/{stanza_lang}/default.zip") + zip_file.remove() diff --git a/sparv/modules/stanza/stanza.py b/sparv/modules/stanza/stanza.py index 894dc5f0..ecd6617e 100644 --- a/sparv/modules/stanza/stanza.py +++ b/sparv/modules/stanza/stanza.py @@ -1,404 +1,260 @@ """POS tagging, lemmatisation and dependency parsing with Stanza.""" -from contextlib import redirect_stderr -from os import devnull +from typing import Optional -import sparv.util as util -from sparv import Annotation, Config, Model, Output, annotator +import iso639 -logger = util.get_logger(__name__) +from sparv.api import Annotation, Config, Language, Model, Output, Text, annotator, get_logger, util +from . 
import stanza_utils +logger = get_logger(__name__) -@annotator("POS, lemma and dependency relations from Stanza", language=["swe"], order=1) -def annotate(out_msd: Output = Output(":stanza.msd", cls="token:msd", - description="Part-of-speeches with morphological descriptions"), - out_pos: Output = Output(":stanza.pos", cls="token:pos", description="Part-of-speech tags"), - out_feats: Output = Output(":stanza.ufeats", cls="token:ufeats", - description="Universal morphological features"), + +@annotator("POS, lemma and dependency relations from Stanza", language=["eng"]) +def annotate(corpus_text: Text = Text(), + lang: Language = Language(), + sentence_chunk: Optional[Annotation] = Annotation("[stanza.sentence_chunk]"), + sentence_annotation: Optional[Annotation] = Annotation("[stanza.sentence_annotation]"), + token_annotation: Optional[Annotation] = Annotation("[stanza.token_annotation]"), + out_sentence: Optional[Output] = Output("stanza.sentence", cls="sentence", + description="Sentence segments"), + out_token: Output = Output("stanza.token", cls="token", description="Token segments"), + out_upos: Output = Output(":stanza.upos", cls="token:upos", description="Part-of-speeches in UD"), + out_pos: Output = Output(":stanza.pos", cls="token:pos", + description="Part-of-speeches from Stanza"), out_baseform: Output = Output(":stanza.baseform", cls="token:baseform", description="Baseform from Stanza"), - out_dephead: Output = Output(":stanza.dephead", cls="token:dephead", - description="Positions of the dependency heads"), - out_dephead_ref: Output = Output(":stanza.dephead_ref", cls="token:dephead_ref", - description="Sentence-relative positions of the dependency heads"), + out_feats: Output = Output(":stanza.ufeats", cls="token:ufeats", + description="Universal morphological features"), out_deprel: Output = Output(":stanza.deprel", cls="token:deprel", description="Dependency relations to the head"), - word: Annotation = Annotation(""), - token: Annotation = Annotation(""), - sentence: Annotation = Annotation(""), - pos_model: Model = Model("[stanza.pos_model]"), - pos_pretrain_model: Model = Model("[stanza.pretrain_pos_model]"), - lem_model: Model = Model("[stanza.lem_model]"), - dep_model: Model = Model("[stanza.dep_model]"), - dep_pretrain_model: Model = Model("[stanza.pretrain_dep_model]"), + out_dephead_ref: Output = Output(":stanza.dephead_ref", cls="token:dephead_ref", + description="Sentence-relative positions of the dependency heads"), + out_dephead: Output = Output(":stanza.dephead", cls="token:dephead", + description="Positions of the dependency heads"), + out_ne: Output = Output("stanza.ne", cls="named_entity", description="Named entity segments from Stanza"), + out_ne_type: Output = Output("stanza.ne:stanza.ne_type", cls="token:named_entity_type", + description="Named entity types from Stanza"), resources_file: Model = Model("[stanza.resources_file]"), use_gpu: bool = Config("stanza.use_gpu"), batch_size: int = Config("stanza.batch_size"), max_sentence_length: int = Config("stanza.max_sentence_length"), cpu_fallback: bool = Config("stanza.cpu_fallback")): """Do dependency parsing using Stanza.""" - import stanza - # cpu_fallback only makes sense if use_gpu is True cpu_fallback = cpu_fallback and use_gpu - sentences_all, orphans = sentence.get_children(token) - if orphans: - logger.warning(f"Found {len(orphans)} tokens not belonging to any sentence. 
These will not be annotated with " - f"dependency relations.") - - sentences_dep = [] - sentences_pos = [] - skipped = 0 - - for s in sentences_all: - if len(s) > batch_size: - skipped += 1 - elif len(s) <= max_sentence_length or not max_sentence_length: - sentences_dep.append(s) + # Read corpus_text and text_spans + text_data = corpus_text.read() + + # Define some values needed for Stanza Pipeline + nlp_args = { + "lang": iso639.languages.get(part3=lang).part1, + "processors": "tokenize,mwt,pos,lemma,depparse,ner", # Comma-separated list of processors to use + "dir": str(resources_file.path.parent), + "depparse_max_sentence_size": 200, # Create new batch when encountering sentences larger than this + "depparse_batch_size": batch_size, + "pos_batch_size": batch_size, + "lemma_batch_size": batch_size, + "use_gpu": use_gpu, + "verbose": False + } + stanza_args = { + "use_gpu": use_gpu, + "batch_size": batch_size, + "max_sentence_length": max_sentence_length, + } + + write_tokens = True + + if token_annotation: + write_tokens = False + sentences, _orphans = sentence_annotation.get_children(token_annotation) + # sentences.append(orphans) + token_spans = list(token_annotation.read()) + sentence_segments, all_tokens, ne_segments, ne_types = process_tokens(sentences, token_spans, text_data, + nlp_args, stanza_args) + elif sentence_annotation: + sentence_spans = list(sentence_annotation.read_spans()) + sentence_segments, all_tokens, ne_segments, ne_types = process_sentences(sentence_spans, text_data, nlp_args, + stanza_args) + else: + text_spans = sentence_chunk.read_spans() + sentence_segments, all_tokens, ne_segments, ne_types = process_text(text_spans, text_data, nlp_args, + stanza_args) + + # Write annotations + if all_tokens: + if write_tokens: + out_token.write([(t.start, t.end) for t in all_tokens]) else: - sentences_pos.append(s) - - if sentences_pos and not cpu_fallback: - n = len(sentences_pos) - logger.warning(f"Found {n} sentence{'s' if n > 1 else ''} exceeding the max sentence length " - f"({max_sentence_length}). {'These' if n > 1 else 'This'} sentence{'s' if n > 1 else ''} will " - "not be annotated with dependency relations.") - if skipped: - logger.warning(f"Found {skipped} sentence{'s' if skipped > 1 else ''} exceeding the batch size " - f"({batch_size}) in number of tokens. 
{'These' if skipped > 1 else 'This'} " - f"sentence{'s' if skipped > 1 else ''} will not be annotated.") - if orphans: - sentences_pos.append(orphans) - word_list = list(word.read()) - msd = word.create_empty_attribute() - pos = word.create_empty_attribute() - feats = word.create_empty_attribute() - baseforms = word.create_empty_attribute() - dephead = word.create_empty_attribute() - dephead_ref = word.create_empty_attribute() - deprel = word.create_empty_attribute() - - for sentences, dep, fallback in ((sentences_dep, True, False), (sentences_pos, False, cpu_fallback)): - if not sentences: - continue - - # Temporarily suppress stderr to silence warning about not having an NVIDIA GPU - with open(devnull, "w") as fnull: - with redirect_stderr(fnull): - # Initialize the pipeline - if dep or fallback: - logger.debug(f"Running dependency parsing and POS-taggning on {len(sentences)} sentences" - f" (using {'GPU' if use_gpu and not fallback else 'CPU'}).") - nlp = stanza.Pipeline( - lang="sv", - processors="tokenize,pos,lemma,depparse", # Comma-separated list of processors to use - dir=str(resources_file.path.parent), - lemma_model_path=str(lem_model.path), - pos_pretrain_path=str(pos_pretrain_model.path), - pos_model_path=str(pos_model.path), - depparse_pretrain_path=str(dep_pretrain_model.path), - depparse_model_path=str(dep_model.path), - tokenize_pretokenized=True, # Assume the text is tokenized by whitespace and sentence split by - # newline. Do not run a model. - tokenize_no_ssplit=True, # Disable sentence segmentation - depparse_max_sentence_size=200, # Create new batch when encountering sentences larger than this - depparse_batch_size=batch_size, - pos_batch_size=batch_size, - lemma_batch_size=batch_size, - use_gpu=use_gpu and not fallback, - verbose=False - ) - else: - logger.debug(f"Running POS-taggning on {len(sentences)} sentences.") - nlp = stanza.Pipeline( - lang="sv", - processors="tokenize,pos", # Comma-separated list of processors to use - dir=str(resources_file.path.parent), - pos_pretrain_path=str(pos_pretrain_model.path), - pos_model_path=str(pos_model.path), - tokenize_pretokenized=True, # Assume the text is tokenized by whitespace and sentence split by - # newline. Do not run a model. 
- tokenize_no_ssplit=True, # Disable sentence segmentation - pos_batch_size=batch_size, - use_gpu=use_gpu, - verbose=False - ) - - # Format document for stanza: separate tokens by whitespace and sentences by double new lines - document = "\n\n".join([" ".join(word_list[i] for i in sent) for sent in sentences]) - - doc = run_stanza(nlp, document, batch_size, max_sentence_length) - word_count_real = sum(len(s) for s in sentences) - word_count = 0 - for sent, tagged_sent in zip(sentences, doc.sentences): - for w_index, w in zip(sent, tagged_sent.words): - feats_str = util.cwbset(w.feats.split("|") if w.feats else "") - # logger.debug(f"word: {w.text}" - # f"\tlemma: {w.lemma}" - # f"\tmsd: {w.xpos}" - # f"\tpos: {w.upos}" - # f"\tfeats: {feats_str}" - # f"\tdephead_ref: {dephead_ref_str}" - # f"\tdephead: {dephead_str}" - # f"\tdeprel: {w.deprel}" - # f"\thead word: {tagged_sent.words[w.head - 1].text if w.head > 0 else 'root'}") - msd[w_index] = w.xpos - pos[w_index] = w.upos - feats[w_index] = feats_str - baseforms[w_index] = w.lemma - if dep or fallback: - dephead[w_index] = str(sent[w.head - 1]) if w.head > 0 else "-" - dephead_ref[w_index] = str(w.head) if w.head > 0 else "" - deprel[w_index] = w.deprel - word_count += len(tagged_sent.words) - - if word_count != word_count_real: - raise util.SparvErrorMessage( - "Stanza POS tagger did not seem to respect the given tokenisation! Do your tokens contain whitespaces?") - - out_msd.write(msd) - out_pos.write(pos) - out_feats.write(feats) - out_baseform.write(baseforms) - out_dephead_ref.write(dephead_ref) - out_dephead.write(dephead) - out_deprel.write(deprel) - - -@annotator("Part-of-speech annotation with morphological descriptions from Stanza", language=["swe"], order=2) -def msdtag(out_msd: Output = Output(":stanza.msd", cls="token:msd", - description="Part-of-speeches with morphological descriptions"), - out_pos: Output = Output(":stanza.pos", cls="token:pos", description="Part-of-speech tags"), - out_feats: Output = Output(":stanza.ufeats", cls="token:ufeats", - description="Universal morphological features"), - word: Annotation = Annotation(""), - token: Annotation = Annotation(""), - sentence: Annotation = Annotation(""), - model: Model = Model("[stanza.pos_model]"), - pretrain_model: Model = Model("[stanza.pretrain_pos_model]"), - resources_file: Model = Model("[stanza.resources_file]"), - use_gpu: bool = Config("stanza.use_gpu"), - batch_size: int = Config("stanza.batch_size")): - """Do dependency parsing using Stanza.""" + out_token.write([]) + out_upos.write([t.upos for t in all_tokens]) + out_pos.write([t.pos for t in all_tokens]) + out_baseform.write([t.baseform for t in all_tokens]) + out_feats.write([t.feats for t in all_tokens]) + out_deprel.write([t.deprel for t in all_tokens]) + out_dephead_ref.write([t.dephead_ref for t in all_tokens]) + out_dephead.write([t.dephead for t in all_tokens]) + # TODO: Sparv does not support optional outputs yet, so always write these, even if they're empty + out_sentence.write(sentence_segments) + out_ne.write(ne_segments) + out_ne_type.write(ne_types) + + +def process_tokens(sentences, token_spans, text_data, nlp_args, stanza_args): + """Process pre-tokenized text with Stanza.""" import stanza - sentences, orphans = sentence.get_children(token) - sentences.append(orphans) - word_list = list(word.read()) - msd = word.create_empty_attribute() - pos = word.create_empty_attribute() - feats = word.create_empty_attribute() - - # Format document for stanza: separate tokens by whitespace and 
sentences by double new lines - document = "\n\n".join([" ".join(word_list[i] for i in sent) for sent in sentences]) - - # Temporarily suppress stderr to silence warning about not having an NVIDIA GPU - with open(devnull, "w") as fnull: - with redirect_stderr(fnull): - # Initialize the pipeline - nlp = stanza.Pipeline( - lang="sv", # Language code for the language to build the Pipeline in - processors="tokenize,pos", # Comma-separated list of processors to use - dir=str(resources_file.path.parent), - pos_pretrain_path=str(pretrain_model.path), - pos_model_path=str(model.path), - tokenize_pretokenized=True, # Assume the text is tokenized by whitespace and sentence split by - # newline. Do not run a model. - tokenize_no_ssplit=True, # Disable sentence segmentation - pos_batch_size=batch_size, - use_gpu=use_gpu, - verbose=False - ) - - doc = run_stanza(nlp, document, batch_size) - word_count = 0 - for sent, tagged_sent in zip(sentences, doc.sentences): - for w_index, w in zip(sent, tagged_sent.words): - word_count += 1 - feats_str = util.cwbset(w.feats.split("|") if w.feats else "") - # logger.debug(f"word: {w.text}" - # f"\tmsd: {w.xpos}" - # f"\tpos: {w.upos}" - # f"\tfeats: {feats_str}") - msd[w_index] = w.xpos - pos[w_index] = w.upos - feats[w_index] = feats_str - - if len(word_list) != word_count: - raise util.SparvErrorMessage( - "Stanza POS tagger did not seem to respect the given tokenisation! Do your tokens contain whitespaces?") - - out_msd.write(msd) - out_pos.write(pos) - out_feats.write(feats) - - -@annotator("Dependency parsing using Stanza", language=["swe"], order=2) -def dep_parse(out_dephead: Output = Output(":stanza.dephead", cls="token:dephead", - description="Positions of the dependency heads"), - out_dephead_ref: Output = Output(":stanza.dephead_ref", cls="token:dephead_ref", - description="Sentence-relative positions of the dependency heads"), - out_deprel: Output = Output(":stanza.deprel", cls="token:deprel", - description="Dependency relations to the head"), - word: Annotation = Annotation(""), - token: Annotation = Annotation(""), - baseform: Annotation = Annotation(""), - msd: Annotation = Annotation(""), - feats: Annotation = Annotation(""), - ref: Annotation = Annotation(":misc.number_rel_"), - sentence: Annotation = Annotation(""), - model: Model = Model("[stanza.dep_model]"), - pretrain_model: Model = Model("[stanza.pretrain_dep_model]"), - resources_file: Model = Model("[stanza.resources_file]"), - use_gpu: bool = Config("stanza.use_gpu"), - batch_size: int = Config("stanza.batch_size"), - max_sentence_length: int = Config("stanza.max_sentence_length"), - cpu_fallback: bool = Config("stanza.cpu_fallback")): - """Do dependency parsing using Stanza.""" + # Init Stanza pipeline + nlp_args["tokenize_pretokenized"] = True + nlp = stanza.Pipeline(**nlp_args) + + # Format document for stanza: list of lists of string + document = [[text_data[token_spans[i][0][0]:token_spans[i][1][0]] for i in s] for s in sentences] + + # Run Stanza and process output + doc = stanza_utils.run_stanza(nlp, document, stanza_args["batch_size"], stanza_args["max_sentence_length"]) + all_tokens = [] + ne_segments = [] + ne_types = [] + token_dephead_count = 0 + token_positions = [] + + stanza_utils.check_sentence_respect(len(list(s for s in sentences if s)), len(doc.sentences)) + for sent_span, tagged_sent in zip(sentences, doc.sentences): + current_sentence_len = 0 + for w_index, tagged_w in zip(sent_span, tagged_sent.words): + token = Token(tagged_w, offset=0, 
token_dephead_count=token_dephead_count) + all_tokens.append(token) + current_sentence_len += 1 + token_positions.append((token.start, token.end, token_spans[w_index][0][0], token_spans[w_index][1][0])) + token_dephead_count += current_sentence_len + stanza_utils.check_token_respect(len(sent_span), len(tagged_sent.words)) + + # Get named entities + token_positions = iter(token_positions) + stanza_end = -1 + for entity in doc.entities: + # Get positions for NE spans + if entity.start_char > stanza_end: + for stanza_start, stanza_end, start, end in token_positions: + if stanza_start <= entity.start_char < stanza_end: + sparv_start = start + if stanza_start < entity.end_char <= stanza_end: + sparv_end = end + break + ne_segments.append((sparv_start, sparv_end)) + ne_types.append(entity.type) + + return [], all_tokens, ne_segments, ne_types + + +def process_sentences(sentence_spans, text_data, nlp_args, stanza_args): + """Process pre-sentence segmented text with Stanza.""" import stanza - from stanza.models.common.doc import Document - # cpu_fallback only makes sense if use_gpu is True - cpu_fallback = cpu_fallback and use_gpu + # Init Stanza pipeline + nlp_args["tokenize_no_ssplit"] = True + nlp = stanza.Pipeline(**nlp_args) + + # Format document for stanza: separate sentences by double new lines + document = "\n\n".join([text_data[sent_span[0]:sent_span[1]].replace("\n", " ") for sent_span in sentence_spans]) + + # Run Stanza and process output + doc = stanza_utils.run_stanza(nlp, document, stanza_args["batch_size"], stanza_args["max_sentence_length"]) + all_tokens = [] + ne_segments = [] + ne_types = [] + token_dephead_count = 0 + offset = 0 + sentence_offsets = [] + previous_sentence_end_position = -2 + + stanza_utils.check_sentence_respect(len(sentence_spans), len(doc.sentences)) + for sent_span, tagged_sent in zip(sentence_spans, doc.sentences): + # Calculate the difference between the positions in the document and the ones from Stanza. + # -2 is to compensate for two line breaks between sentences in the Stanza input + offset += sent_span[0] - previous_sentence_end_position - 2 + current_sentence_len = 0 + for w in tagged_sent.words: + token = Token(w, offset=offset, token_dephead_count=token_dephead_count) + current_sentence_len += 1 + all_tokens.append(token) + sentence_offsets.append((previous_sentence_end_position, token.end - offset, offset)) + previous_sentence_end_position = token.end + token_dephead_count += current_sentence_len + + # Get named entities + sentence_offsets = iter(sentence_offsets) + end = -1 + for entity in doc.entities: + # Calculate positions for NE spans + if entity.start_char > end: + for start, end, offs in sentence_offsets: + if start <= entity.start_char < end: + break + ne_segments.append((entity.start_char + offs, entity.end_char + offs)) + ne_types.append(entity.type) + + return [], all_tokens, ne_segments, ne_types + + +def process_text(text_spans, text_data, nlp_args, stanza_args): + """Process text with Stanza (including sentence segmentation).""" + import stanza - sentences_all, orphans = sentence.get_children(token) - if orphans: - logger.warning(f"Found {len(orphans)} tokens not belonging to any sentence. 
These will not be annotated with " - f"dependency relations.") - sentences_dep = [] - sentences_fallback = [] - skipped_sent = 0 - skipped_batch = 0 - - for s in sentences_all: - if len(s) > batch_size: - skipped_batch += 1 - elif max_sentence_length and len(s) > max_sentence_length: - if cpu_fallback: - sentences_fallback.append(s) - else: - skipped_sent += 1 - else: - sentences_dep.append(s) - - if skipped_sent: - logger.warning(f"Found {skipped_sent} sentence{'s' if skipped_sent > 1 else ''} exceeding the max sentence " - f"length ({max_sentence_length}). {'These' if skipped_sent > 1 else 'This'} " - f"sentence{'s' if skipped_sent > 1 else ''} will not be annotated.") - if skipped_batch: - logger.warning(f"Found {skipped_batch} sentence{'s' if skipped_batch > 1 else ''} exceeding the batch size " - f"({batch_size}) in number of tokens. {'These' if skipped_batch > 1 else 'This'} " - f"sentence{'s' if skipped_batch > 1 else ''} will not be annotated.") - - word_vals = list(word.read()) - baseform_vals = list(baseform.read()) - msd_vals = list(msd.read()) - feats_vals = list(feats.read()) - ref_vals = list(ref.read()) - - dephead = word.create_empty_attribute() - dephead_ref = word.create_empty_attribute() - deprel = word.create_empty_attribute() - - for sentences, fallback in ((sentences_dep, False), (sentences_fallback, cpu_fallback)): - if not sentences: - continue - - document = _build_doc(sentences, - word_vals, - baseform_vals, - msd_vals, - feats_vals, - ref_vals) - - # Temporarily suppress stderr to silence warning about not having an NVIDIA GPU - with open(devnull, "w") as fnull: - with redirect_stderr(fnull): - logger.debug(f"Running dependency parsing on {len(sentences)} sentences" - f" (using {'GPU' if use_gpu and not fallback else 'CPU'}).") - # Initialize the pipeline - nlp = stanza.Pipeline( - lang="sv", # Language code for the language to build the Pipeline in - processors="depparse", # Comma-separated list of processors to use - dir=str(resources_file.path.parent), - depparse_pretrain_path=str(pretrain_model.path), - depparse_model_path=str(model.path), - depparse_pretagged=True, # Only run dependency parsing on the document - depparse_max_sentence_size=200, # Create new batch when encountering sentences larger than this - depparse_batch_size=batch_size, - pos_batch_size=batch_size, - lemma_batch_size=batch_size, - use_gpu=use_gpu and not fallback, - verbose=False - ) - - doc = run_stanza(nlp, Document(document), batch_size, max_sentence_length) - for sent, tagged_sent in zip(sentences, doc.sentences): - for w_index, w in zip(sent, tagged_sent.words): - dephead_str = str(sent[w.head - 1]) if w.head > 0 else "-" - dephead_ref_str = str(w.head) if w.head > 0 else "" - # logger.debug(f"word: {w.text}" - # f"\tdephead_ref: {dephead_ref_str}" - # f"\tdephead: {dephead_str}" - # f"\tdeprel: {w.deprel}" - # f"\thead word: {tagged_sent.words[w.head - 1].text if w.head > 0 else 'root'}") - dephead[w_index] = dephead_str - dephead_ref[w_index] = dephead_ref_str - deprel[w_index] = w.deprel - - out_dephead_ref.write(dephead_ref) - out_dephead.write(dephead) - out_deprel.write(deprel) - - -def _build_doc(sentences, word, baseform, msd, feats, ref): - """Build stanza input for dependency parsing.""" - document = [] - for sent in sentences: - in_sent = [] - for i in sent: - # Format feats - feats_list = util.set_to_list(feats[i]) - if not feats_list: - feats_str = "_" - else: - feats_str = "|".join(feats_list) - # Format baseform - baseform_list = util.set_to_list(baseform[i]) - if not 
baseform_list: - baseform_str = word[i] - else: - baseform_str = baseform_list[0] - - token_dict = {"id": int(ref[i]), "text": word[i], "lemma": baseform_str, - "xpos": msd[i], "feats": feats_str} - in_sent.append(token_dict) - # logger.debug("\t".join(str(v) for v in token_dict.values())) - if in_sent: - document.append(in_sent) - return document - - -def run_stanza(nlp, document, batch_size, max_sentence_length: int = 0): - """Run Stanza and handle possible errors.""" - try: - doc = nlp(document) - except RuntimeError as e: - gpu_error = "CUDA out of memory" in str(e) - cpu_error = "DefaultCPUAllocator: can't allocate memory" in str(e) - if gpu_error or cpu_error: - msg = "Stanza ran out of memory. You can try the following options to prevent this from happening:\n" \ - " - Limit the number of parallel Stanza processes by using the 'threads' section in your Sparv " \ - "configuration.\n" \ - " - Limit the Stanza batch size by setting the 'stanza.batch_size' config variable to something " \ - f"lower (current value: {batch_size}).\n" \ - " - Exclude excessively long sentences from dependency parsing by setting the " \ - "'stanza.max_sentence_length' config variable to something lower (current value: " \ - f"{max_sentence_length})." - if gpu_error: - msg += "\n - Switch to using CPU by setting the 'stanza.use_gpu' config variable to false." + # Init Stanza pipeline + nlp = stanza.Pipeline(**nlp_args) + + sentence_segments = [] + all_tokens = [] + token_dephead_count = 0 + ne_segments = [] + ne_types = [] + + # Run Stanza once for every input document + for text_span in text_spans: + inputtext = text_data[text_span[0]:text_span[1]] + offset = text_span[0] + doc = stanza_utils.run_stanza(nlp, inputtext, stanza_args["batch_size"], stanza_args["max_sentence_length"]) + for sent in doc.sentences: + current_sentence = [] + for w in sent.words: + token = Token(w, offset=offset, token_dephead_count=token_dephead_count) + current_sentence.append(token) + all_tokens.append(token) + token_dephead_count += len(current_sentence) + sentence_segments.append((current_sentence[0].start, current_sentence[-1].end)) + # Get named entities + for entity in doc.entities: + ne_segments.append((entity.start_char + offset, entity.end_char + offset)) + ne_types.append(entity.type) + + return sentence_segments, all_tokens, ne_segments, ne_types + + +class Token: + """Object to store annotation information for a token.""" + + def __init__(self, stanza_w, offset=0, token_dephead_count=0): + """Set attributes.""" + self.word = stanza_w.text # Mostly used for debugging + self.start = int(stanza_w.misc.split("|")[0].strip("start_char=")) + offset + self.end = int(stanza_w.misc.split("|")[1].strip("end_char=")) + offset + self.upos = stanza_w.upos + self.pos = stanza_w.xpos + self.baseform = stanza_w.lemma + # Format feats + feats_list = util.misc.set_to_list(stanza_w.feats or "") + if not feats_list: + feats_str = "_" else: - msg = str(e) - raise util.SparvErrorMessage(msg) - return doc + feats_str = "|".join(feats_list) + self.feats = feats_str + self.dephead_ref = str(stanza_w.head) if stanza_w.head > 0 else "" + self.deprel = stanza_w.deprel + self.dephead = str(stanza_w.head - 1 + token_dephead_count) if stanza_w.head > 0 else "-" + + def __repr__(self): + return f"{self.word} <{self.baseform} {self.upos} {self.deprel}> ({self.start}-{self.end})" diff --git a/sparv/modules/stanza/stanza_swe.py b/sparv/modules/stanza/stanza_swe.py new file mode 100644 index 00000000..a1c6d6d1 --- /dev/null +++ 
b/sparv/modules/stanza/stanza_swe.py @@ -0,0 +1,378 @@ +"""POS tagging, lemmatisation and dependency parsing with Stanza.""" + +from sparv.api import Annotation, Config, Model, Output, annotator, get_logger, util +from . import stanza_utils + +logger = get_logger(__name__) + + +@annotator("POS, lemma and dependency relations from Stanza", language=["swe"], order=1) +def annotate_swe( + out_msd: Output = Output(":stanza.msd", cls="token:msd", + description="Part-of-speeches with morphological descriptions"), + out_pos: Output = Output(":stanza.pos", cls="token:pos", description="Part-of-speech tags"), + out_feats: Output = Output(":stanza.ufeats", cls="token:ufeats", + description="Universal morphological features"), + out_baseform: Output = Output(":stanza.baseform", cls="token:baseform", + description="Baseform from Stanza"), + out_dephead: Output = Output(":stanza.dephead", cls="token:dephead", + description="Positions of the dependency heads"), + out_dephead_ref: Output = Output(":stanza.dephead_ref", cls="token:dephead_ref", + description="Sentence-relative positions of the dependency heads"), + out_deprel: Output = Output(":stanza.deprel", cls="token:deprel", + description="Dependency relations to the head"), + word: Annotation = Annotation(""), + token: Annotation = Annotation(""), + sentence: Annotation = Annotation(""), + pos_model: Model = Model("[stanza.swe_pos_model]"), + pos_pretrain_model: Model = Model("[stanza.swe_pretrain_pos_model]"), + lem_model: Model = Model("[stanza.swe_lem_model]"), + dep_model: Model = Model("[stanza.swe_dep_model]"), + dep_pretrain_model: Model = Model("[stanza.swe_pretrain_dep_model]"), + resources_file: Model = Model("[stanza.resources_file]"), + use_gpu: bool = Config("stanza.use_gpu"), + batch_size: int = Config("stanza.batch_size"), + max_sentence_length: int = Config("stanza.max_sentence_length"), + cpu_fallback: bool = Config("stanza.cpu_fallback"), + max_token_length: int = Config("stanza.max_token_length")): + """Do dependency parsing using Stanza.""" + import stanza + + # cpu_fallback only makes sense if use_gpu is True + cpu_fallback = cpu_fallback and use_gpu + + sentences_all, orphans = sentence.get_children(token) + if orphans: + logger.warning(f"Found {len(orphans)} tokens not belonging to any sentence. These will not be annotated with " + f"dependency relations.") + + sentences_dep = [] + sentences_pos = [] + skipped = 0 + skipped_token = 0 + + word_list = list(word.read()) + + for s in sentences_all: + if not s: + continue + elif len(s) > batch_size: + skipped += 1 + else: + if max_token_length: + skip = False + for i in s: + if len(word_list[i]) > max_token_length: + skipped_token += 1 + skip = True + break + if skip: + continue + if len(s) <= max_sentence_length or not max_sentence_length: + sentences_dep.append(s) + else: + sentences_pos.append(s) + + if sentences_pos and not cpu_fallback: + n = len(sentences_pos) + logger.warning(f"Found {n} sentence{'s' if n > 1 else ''} exceeding the max sentence length " + f"({max_sentence_length}). {'These' if n > 1 else 'This'} sentence{'s' if n > 1 else ''} will " + "not be annotated with dependency relations.") + if skipped: + logger.warning(f"Found {skipped} sentence{'s' if skipped > 1 else ''} exceeding the batch size " + f"({batch_size}) in number of tokens. 
{'These' if skipped > 1 else 'This'} " + f"sentence{'s' if skipped > 1 else ''} will not be annotated.") + if skipped_token: + logger.warning(f"Found {skipped_token} sentence{'s' if skipped_token > 1 else ''} with tokens exceeding the " + f"max token length ({max_token_length}). {'These' if skipped_token > 1 else 'This'} " + f"sentence{'s' if skipped_token > 1 else ''} will not be annotated.") + if orphans: + sentences_pos.append(orphans) + msd = word.create_empty_attribute() + pos = word.create_empty_attribute() + feats = word.create_empty_attribute() + baseforms = word.create_empty_attribute() + dephead = word.create_empty_attribute() + dephead_ref = word.create_empty_attribute() + deprel = word.create_empty_attribute() + + nlp_args = { + "lang": "sv", + "dir": str(resources_file.path.parent), + "tokenize_pretokenized": True, # Assume the text is tokenized by whitespace and sentence split by newline. + "lemma_model_path": str(lem_model.path), + "pos_pretrain_path": str(pos_pretrain_model.path), + "pos_model_path": str(pos_model.path), + "depparse_pretrain_path": str(dep_pretrain_model.path), + "depparse_model_path": str(dep_model.path), + "depparse_max_sentence_size": 200, # Create new batch when encountering sentences larger than this + "depparse_batch_size": batch_size, + "pos_batch_size": batch_size, + "lemma_batch_size": batch_size, + "verbose": False + } + + for sentences, dep, fallback in ((sentences_dep, True, False), (sentences_pos, False, cpu_fallback)): + if not sentences: + continue + + # Init Stanza pipeline + if dep or fallback: + logger.debug(f"Running dependency parsing and POS-tagging on {len(sentences)} sentences" + f" (using {'GPU' if use_gpu and not fallback else 'CPU'}).") + nlp_args["processors"] = "tokenize,pos,lemma,depparse" # Comma-separated list of processors to use + nlp_args["use_gpu"] = use_gpu and not fallback + nlp = stanza.Pipeline(**nlp_args) + + else: + logger.debug(f"Running POS-tagging on {len(sentences)} sentences.") + nlp_args["processors"] = "tokenize,pos" # Comma-separated list of processors to use + nlp_args["use_gpu"] = use_gpu + nlp = stanza.Pipeline(**nlp_args) + + # Format document for stanza: list of lists of string + document = [[word_list[i] for i in s] for s in sentences] + + doc = stanza_utils.run_stanza(nlp, document, batch_size, max_sentence_length) + stanza_utils.check_sentence_respect(len(list(s for s in sentences if s)), len(doc.sentences)) + word_count_real = sum(len(s) for s in sentences) + word_count = 0 + for sent, tagged_sent in zip(sentences, doc.sentences): + for w_index, w in zip(sent, tagged_sent.words): + feats_str = util.misc.cwbset(w.feats.split("|") if w.feats else "") + msd[w_index] = w.xpos + pos[w_index] = w.upos + feats[w_index] = feats_str + baseforms[w_index] = w.lemma + if dep or fallback: + dephead[w_index] = str(sent[w.head - 1]) if w.head > 0 else "-" + dephead_ref[w_index] = str(w.head) if w.head > 0 else "" + deprel[w_index] = w.deprel + word_count += len(tagged_sent.words) + stanza_utils.check_token_respect(word_count_real, word_count) + + out_msd.write(msd) + out_pos.write(pos) + out_feats.write(feats) + out_baseform.write(baseforms) + out_dephead_ref.write(dephead_ref) + out_dephead.write(dephead) + out_deprel.write(deprel) + + +@annotator("Part-of-speech annotation with morphological descriptions from Stanza", language=["swe"], order=2) +def msdtag(out_msd: Output = Output(":stanza.msd", cls="token:msd", + description="Part-of-speeches with morphological descriptions"), + out_pos: Output = 
Output(":stanza.pos", cls="token:pos", description="Part-of-speech tags"), + out_feats: Output = Output(":stanza.ufeats", cls="token:ufeats", + description="Universal morphological features"), + word: Annotation = Annotation(""), + token: Annotation = Annotation(""), + sentence: Annotation = Annotation(""), + model: Model = Model("[stanza.swe_pos_model]"), + pretrain_model: Model = Model("[stanza.swe_pretrain_pos_model]"), + resources_file: Model = Model("[stanza.resources_file]"), + use_gpu: bool = Config("stanza.use_gpu"), + batch_size: int = Config("stanza.batch_size")): + """Do dependency parsing using Stanza.""" + import stanza + + sentences, orphans = sentence.get_children(token) + sentences.append(orphans) + word_list = list(word.read()) + msd = word.create_empty_attribute() + pos = word.create_empty_attribute() + feats = word.create_empty_attribute() + + # Format document for stanza: list of lists of string + document = [[word_list[i] for i in s] for s in sentences] + + # Init Stanza Pipeline + nlp = stanza.Pipeline({ + "lang": "sv", + "processors": "tokenize,pos", + "dir": str(resources_file.path.parent), + "tokenize_pretokenized": True, # Assume the text is tokenized by whitespace and sentence split by newline. + "pos_pretrain_path": str(pretrain_model.path), + "pos_model_path": str(model.path), + "pos_batch_size": batch_size, + "use_gpu": use_gpu, + "verbose": False + }) + + doc = stanza_utils.run_stanza(nlp, document, batch_size) + stanza_utils.check_sentence_respect(len(list(s for s in sentences if s)), len(doc.sentences)) + word_count = 0 + for sent, tagged_sent in zip(sentences, doc.sentences): + for w_index, w in zip(sent, tagged_sent.words): + word_count += 1 + feats_str = util.misc.cwbset(w.feats.split("|") if w.feats else "") + msd[w_index] = w.xpos + pos[w_index] = w.upos + feats[w_index] = feats_str + stanza_utils.check_token_respect(len(word_list), word_count) + + out_msd.write(msd) + out_pos.write(pos) + out_feats.write(feats) + + +@annotator("Dependency parsing using Stanza", language=["swe"], order=2) +def dep_parse(out_dephead: Output = Output(":stanza.dephead", cls="token:dephead", + description="Positions of the dependency heads"), + out_dephead_ref: Output = Output(":stanza.dephead_ref", cls="token:dephead_ref", + description="Sentence-relative positions of the dependency heads"), + out_deprel: Output = Output(":stanza.deprel", cls="token:deprel", + description="Dependency relations to the head"), + word: Annotation = Annotation(""), + token: Annotation = Annotation(""), + baseform: Annotation = Annotation(""), + msd: Annotation = Annotation(""), + feats: Annotation = Annotation(""), + ref: Annotation = Annotation(":stanza.ref"), + sentence: Annotation = Annotation(""), + model: Model = Model("[stanza.swe_dep_model]"), + pretrain_model: Model = Model("[stanza.swe_pretrain_dep_model]"), + resources_file: Model = Model("[stanza.resources_file]"), + use_gpu: bool = Config("stanza.use_gpu"), + batch_size: int = Config("stanza.batch_size"), + max_sentence_length: int = Config("stanza.max_sentence_length"), + cpu_fallback: bool = Config("stanza.cpu_fallback")): + """Do dependency parsing using Stanza.""" + import stanza + from stanza.models.common.doc import Document + + # cpu_fallback only makes sense if use_gpu is True + cpu_fallback = cpu_fallback and use_gpu + + sentences_all, orphans = sentence.get_children(token) + if orphans: + logger.warning(f"Found {len(orphans)} tokens not belonging to any sentence. 
These will not be annotated with " + f"dependency relations.") + sentences_dep = [] + sentences_fallback = [] + skipped_sent = 0 + skipped_batch = 0 + + for s in sentences_all: + if len(s) > batch_size: + skipped_batch += 1 + elif max_sentence_length and len(s) > max_sentence_length: + if cpu_fallback: + sentences_fallback.append(s) + else: + skipped_sent += 1 + else: + sentences_dep.append(s) + + if skipped_sent: + logger.warning(f"Found {skipped_sent} sentence{'s' if skipped_sent > 1 else ''} exceeding the max sentence " + f"length ({max_sentence_length}). {'These' if skipped_sent > 1 else 'This'} " + f"sentence{'s' if skipped_sent > 1 else ''} will not be annotated.") + if skipped_batch: + logger.warning(f"Found {skipped_batch} sentence{'s' if skipped_batch > 1 else ''} exceeding the batch size " + f"({batch_size}) in number of tokens. {'These' if skipped_batch > 1 else 'This'} " + f"sentence{'s' if skipped_batch > 1 else ''} will not be annotated.") + + word_vals = list(word.read()) + baseform_vals = list(baseform.read()) + msd_vals = list(msd.read()) + feats_vals = list(feats.read()) + ref_vals = list(ref.read()) + + dephead = word.create_empty_attribute() + dephead_ref = word.create_empty_attribute() + deprel = word.create_empty_attribute() + + for sentences, fallback in ((sentences_dep, False), (sentences_fallback, cpu_fallback)): + if not sentences: + continue + + document = _build_doc(sentences, + word_vals, + baseform_vals, + msd_vals, + feats_vals, + ref_vals) + + # Init Stanza Pipeline + nlp = stanza.Pipeline({ + "lang": "sv", + "dir": str(resources_file.path.parent), + "processors": "depparse", + "depparse_pretrain_path": str(pretrain_model.path), + "depparse_model_path": str(model.path), + "depparse_max_sentence_size": 200, # Create new batch when encountering sentences larger than this + "depparse_batch_size": batch_size, + "use_gpu": use_gpu and not fallback, + "verbose": False + }) + + doc = stanza_utils.run_stanza(nlp, Document(document), batch_size, max_sentence_length) + for sent, tagged_sent in zip(sentences, doc.sentences): + for w_index, w in zip(sent, tagged_sent.words): + dephead_str = str(sent[w.head - 1]) if w.head > 0 else "-" + dephead_ref_str = str(w.head) if w.head > 0 else "" + dephead[w_index] = dephead_str + dephead_ref[w_index] = dephead_ref_str + deprel[w_index] = w.deprel + + out_dephead_ref.write(dephead_ref) + out_dephead.write(dephead) + out_deprel.write(deprel) + + +@annotator("Extract POS from MSD", language=["swe", "swe-1800"]) +def msd_backoff_hunpos( + stanza_msd: Annotation = Annotation(":stanza.msd"), + hunpos_msd: Annotation = Annotation(":hunpos.msd"), + out: Output = Output(":stanza.msd_hunpos_backoff", cls="token:msd", description="Part-of-speech tags with " + "morphological descriptions from Stanza or Hunpos."), + info: Output = Output(":stanza.msd_hunpos_backoff_info", description="Info about which annotator each msd " + "annotation was produced with.")): + """Replace empty values in 'stanza_msd' with values from 'hunpos_msd'.""" + from sparv.modules.misc import misc + misc.backoff_with_info(chunk=stanza_msd, backoff=hunpos_msd, out=out, out_info=info, chunk_name="stanza", + backoff_name="hunpos") + + +@annotator("Extract POS from MSD", language=["swe", "swe-1800"]) +def pos_backoff_hunpos( + stanza_pos: Annotation = Annotation(":stanza.pos"), + hunpos_pos: Annotation = Annotation(":hunpos.pos"), + out: Output = Output(":stanza.pos_hunpos_backoff", cls="token:pos", + description="Part-of-speech tags from Stanza or Hunpos."), + info: 
Output = Output(":stanza.pos_hunpos_backoff_info", description="Info about which annotator each pos " + "annotation was produced with.")): + """Replace empty values in 'stanza_pos' with values from 'hunpos_pos'.""" + from sparv.modules.misc import misc + misc.backoff_with_info(chunk=stanza_pos, backoff=hunpos_pos, out=out, out_info=info, chunk_name="stanza", + backoff_name="hunpos") + + +def _build_doc(sentences, word, baseform, msd, feats, ref): + """Build stanza input for dependency parsing.""" + document = [] + for sent in sentences: + in_sent = [] + for i in sent: + # Format feats + feats_list = util.misc.set_to_list(feats[i]) + if not feats_list: + feats_str = "_" + else: + feats_str = "|".join(feats_list) + # Format baseform + baseform_list = util.misc.set_to_list(baseform[i]) + if not baseform_list: + baseform_str = word[i] + else: + baseform_str = baseform_list[0] + + token_dict = {"id": int(ref[i]), "text": word[i], "lemma": baseform_str, + "xpos": msd[i], "feats": feats_str} + in_sent.append(token_dict) + # logger.debug("\t".join(str(v) for v in token_dict.values())) + if in_sent: + document.append(in_sent) + return document diff --git a/sparv/modules/stanza/stanza_utils.py b/sparv/modules/stanza/stanza_utils.py new file mode 100644 index 00000000..f5419468 --- /dev/null +++ b/sparv/modules/stanza/stanza_utils.py @@ -0,0 +1,52 @@ +"""Util functions used in stanza.""" + +from sparv.api import Annotation, Output, SparvErrorMessage, annotator, util + + +@annotator("Annotate tokens with IDs relative to their sentences") +def make_ref(out: Output = Output(":stanza.ref", cls="token:ref", + description="Token IDs relative to their sentences"), + sentence: Annotation = Annotation(""), + token: Annotation = Annotation("")): + """Annotate tokens with IDs relative to their sentences.""" + from sparv.modules.misc import number + number.number_relative(out, sentence, token) + + +def run_stanza(nlp, document, batch_size, max_sentence_length: int = 0, max_token_length: int = 0): + """Run Stanza and handle possible errors.""" + try: + doc = nlp(document) + except RuntimeError as e: + gpu_error = "CUDA out of memory" in str(e) + cpu_error = "DefaultCPUAllocator: can't allocate memory" in str(e) + if gpu_error or cpu_error: + msg = "Stanza ran out of memory. You can try the following options to prevent this from happening:\n" \ + " - Limit the number of parallel Stanza processes by using the 'threads' section in your Sparv " \ + "configuration.\n" \ + " - Limit the Stanza batch size by setting the 'stanza.batch_size' config variable to something " \ + f"lower (current value: {batch_size}).\n" \ + " - Exclude excessively long sentences from dependency parsing by setting the " \ + "'stanza.max_sentence_length' config variable to something lower (current value: " \ + f"{max_sentence_length or 'disabled'}).\n" \ + " - Exclude sentences with unreasonably long tokens by setting the " \ + "'stanza.max_token_length' config variable to something lower (current value: " \ + f"{max_token_length or 'disabled'})." + if gpu_error: + msg += "\n - Switch to using CPU by setting the 'stanza.use_gpu' config variable to false." 
+ else: + msg = str(e) + raise SparvErrorMessage(msg) + return doc + + +def check_sentence_respect(sparv_sent_len: int, stanza_sent_len: int): + """Check whether Stanza respected the given sentence segmentation.""" + if sparv_sent_len != stanza_sent_len: + raise SparvErrorMessage("The Stanza pipeline did not seem to respect the given sentence segmentation!") + + +def check_token_respect(sparv_token_len: int, stanza_token_len: int): + """Check whether Stanza respected the given tokenization.""" + if sparv_token_len != stanza_token_len: + raise SparvErrorMessage("Stanza pipeline did not seem to respect the given tokenisation!") diff --git a/sparv/modules/stats_export/__init__.py b/sparv/modules/stats_export/__init__.py index dcd7d7af..88ba44df 100644 --- a/sparv/modules/stats_export/__init__.py +++ b/sparv/modules/stats_export/__init__.py @@ -1,3 +1,17 @@ """Word frequency list generation.""" -from . import stats_export +from sparv.api import Config + +from . import stats_export, sbx_stats + +__config__ = [ + Config("stats_export.annotations", description="Sparv annotations to include."), + Config("stats_export.source_annotations", + description="List of annotations and attributes from the source data to include. None will be included by " + "default."), + Config("stats_export.delimiter", default="\t", description="Delimiter separating columns"), + Config("stats_export.cutoff", default=1, + description="The minimum frequency a word must have in order to be included in the result"), + Config("stats_export.remote_host", "", description="Remote host to install to"), + Config("stats_export.remote_dir", "", description="Path on remote host to install to") +] diff --git a/sparv/modules/stats_export/sbx_stats.py b/sparv/modules/stats_export/sbx_stats.py new file mode 100644 index 00000000..f262d621 --- /dev/null +++ b/sparv/modules/stats_export/sbx_stats.py @@ -0,0 +1,217 @@ +"""SBX specific annotation and export functions related to the stats export.""" + +from sparv.api import (AllSourceFilenames, Annotation, AnnotationAllSourceFiles, Config, Export, ExportInput, Output, + OutputCommonData, annotator, exporter, get_logger, installer, util) + +from .stats_export import freq_list + +logger = get_logger(__name__) + + +@annotator("Extract the complemgram with the highest score", language=["swe"]) +def best_complemgram( + out: Output = Output(":stats_export.complemgram_best", description="Complemgram annotation with highest score"), + complemgram: Annotation = Annotation(":saldo.complemgram")): + """Extract the complemgram with the highest score.""" + from sparv.modules.misc import misc + misc.best_from_set(out, complemgram, is_sorted=True) + + +@annotator("Extract the sense with the highest score", language=["swe"]) +def best_sense( + out: Output = Output(":stats_export.sense_best", description="Sense annotation with highest score"), + sense: Annotation = Annotation(":wsd.sense")): + """Extract the sense annotation with the highest score.""" + from sparv.modules.misc import misc + misc.best_from_set(out, sense, is_sorted=True) + + +@annotator("Extract the first baseform annotation from a set of baseforms", language=["swe"]) +def first_baseform( + out: Output = Output(":stats_export.baseform_first", description="First baseform from a set of baseforms"), + baseform: Annotation = Annotation("")): + """Extract the first baseform annotation from a set of baseforms.""" + from sparv.modules.misc import misc + misc.first_from_set(out, baseform) + + +@annotator("Extract the first lemgram annotation from a set 
of lemgrams", language=["swe"]) +def first_lemgram( + out: Output = Output(":stats_export.lemgram_first", description="First lemgram from a set of lemgrams"), + lemgram: Annotation = Annotation(":saldo.lemgram")): + """Extract the first lemgram annotation from a set of lemgrams.""" + from sparv.modules.misc import misc + misc.first_from_set(out, lemgram) + + +@annotator("Get the best complemgram if the token is lacking a sense annotation", language=["swe"]) +def conditional_best_complemgram( + out_complemgrams: Output = Output(":stats_export.complemgram_best_cond", + description="Compound analysis using lemgrams"), + complemgrams: Annotation= Annotation(":stats_export.complemgram_best"), + sense: Annotation = Annotation("")): + """Get the best complemgram if the token is lacking a sense annotation.""" + all_annotations = list(complemgrams.read_attributes((complemgrams, sense))) + short_complemgrams = [] + for complemgram, sense in all_annotations: + if sense and sense != "|": + complemgram = "" + short_complemgrams.append(complemgram) + out_complemgrams.write(short_complemgrams) + + +@exporter("Corpus word frequency list", language=["swe"], order=1) +def sbx_freq_list( + source_files: AllSourceFilenames = AllSourceFilenames(), + word: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + token: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + msd: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + baseform: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":stats_export.baseform_first"), + sense: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":stats_export.sense_best"), + lemgram: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":stats_export.lemgram_first"), + complemgram: AnnotationAllSourceFiles = AnnotationAllSourceFiles( + ":stats_export.complemgram_best_cond"), + out: Export = Export("stats_export.frequency_list_sbx/stats_[metadata.id].csv"), + delimiter: str = Config("stats_export.delimiter"), + cutoff: int = Config("stats_export.cutoff")): + """Create a word frequency list for the entire corpus. + + Args: + source_files (list, optional): The source files belonging to this corpus. Defaults to AllSourceFilenames. + word (str, optional): Word annotations. Defaults to AnnotationAllSourceFiles(""). + token (str, optional): Token span annotations. Defaults to AnnotationAllSourceFiles(""). + msd (str, optional): MSD annotations. Defaults to AnnotationAllSourceFiles(""). + baseform (str, optional): Annotations with first baseform from each set. + Defaults to AnnotationAllSourceFiles(""). + sense (str, optional): Best sense annotations. Defaults to AnnotationAllSourceFiles(""). + lemgram (str, optional): Annotations with first lemgram from each set. + Defaults to AnnotationAllSourceFiles(":saldo.lemgram"). + complemgram (str, optional): Conditional best compound lemgram annotations. + Defaults to AnnotationAllSourceFiles(":saldo.complemgram"). + out (str, optional): The output word frequency file. + Defaults to Export("stats_export.frequency_list_sbx/[metadata.id].csv"). + delimiter (str, optional): Column delimiter to use in the csv. Defaults to Config("stats_export.delimiter"). + cutoff (int, optional): The minimum frequency a word must have in order to be included in the result. + Defaults to Config("stats_export.cutoff"). 
+ """ + annotations = [(word, "token"), (msd, "POS"), (baseform, "lemma"), (sense, "SALDO sense"), (lemgram, "lemgram"), + (complemgram, "compound")] + + freq_list(source_files=source_files, word=word, token=token, annotations=annotations, source_annotations=[], + out=out, sparv_namespace="", source_namespace="", delimiter=delimiter, cutoff=cutoff) + + +@exporter("Corpus word frequency list", language=["swe"]) +def sbx_freq_list_date( + source_files: AllSourceFilenames = AllSourceFilenames(), + word: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + token: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + msd: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + baseform: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":stats_export.baseform_first"), + sense: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":stats_export.sense_best"), + lemgram: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":stats_export.lemgram_first"), + complemgram: AnnotationAllSourceFiles = AnnotationAllSourceFiles( + ":stats_export.complemgram_best_cond"), + date: AnnotationAllSourceFiles = AnnotationAllSourceFiles("[dateformat.datetime_from]"), + out: Export = Export("stats_export.frequency_list_sbx_date/stats_[metadata.id].csv"), + delimiter: str = Config("stats_export.delimiter"), + cutoff: int = Config("stats_export.cutoff")): + """Create a word frequency list for the entire corpus. + + Args: + source_files (list, optional): The source files belonging to this corpus. Defaults to AllSourceFilenames. + word (str, optional): Word annotations. Defaults to AnnotationAllSourceFiles(""). + token (str, optional): Token span annotations. Defaults to AnnotationAllSourceFiles(""). + msd (str, optional): MSD annotations. Defaults to AnnotationAllSourceFiles(""). + baseform (str, optional): Annotations with first baseform from each set. + Defaults to AnnotationAllSourceFiles(""). + sense (str, optional): Best sense annotations. Defaults to AnnotationAllSourceFiles(""). + lemgram (str, optional): Annotations with first lemgram from each set. + Defaults to AnnotationAllSourceFiles(":saldo.lemgram"). + complemgram (str, optional): Conditional best compound lemgram annotations. + Defaults to AnnotationAllSourceFiles(":saldo.complemgram"). + date (str, optional): date annotation + out (str, optional): The output word frequency file. + Defaults to Export("stats_export.frequency_list_sbx_date/[metadata.id].csv"). + delimiter (str, optional): Column delimiter to use in the csv. Defaults to Config("stats_export.delimiter"). + cutoff (int, optional): The minimum frequency a word must have in order to be included in the result. + Defaults to Config("stats_export.cutoff"). 
+ """ + annotations = [(word, "token"), (msd, "POS"), (baseform, "lemma"), (sense, "SALDO sense"), (lemgram, "lemgram"), + (complemgram, "compound"), (date, "date")] + + freq_list(source_files=source_files, word=word, token=token, annotations=annotations, source_annotations=[], + out=out, sparv_namespace="", source_namespace="", delimiter=delimiter, cutoff=cutoff) + + +@exporter("Corpus word frequency list (without Swedish annotations)", language=["swe"], order=2) +def sbx_freq_list_simple_swe( + source_files: AllSourceFilenames = AllSourceFilenames(), + token: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + word: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + pos: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + baseform: AnnotationAllSourceFiles = AnnotationAllSourceFiles(":stats_export.baseform_first"), + out: Export = Export("stats_export.frequency_list_sbx/stats_[metadata.id].csv"), + delimiter: str = Config("stats_export.delimiter"), + cutoff: int = Config("stats_export.cutoff")): + """Create a word frequency list for a corpus without sense, lemgram and complemgram annotations.""" + annotations = [(word, "token"), (pos, "POS"), (baseform, "lemma")] + + freq_list(source_files=source_files, word=word, token=token, annotations=annotations, source_annotations=[], + out=out, sparv_namespace="", source_namespace="", delimiter=delimiter, cutoff=cutoff) + + +@exporter("Corpus word frequency list (without Swedish annotations)", order=3) +def sbx_freq_list_simple( + source_files: AllSourceFilenames = AllSourceFilenames(), + token: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + word: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + pos: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + baseform: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + out: Export = Export("stats_export.frequency_list_sbx/stats_[metadata.id].csv"), + delimiter: str = Config("stats_export.delimiter"), + cutoff: int = Config("stats_export.cutoff")): + """Create a word frequency list for a corpus without sense, lemgram and complemgram annotations.""" + annotations = [(word, "token"), (pos, "POS"), (baseform, "lemma")] + + freq_list(source_files=source_files, word=word, token=token, annotations=annotations, source_annotations=[], + out=out, sparv_namespace="", source_namespace="", delimiter=delimiter, cutoff=cutoff) + + +@exporter("Corpus word frequency list for Old Swedish (without part-of-speech)", language=["swe-fsv"], order=4) +def sbx_freq_list_fsv( + source_files: AllSourceFilenames = AllSourceFilenames(), + token: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + word: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + baseform: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + lemgram: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + out: Export = Export("stats_export.frequency_list_sbx/stats_[metadata.id].csv"), + delimiter: str = Config("stats_export.delimiter"), + cutoff: int = Config("stats_export.cutoff")): + """Create a word frequency list for a corpus without sense, lemgram and complemgram annotations.""" + annotations = [(word, "token"), (baseform, "lemma"), (lemgram, "lemgram")] + + freq_list(source_files=source_files, word=word, token=token, annotations=annotations, source_annotations=[], + out=out, sparv_namespace="", source_namespace="", delimiter=delimiter, cutoff=cutoff) + + +@installer("Install SBX word frequency list on remote host") +def install_sbx_freq_list( + freq_list: ExportInput = 
ExportInput("stats_export.frequency_list_sbx/stats_[metadata.id].csv"), + out: OutputCommonData = OutputCommonData("stats_export.install_sbx_freq_list_marker"), + host: str = Config("stats_export.remote_host"), + target_dir: str = Config("stats_export.remote_dir")): + """Install frequency list on server by rsyncing.""" + util.install.install_file(freq_list, host, target_dir) + out.write("") + + +@installer("Install SBX word frequency list with dates on remote host") +def install_sbx_freq_list_date( + freq_list: ExportInput = ExportInput("stats_export.frequency_list_sbx_date/stats_[metadata.id].csv"), + out: OutputCommonData = OutputCommonData("stats_export.install_sbx_freq_list_date_marker"), + host: str = Config("stats_export.remote_host"), + target_dir: str = Config("stats_export.remote_dir")): + """Install frequency list on server by rsyncing.""" + util.install.install_file(freq_list, host, target_dir) + out.write("") diff --git a/sparv/modules/stats_export/stats_export.py b/sparv/modules/stats_export/stats_export.py index 4f7e0fd1..08d263aa 100644 --- a/sparv/modules/stats_export/stats_export.py +++ b/sparv/modules/stats_export/stats_export.py @@ -1,109 +1,112 @@ """Build word frequency list.""" import csv -import logging from collections import defaultdict -from sparv import AllDocuments, AnnotationAllDocs, Corpus, Export, exporter, Config - -log = logging.getLogger(__name__) - - -@exporter("Corpus word frequency list", language=["swe"], order=1, config=[ - Config("stats_export.include_all_compounds", default=False, - description="Whether to include compound analyses for every word or just for the words that are lacking " - "a sense annotation") -]) -def freq_list(corpus: Corpus = Corpus(), - docs: AllDocuments = AllDocuments(), - word: AnnotationAllDocs = AnnotationAllDocs(""), - msd: AnnotationAllDocs = AnnotationAllDocs(""), - baseform: AnnotationAllDocs = AnnotationAllDocs(""), - sense: AnnotationAllDocs = AnnotationAllDocs(""), - lemgram: AnnotationAllDocs = AnnotationAllDocs(":saldo.lemgram"), - complemgram: AnnotationAllDocs = AnnotationAllDocs(":saldo.complemgram"), - out: Export = Export("frequency_list/stats_[metadata.id].csv"), +from sparv.api import (AllSourceFilenames, Annotation, AnnotationAllSourceFiles, Config, Export, + ExportAnnotationsAllSourceFiles, ExportInput, OutputCommonData, SourceAnnotationsAllSourceFiles, + exporter, get_logger, installer, util) + +logger = get_logger(__name__) + + +@exporter("Corpus word frequency list") +def freq_list(source_files: AllSourceFilenames = AllSourceFilenames(), + word: AnnotationAllSourceFiles = AnnotationAllSourceFiles("[export.word]"), + token: AnnotationAllSourceFiles = AnnotationAllSourceFiles(""), + annotations: ExportAnnotationsAllSourceFiles = + ExportAnnotationsAllSourceFiles("stats_export.annotations"), + source_annotations: SourceAnnotationsAllSourceFiles = SourceAnnotationsAllSourceFiles( + "stats_export.source_annotations"), + remove_namespaces: bool = Config("export.remove_module_namespaces", True), + sparv_namespace: str = Config("export.sparv_namespace"), + source_namespace: str = Config("export.source_namespace"), + out: Export = Export("stats_export.frequency_list/stats_[metadata.id].csv"), delimiter: str = Config("stats_export.delimiter"), - cutoff: int = Config("stats_export.cutoff"), - include_all_compounds: bool = Config("stats_export.include_all_compounds")): + cutoff: int = Config("stats_export.cutoff")): """Create a word frequency list for the entire corpus. 
Args: - corpus (str, optional): The corpus ID. Defaults to Corpus. - docs (list, optional): The documents belonging to this corpus. Defaults to AllDocuments. - word (str, optional): Word annotations. Defaults to AnnotationAllDocs(""). - msd (str, optional): MSD annotations. Defaults to AnnotationAllDocs(""). - baseform (str, optional): Baseform annotations. Defaults to AnnotationAllDocs(""). - sense (str, optional): Sense annotations. Defaults to AnnotationAllDocs(""). - lemgram (str, optional): Lemgram annotations. Defaults to AnnotationAllDocs(":saldo.lemgram"). - complemgram (str, optional): Compound lemgram annotations. - Defaults to AnnotationAllDocs(":saldo.complemgram"). - out (str, optional): The output word frequency file. Defaults to Export("frequency_list/[metadata.id].csv"). + source_files (list, optional): The source files belonging to this corpus. Defaults to AllSourceFilenames. + word (str, optional): Word annotations. Defaults to AnnotationAllSourceFiles(""). + token (str, optional): Token span annotations. Defaults to AnnotationAllSourceFiles(""). + annotations (str, optional): All automatic annotations to include in the export. Defaults to + ExportAnnotationsAllSourceFiles("stats_export.annotations"). + source_annotations (str, optional): All source annotations to include in the export. If left empty, none will be + included. Defaults to SourceAnnotations("stats_export.source_annotations"). + remove_namespaces: Whether to remove module "namespaces" from element and attribute names. + Disabled by default. + sparv_namespace: The namespace to be added to all Sparv annotations. + source_namespace: The namespace to be added to all annotations present in the source. + out (str, optional): The output word frequency file. + Defaults to Export("stats_export.frequency_list/[metadata.id].csv"). delimiter (str, optional): Column delimiter to use in the csv. Defaults to Config("stats_export.delimiter"). cutoff (int, optional): The minimum frequency a word must have in order to be included in the result. Defaults to Config("stats_export.cutoff"). - include_all_compounds (bool, optional): Whether to include compound analyses for every word - or just for the words that are lacking a sense annotation. - Defaults to Config("stats_export.include_all_compounds"). 
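# Illustrative sketch (not taken from the Sparv codebase): the core idea behind
# the rewritten freq_list/write_csv below, assuming a plain list of per-token
# annotation tuples. Each tuple is counted, and rows are written in descending
# frequency order until the cutoff is reached. The sample tokens, column names,
# cutoff value and output filename are invented for illustration only.
import csv
from collections import defaultdict

tokens = [("katten", "NN", "katt"), ("springer", "VB", "springa"), ("katten", "NN", "katt")]
freq_dict = defaultdict(int)
for annotation_tuple in tokens:
    freq_dict[annotation_tuple] += 1  # count identical (word, POS, lemma) combinations

cutoff = 2
with open("stats_example.csv", "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter="\t")
    writer.writerow(["token", "POS", "lemma", "count"])
    for annotation_tuple, freq in sorted(freq_dict.items(), key=lambda x: -x[1]):
        if cutoff and cutoff > freq:
            break  # rows are sorted by frequency, so everything after this point is below the cutoff
        writer.writerow(list(annotation_tuple) + [freq])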
""" - freq_dict = defaultdict(int) - - for doc in docs: - tokens = word.read_attributes(doc, [word, msd, baseform, sense, lemgram, complemgram]) - update_freqs(tokens, freq_dict, include_all_compounds) - - write_csv(out, freq_dict, delimiter, cutoff) - - -@exporter("Corpus word frequency list (without Swedish annotations)", order=2, config=[ - Config("stats_export.delimiter", default="\t", description="Delimiter separating columns"), - Config("stats_export.cutoff", default=1, - description="The minimum frequency a word must have in order to be included in the result"), -]) -def freq_list_simple(corpus: Corpus = Corpus(), - docs: AllDocuments = AllDocuments(), - word: AnnotationAllDocs = AnnotationAllDocs(""), - pos: AnnotationAllDocs = AnnotationAllDocs(""), - baseform: AnnotationAllDocs = AnnotationAllDocs(""), - out: Export = Export("frequency_list/stats_[metadata.id].csv"), - delimiter: str = Config("stats_export.delimiter"), - cutoff: int = Config("stats_export.cutoff")): - """Create a word frequency list for a corpus without sense, lemgram and complemgram annotations.""" - freq_dict = defaultdict(int) - - for doc in docs: - simple_tokens = word.read_attributes(doc, [word, pos, baseform]) + # Add "word" to annotations + annotations = [(word, None)] + annotations - # Add empty annotations for sense, lemgram and complemgram - tokens = [] - for w, p, b in simple_tokens: - tokens.append((w, p, b, "|", "|", "|")) - update_freqs(tokens, freq_dict) + # Get annotations list and export names + annotation_list, token_attributes, export_names = util.export.get_annotation_names( + annotations, source_annotations or [], source_files=source_files, token_name=token.name, + remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace, source_namespace=source_namespace) - write_csv(out, freq_dict, delimiter, cutoff) + # Get all token and struct annotations (except the span annotations) + token_annotations = [a for a in annotation_list if a.attribute_name in token_attributes] + struct_annotations = [a for a in annotation_list if ":" in a.name and a.attribute_name not in token_attributes] - -def update_freqs(tokens, freq_dict, include_all_compounds=False): - """Extract annotation info and update frequencies.""" - for word, msd, baseform, sense, lemgram, complemgram in tokens: - if "|" in baseform: - baseform = baseform.split("|")[1] - sense = sense.split("|")[1].split(":")[0] - lemgram = lemgram.split("|")[1].split(":")[0] - complemgram = complemgram.split("|")[1].split(":")[0] - if not include_all_compounds: - if sense: - complemgram = "" - freq_dict[(word, msd, baseform, sense, lemgram, complemgram)] += 1 - - -def write_csv(out, freq_dict, delimiter, cutoff): + # Calculate token frequencies + freq_dict = defaultdict(int) + for source_file in source_files: + # Get values for struct annotations (per token) + struct_values = [] + for struct_annotation in struct_annotations: + struct_annot = Annotation(struct_annotation.name, source_file=source_file) + token_parents = Annotation(token.name, source_file=source_file).get_parents(struct_annot) + try: + struct_annot_list = list(struct_annot.read()) + struct_values.append([struct_annot_list[p] if p is not None else "" for p in token_parents]) + # Handle cases where some source files are missing structural source annotations + except FileNotFoundError: + struct_values.append(["" for _ in token_parents]) + + # Create tuples with annotations for each token and count frequencies + tokens = word.read_attributes(source_file, token_annotations) + for n, 
token_annotations_tuple in enumerate(tokens): + structs_tuple = tuple([struct[n] for struct in struct_values]) + freq_dict[token_annotations_tuple + structs_tuple] += 1 + + # Create header + struct_header_names = [export_names.get(a.annotation_name, a.annotation_name) + ":" + export_names[a.name] + for a in struct_annotations] + column_names = [export_names[a.name] for a in token_annotations] + struct_header_names + column_names.append("count") + + write_csv(out, column_names, freq_dict, delimiter, cutoff) + + +@installer("Install word frequency list on remote host") +def install_freq_list(freq_list: ExportInput = ExportInput("stats_export.frequency_list/stats_[metadata.id].csv"), + out: OutputCommonData = OutputCommonData("stats_export.install_freq_list_marker"), + host: str = Config("stats_export.remote_host"), + target_dir: str = Config("stats_export.remote_dir")): + """Install frequency list on server by rsyncing.""" + util.install.install_file(freq_list, host, target_dir) + out.write("") + + +################################################################################ +# Auxiliaries +################################################################################ + +def write_csv(out, column_names, freq_dict, delimiter, cutoff): """Write csv file.""" - with open(out, "w") as csvfile: + with open(out, "w", encoding="utf-8") as csvfile: csv_writer = csv.writer(csvfile, delimiter=delimiter) - csv_writer.writerow(["token", "POS", "lemma", "SALDO sense", "lemgram", "compound", "count"]) - for (wordform, msd, lemma, sense, lemgram, complemgram), freq in sorted(freq_dict.items(), key=lambda x: -x[1]): + csv_writer.writerow(column_names) + for annotations, freq in sorted(freq_dict.items(), key=lambda x: -x[1]): if cutoff and cutoff > freq: break - csv_writer.writerow([wordform, msd, lemma, sense, lemgram, complemgram, freq]) - log.info("Exported: %s", out) + csv_writer.writerow(list(annotations) + [freq]) + logger.info("Exported: %s", out) diff --git a/sparv/modules/swener/swener.py b/sparv/modules/swener/swener.py index c0a94da2..d1d4b1e8 100644 --- a/sparv/modules/swener/swener.py +++ b/sparv/modules/swener/swener.py @@ -1,14 +1,12 @@ """Named entity tagging with SweNER.""" -import logging import re import xml.etree.ElementTree as etree import xml.sax.saxutils -import sparv.util as util -from sparv import Annotation, Binary, Config, Output, annotator +from sparv.api import Annotation, Binary, Config, Output, SparvErrorMessage, annotator, get_logger, util -log = logging.getLogger(__name__) +logger = get_logger(__name__) RESTART_THRESHOLD_LENGTH = 64000 SENT_SEP = "\n" @@ -34,12 +32,12 @@ def annotate(out_ne: Output = Output("swener.ne", cls="named_entity", descriptio SweNER is either run in an already started process defined in process_dict, or a new process is started(default) - - doc, word, sentence, token: existing annotations + - word, sentence, token: existing annotations - out_ne_ex, out_ne_type, out_ne_subtype: resulting annotation files for the named entities - process_dict is used in the catapult and should never be set from the command line """ - if process_dict is None: - process = swenerstart(binary, "", util.UTF8, verbose=False) + # if process_dict is None: + process = swenerstart(binary, "", util.constants.UTF8, verbose=False) # else: # process = process_dict["process"] # # If process seems dead, spawn a new one @@ -59,7 +57,7 @@ def annotate(out_ne: Output = Output("swener.ne", cls="named_entity", descriptio stdin = xml.sax.saxutils.escape(stdin) # keep_process = 
len(stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None - # log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process) + # logger.info("Stdin length: %s, keep process: %s", len(stdin), keep_process) # if process_dict is not None: # process_dict["restart"] = not keep_process @@ -74,12 +72,14 @@ def annotate(out_ne: Output = Output("swener.ne", cls="named_entity", descriptio # else: # Otherwise use communicate which buffers properly - # log.info("STDIN %s %s", type(stdin.encode(encoding)), stdin.encode(encoding)) - stdout, _ = process.communicate(stdin.encode(util.UTF8)) - # log.info("STDOUT %s %s", type(stdout.decode(encoding)), stdout.decode(encoding)) + # logger.info("STDIN %s %s", type(stdin.encode(encoding)), stdin.encode(encoding)) + stdout, stderr = process.communicate(stdin.encode(util.constants.UTF8)) + if process.returncode > 0: + raise SparvErrorMessage(f"An error occurred while running HFST-SweNER:\n\n{stderr.decode()}") + # logger.info("STDOUT %s %s", type(stdout.decode(encoding)), stdout.decode(encoding)) - parse_swener_output(sentences, token, stdout.decode(util.UTF8), out_ne, out_ne_ex, out_ne_type, out_ne_subtype, - out_ne_name) + parse_swener_output(sentences, token, stdout.decode(util.constants.UTF8), out_ne, out_ne_ex, out_ne_type, + out_ne_subtype, out_ne_name) def parse_swener_output(sentences: list, token: Annotation, output, out_ne: Output, out_ne_ex: Output, @@ -103,7 +103,7 @@ def parse_swener_output(sentences: list, token: Annotation, output, out_ne: Outp try: root = etree.fromstring(xml_sent) except: - log.warning("Error parsing sentence. Skipping.") + logger.warning("Error parsing sentence. Skipping.") continue # Init token counter; needed to get start_pos and end_pos @@ -124,7 +124,7 @@ def parse_swener_output(sentences: list, token: Annotation, output, out_ne: Outp if child.tag != "sroot": if start_i < previous_end: pass - # log.warning("Overlapping NE elements found; discarding one.") + # logger.warning("Overlapping NE elements found; discarding one.") else: end_pos = token_spans[sent[i - 1]][1] previous_end = i @@ -140,7 +140,7 @@ def parse_swener_output(sentences: list, token: Annotation, output, out_ne: Outp if (child.tail and child.tail.strip() and not child.tail[0] == " ") or ( not child.tail and count < len(children) - 1): i -= 1 - # log.warning("Split token returned by name tagger.") + # logger.warning("Split token returned by name tagger.") # If current child has text in the tail, increase token counter if child.tail and child.tail.strip(): @@ -151,7 +151,7 @@ def parse_swener_output(sentences: list, token: Annotation, output, out_ne: Outp # The next NE would start in the middle of a token, so decrease the counter by 1 i -= 1 except IndexError: - log.warning("Error parsing sentence. Skipping.") + logger.warning("Error parsing sentence. 
Skipping.") continue # Write annotations diff --git a/sparv/modules/text_import/text_import.py b/sparv/modules/text_import/text_import.py index 94dfbf3f..68a827e1 100644 --- a/sparv/modules/text_import/text_import.py +++ b/sparv/modules/text_import/text_import.py @@ -2,19 +2,18 @@ import unicodedata -from sparv import importer, util -from sparv.util.classes import Config, Document, Output, Source, SourceStructure, Text +from sparv.api import Config, SourceFilename, Output, Source, SourceStructure, Text, importer, util -@importer("TXT import", file_extension="txt", outputs=["text"], document_annotation="text", config=[ +@importer("TXT import", file_extension="txt", outputs=["text"], text_annotation="text", config=[ Config("text_import.prefix", "", description="Optional prefix to add to annotation names."), - Config("text_import.encoding", util.UTF8, description="Encoding of source document. Defaults to UTF-8."), + Config("text_import.encoding", util.constants.UTF8, description="Encoding of source file. Defaults to UTF-8."), Config("text_import.keep_control_chars", False, description="Set to True if control characters should not be " "removed from the text."), Config("text_import.normalize", "NFC", description="Normalize input using any of the following forms: " "'NFC', 'NFKC', 'NFD', and 'NFKD'.") ]) -def parse(doc: Document = Document(), +def parse(source_file: SourceFilename = SourceFilename(), source_dir: Source = Source(), prefix: str = Config("text_import.prefix"), encoding: str = Config("text_import.encoding"), @@ -23,7 +22,7 @@ def parse(doc: Document = Document(), """Parse plain text file as input to the Sparv Pipeline. Args: - doc: The document name. + source_file: The name of the source file. source_dir: The source directory. prefix: Optional prefix for output annotation. encoding: Encoding of source file. Default is UTF-8. @@ -31,18 +30,17 @@ def parse(doc: Document = Document(), normalize: Normalize input text using any of the following forms: 'NFC', 'NFKC', 'NFD', and 'NFKD'. 'NFC' is used by default. """ - source_file = source_dir.get_path(doc, ".txt") - text = source_file.read_text(encoding=encoding) + text = source_dir.get_path(source_file, ".txt").read_text(encoding=encoding) if not keep_control_chars: - text = util.remove_control_characters(text) + text = util.misc.remove_control_characters(text) if normalize: text = unicodedata.normalize(normalize, text) - Text(doc).write(text) + Text(source_file).write(text) # Make up a text annotation surrounding the whole file text_annotation = "{}.text".format(prefix) if prefix else "text" - Output(text_annotation, doc=doc).write([(0, len(text))]) - SourceStructure(doc).write([text_annotation]) + Output(text_annotation, source_file=source_file).write([(0, len(text))]) + SourceStructure(source_file).write([text_annotation]) diff --git a/sparv/modules/treetagger/treetagger.py b/sparv/modules/treetagger/treetagger.py index dacc14df..9da1e504 100644 --- a/sparv/modules/treetagger/treetagger.py +++ b/sparv/modules/treetagger/treetagger.py @@ -6,12 +6,12 @@ You do not need to download any parameter files as Sparv will download these for you when necessary. 
""" -import logging -import sparv.util as util -from sparv import Annotation, Binary, Config, Language, Model, ModelOutput, Output, annotator, modelbuilder +from sparv.api import (Annotation, Binary, Config, Language, Model, ModelOutput, Output, annotator, get_logger, + modelbuilder, util) +from sparv.api.util.tagsets import pos_to_upos -log = logging.getLogger(__name__) +logger = get_logger(__name__) SENT_SEP = "\n\n" TOK_SEP = "\n" @@ -53,7 +53,7 @@ def annotate(lang: Language = Language(), out_baseform: Output = Output(":treetagger.baseform", description="Baseforms from TreeTagger"), word: Annotation = Annotation(""), sentence: Annotation = Annotation(""), - encoding: str = util.UTF8): + encoding: str = util.constants.UTF8): """POS/MSD tag and lemmatize using TreeTagger.""" sentences, _orphans = sentence.get_children(word) word_annotation = list(word.read()) @@ -62,7 +62,7 @@ def annotate(lang: Language = Language(), args = ["-token", "-lemma", "-no-unknown", "-eos-tag", "", model.path] stdout, stderr = util.system.call_binary(tt_binary, args, stdin, encoding=encoding) - log.debug("Message from TreeTagger:\n%s", stderr) + logger.debug("Message from TreeTagger:\n%s", stderr) # Write pos and upos annotations. out_upos_annotation = word.create_empty_attribute() @@ -73,10 +73,10 @@ def annotate(lang: Language = Language(), if len(cols) >= TAG_COLUMN + 1: tag = cols[TAG_COLUMN] else: - log.warning(f"TreeTagger failed to produce a POS tag for token '{cols[0]}'!") + logger.warning(f"TreeTagger failed to produce a POS tag for token '{cols[0]}'!") tag = "" out_pos_annotation[token_id] = tag - out_upos_annotation[token_id] = util.tagsets.pos_to_upos(tag, lang, TAG_SETS.get(lang)) + out_upos_annotation[token_id] = pos_to_upos(tag, lang, TAG_SETS.get(lang)) out_pos.write(out_pos_annotation) out_upos.write(out_upos_annotation) @@ -88,8 +88,8 @@ def annotate(lang: Language = Language(), if len(cols) >= LEM_COLUMN + 1: lem = cols[LEM_COLUMN] else: - log.warning(f"TreeTagger failed to produce a baseform for token '{cols[0]}'! " - "Using the wordform as baseform.") + logger.warning(f"TreeTagger failed to produce a baseform for token '{cols[0]}'! " + "Using the wordform as baseform.") lem = cols[0] out_lemma_annotation[token_id] = lem out_baseform.write(out_lemma_annotation) diff --git a/sparv/modules/version_info/__init__.py b/sparv/modules/version_info/__init__.py new file mode 100644 index 00000000..0cbbb2fe --- /dev/null +++ b/sparv/modules/version_info/__init__.py @@ -0,0 +1,3 @@ +"""Summarizes and exports annotation version information.""" + +from . 
import version_info diff --git a/sparv/modules/version_info/version_info.py b/sparv/modules/version_info/version_info.py new file mode 100644 index 00000000..0d0e7113 --- /dev/null +++ b/sparv/modules/version_info/version_info.py @@ -0,0 +1,26 @@ +"""Summarizes and exports annotation version information.""" + + +from datetime import datetime + +import yaml + +from sparv import __version__ as sparv_version +from sparv.api import Export, exporter, get_logger + +logger = get_logger(__name__) + + +@exporter("YAML file containing annotation version info") +def yaml_export(out: Export = Export("version_info/info_[metadata.id].yaml")): + """Create YAML file containing annotation version information.""" + info_dict = { + "Sparv version": sparv_version, + "Annotation date": datetime.today().strftime("%Y-%m-%d") + } + + # Write YAML file + logger.info("Exported: %s", out) + content = yaml.dump(info_dict, allow_unicode=True, indent=4, sort_keys=False, default_flow_style=False) + with open(out, "w", encoding="utf-8") as o: + o.writelines(content) diff --git a/sparv/modules/vw_topic_modelling/vw_topic_modelling.py b/sparv/modules/vw_topic_modelling/vw_topic_modelling.py deleted file mode 100644 index 50eca91c..00000000 --- a/sparv/modules/vw_topic_modelling/vw_topic_modelling.py +++ /dev/null @@ -1,546 +0,0 @@ -"""Topic modelling with vowpal wabbit. - -NB: Not fully adapted to Sparv v4 yet! - -Public functions: -- train -- predict -- word_weights -- make_testdata - -Docstring testing: -> import doctest -> doctest.testmod(verbose=False) -""" - -import itertools as it -import json -import logging -import sys -import tempfile -from collections import Counter, OrderedDict, defaultdict, namedtuple - -import pylibvw -from vowpalwabbit import pyvw - -import sparv.util as util -from sparv import Annotation, Config, Document, Model, ModelOutput, Output, annotator, modelbuilder - -log = logging.getLogger(__name__) - - -@annotator("Report the weight for each label for each word", config=[ - Config("vw_topic_modelling.model", default="vw_topic_modelling/?.model")]) -def word_weights(doc: str = Document, - model: str = Model("[vw_topic_modelling.model]"), - word: str = Annotation(""), - pos: str = Annotation(""), - out: str = Output(":vw_topic_modelling:label_weights", description="Label weights per word")): - """ - Report the weight for each label for each word. - - Both model and model.json must exist. See --train and --predict. 
- """ - m_json = json.load(open(model + ".json")) - index_to_label = m_json["index_to_label"] - min_word_length = int(m_json["min_word_length"] or "0") - banned_pos = (m_json["banned_pos"] or "").split() - words = list(util.read_annotation(doc, word)) - poss = util.read_annotation(doc, pos) if pos else [] - data = (Example(None, vw_normalize(word)) - for n, word in enumerate(words) - if len(word) >= min_word_length - if not pos or poss[n] not in banned_pos) - weights = defaultdict(list) - with tempfile.NamedTemporaryFile() as tmp: - args = ["--initial_regressor", model, "--invert_hash", tmp.name] - for _ in vw_predict(args, data): - pass - for line in open(tmp.name, "r").readlines(): - # allmänna[1]:14342849:0.0139527 - colons = line.split(":") - if len(colons) == 3: - word, _hash, weight = colons - if word[-1] == "]": - bracesplit = word.rsplit("[", 1) - else: - bracesplit = [] - if len(bracesplit) == 2: - word, index = bracesplit - n = int(index[:-1]) + 1 - else: - n = 1 - weights[word].append(index_to_label[str(n)] + ":" + weight) - ws = ( - util.cwbset(weights[vw_normalize(word)]) - for word in words - if vw_normalize(word) in weights - ) - util.write_annotation(doc, out, ws) - - -@annotator("Predict a structural attribute", config=[ - Config("vw_topic_modelling.model", default="vw_topic_modelling/?.model"), - Config("vw_topic_modelling.modeljson", default="vw_topic_modelling/?.model.json")]) -def predict(doc: str = Document, - model: str = Model("[vw_topic_modelling.model]"), - modeljson: str = Model("[vw_topic_modelling.modeljson]"), - order, - struct, - parent: str = Annotation("{chunk}"), - word: str = Annotation(""), - out: str = Output("{chunk}:vw_topic_modelling.prediction", description="Predicted attributes"), - pos: str = Annotation(""), - raw: bool = False): - """Predict a structural attribute.""" - raw = raw == "true" - - m_json = json.load(open(modeljson)) - - data = ( - Example(None, text.words, text.span) - for text in texts([(order, struct, parent, word, pos)], - map_label=lambda _: "?", - min_word_length=m_json["min_word_length"], - banned_pos=m_json["banned_pos"]) - ) - - index_to_label = m_json["index_to_label"] - - args = ["--initial_regressor", model] - - if raw: - predictions = ( - util.cwbset(index_to_label[str(s)] + ":" + str(v) for s, v in ss) - for ss, _span in vw_predict(args, data, raw=True) - ) - else: - predictions = ( - index_to_label[str(s)] - for s, _span in vw_predict(args, data) - ) - - util.write_annotation(doc, out, predictions) - - -def _make_label_map(label_map_json): - if label_map_json: - with open(label_map_json, "r") as fp: - d = json.load(fp) - return lambda label: d.get(label, None) - else: - return lambda label: label - - -def _take(bound, xs): - if bound: - i = 0 - for x in xs: - i += 1 - if i >= bound: - break - yield x - else: - for x in xs: - yield x - - -@modelbuilder("Predict a structural attribute") -def train(doc: str = Document, - file_list, - modelfile: str = ModelOutput("vw_topic_modelling/?.model"), - jsonfile: str = ModelOutput("vw_topic_modelling/?.model.json"), - dry_run_labels: bool = False, - label_map_json=None, - bound=None, - min_word_length: int = 0, - banned_pos=""): - """ - Train a model using vowpal wabbit. - - Creates outprefix.model and outprefix.model.json. - - file_list is a file with 5*N lines of annotation filenames: - first N copies of: order, - then N copies of: annotation_struct, - then N copies of: parent, - then N copies of: word. - then N copies of: pos. 
- """ - - with open(file_list, "r") as fp: - files = fp.read().split() - order_struct_parent_word_pos = interleave(files, 5) - map_label = _make_label_map(label_map_json) - min_word_length = int(min_word_length) if min_word_length else 0 - - # Look at the structs annotations to get the labels and their distribution: - _, structs, _, _, _ = list(zip(*order_struct_parent_word_pos)) - # TODO: skip labels with very low occurrences - labels = Counter(map_label(label) - for annotfile in structs - for label in util.read_annotation(doc, annotfile) - if map_label(label)) - N = sum(labels.values()) - if bound: - bound = int(bound) - N = min(bound, N) - k = len(labels) - label_to_index = {} - index_to_label = {} - answer = {} - for i, (label, occurences) in enumerate(iter(list(labels.items())), start=1): - w = float(N) / occurences - log.info(f"{label}: occurences: {occurences}, weight: {w}") - answer[label] = ("%s:%s | " % (i, w)).encode() - label_to_index[label] = i - index_to_label[i] = label - - if dry_run_labels == "true": - from pprint import pprint - pprint(labels.most_common()) - print(json.dumps({l: l for l in labels}, indent=2)) - log.info(f"texts: {N}, labels: {k}") - sys.exit() - - def itertexts(): - return _take(bound, texts(order_struct_parent_word_pos, map_label, min_word_length, banned_pos)) - - # Train model - args = ["--oaa", str(k), - "--passes", "10", - "--cache", "--kill_cache", - "--bit_precision", "24", - "--final_regressor", modelfile] - data = ( - Example(answer[text.label], text.words) - for text in every(10, itertexts(), invert=True) - ) - vw_train(args, data) - - # Performance evaluation - args = ["--initial_regressor", modelfile] - target = [] - - def data_iterator(): - for text in every(10, itertexts()): - target.append(label_to_index[text.label]) - yield Example(None, text.words) - - predicted = [int(s) for s, _tag in vw_predict(args, data_iterator())] - N_eval = len(predicted) - - assert len(predicted) == len(target) - - order = list(range(1, 1 + k)) - info = dict( - min_word_length=min_word_length, - banned_pos=banned_pos, - labels=[index_to_label[i] for i in order], - index_to_label=index_to_label, - label_to_index=label_to_index, - N_train=N - N_eval, - N_eval=N_eval, - stats={index_to_label[i]: p.as_dict() - for i, p in - list(multiclass_performance(target, predicted).items())}, - confusion_matrix=confusion_matrix(target, predicted, order)) - with open(jsonfile, "w") as f: - json.dump(info, f, sort_keys=True, indent=2) - log.info(f"Wrote {jsonfile}") - - -def make_testdata(corpus_desc="abcd abcd dcba cbad", docs=1000): - """Write amazing test data on stdout.""" - import random - n_docs = int(docs) - # make = lambda s: (s, triangulate(s)) - corpora = [(s, tuple(triangulate(s))) for s in corpus_desc.split()] - for _ in range(n_docs): - _corpus, freq = random.choice(corpora) - print("") - n_words = random.randint(12, 39) - print(" ".join(random.choice(freq) for _ in range(n_words))) - print("") - - -Text = namedtuple("Text", "label span words") - - -def texts(order_struct_parent_word_pos, map_label, min_word_length, banned_pos): - """ - Get all the texts from an iterator of 5-tuples of annotations. - - The annotations contain token order, the structural attribute (like a label), - its parenting of the words, and the words themselves. - """ - X = 0 - S = 0 - banned_pos = (banned_pos or "").split() - for order, struct, parent, word, pos in order_struct_parent_word_pos: - x = 0 - s = 0 - log.info(f"Processing {struct} {parent} {word}") - # TODO: needs re-writing! 
cwb.tokens_and_vrt and cwb.vrt_iterate don't exist anymore - tokens, vrt = cwb.tokens_and_vrt(order, [(struct, parent)], [word] + ([pos] if pos else [])) - for (label, span), cols in cwb.vrt_iterate(tokens, vrt): - words = b" ".join(vw_normalize(col[0]) - for col in cols - if len(col[0]) >= min_word_length - if not pos or col[1] not in banned_pos) - mapped_label = map_label(label) - if mapped_label: - x += 1 - yield Text(mapped_label, span, words) - else: - s += 1 - log.info(f"Texts from {struct}: {x} (skipped: {s})") - X += x - S += s - log.info(f"Total texts: {X} (skipped: {S})") - - -Example = namedtuple("Example", "label features tag") -Example.__new__.__defaults__ = (None,) - - -def vw_train(args, data): - """ - Train using vowpal wabbit on an iterator of Examples. - - >>> with tempfile.NamedTemporaryFile() as tmp: - ... _ = vw_train(['--final_regressor', tmp.name, '--oaa', '2', '--quiet'], - ... [Example('1', 'a b c'), Example('2', 'c d e')]) - ... list(vw_predict(['--initial_regressor', tmp.name, '--quiet'], - ... [Example(None, 'a b c', 'abc_tag'), - ... Example(None, 'c d e', 'cde_tag')])) - ... list(vw_predict(['--initial_regressor', tmp.name, '--quiet'], - ... [Example(None, 'a b c', 'abc_tag'), - ... Example(None, 'c d e', 'cde_tag')], - ... raw=True)) - ... # doctest: +NORMALIZE_WHITESPACE - [(1, 'abc_tag'), (2, 'cde_tag')] - [(((1, 0.343459), (2, -0.343459)), 'abc_tag'), - (((1, -0.334946), (2, 0.334946)), 'cde_tag')] - """ - tuple(_vw_run(args, data, False)) # force evaluation using tuple - - -def vw_predict(args, data, raw=False): - """ - Predict using vowpal wabbit on an iterator of Examples. - - Argument list is adjusted for testing and getting predictions as a stream. - """ - if raw: - with tempfile.NamedTemporaryFile() as tmp: - more_args = ["--testonly", "-r", tmp.name] - tags = [] - for _, tag in _vw_run(args + more_args, data, True): - tags.append(tag) - lines = open(tmp.name, "r").read().rstrip().split("\n") - for line, tag in zip(lines, tags): - def pred(label, raw_pred): - return (int(label), float(raw_pred)) - preds = tuple(pred(*p.split(":")) for p in line.split()) - yield preds, tag - else: - more_args = ["--testonly"] - for x in _vw_run(args + more_args, data, True): - yield x - - -def _vw_run(args, data, predict_and_yield): - vw = pyvw.vw(" ".join(args)) - log.info("Running: vw " + " ".join(args)) - for d in data: - ex = vw.example((d.label or b"") + b" | " + d.features + b"\n") - if predict_and_yield: - yield vw.predict(ex, pylibvw.vw.lMulticlass), d.tag - else: - vw.learn(ex) - vw.finish() - - -def vw_normalize(s): - """ - Normalize a string so it can be used as a VW feature. - - >>> print(vw_normalize(u'VW | abcåäö:123').decode('utf-8')) - vwSISabcåäöCXXX - """ - return s.lower().translate(_escape_table).encode("utf-8") - - -# Replace digits with X -_escape_symbols = [(str(x), u"X") for x in range(10)] -# Vowpal Wabbit needs these to be escaped: -_escape_symbols += [(u" ", u"S"), # separates features - (u"|", u"I"), # separates namespaces - (u":", u"C")] # separates feature and its value - -_escape_table = {ord(k): v for k, v in _escape_symbols} - - -################################################################################ -# Performance -################################################################################ - - -class Performance: - """Class for calculating performance measures.""" - - def __init__(self, TP, TN, FP, FN): - """ - Calculate performance measures from true and false positives and negatives. 
- - https://en.wikipedia.org/wiki/Precision_and_recall - """ - n = TP + TN + FP + FN - self.ACC = self.div(TP + TN, n) - self.PRE = self.div(TP, TP + FP) - self.REC = self.div(TP, TP + FN) - self.PRF = self.harmonic_mean(self.PRE, self.REC) - - def div(self, x, y): - """Divide x/y, return 0 if divisor is 0.""" - if y == 0: - return 0.0 - return float(x) / float(y) - - def harmonic_mean(self, x, y): - """Calculate the harmonic mean.""" - return self.div(2 * x * y, x + y) - - def as_dict(self): - """Store performance statistics in a dictionary.""" - keys = "ACC PRE REC PRF".split() - return OrderedDict((k, self.__dict__[k]) for k in keys) - - def __repr__(self): - """Return a string representation of the performance measures.""" - perf = ", ".join("%s=%.3f" % kv for kv in list(self.as_dict().items())) - return "Performance(" + perf + ")" - - -def binary_performance(target, predicted): - """ - Calculate standard performance measures on the predictions of a binary classifier. - - >>> p = binary_performance([1,1,1,1,0,0,0,0], - ... [1,1,1,0,0,1,1,0]) - >>> p.ACC == 5/8.0 - True - >>> p.PRE == 3/5.0 - True - >>> p.REC == 3/4.0 - True - """ - d = Counter(list(zip(target, predicted))) - return Performance(TP=d[1, 1], - TN=d[0, 0], - FP=d[0, 1], - FN=d[1, 0]) - - -def multiclass_performance(target, predicted): - """ - >>> multiclass_performance([1,1,1,1,2,2,3,3], - ... [1,1,1,2,2,1,1,3]) - ... # doctest: +NORMALIZE_WHITESPACE - {1: Performance(ACC=0.625, PRE=0.600, REC=0.750, PRF=0.667), - 2: Performance(ACC=0.750, PRE=0.500, REC=0.500, PRF=0.500), - 3: Performance(ACC=0.875, PRE=1.000, REC=0.500, PRF=0.667)} - """ - return { - i: binary_performance((t == i for t in target), (p == i for p in predicted)) - for i in nub(it.chain(target, predicted)) - } - - -def confusion_matrix(target, predicted, order): - """ - Calculate confusion matrix with the target on the y-axis and rows on the x-axis. - - https://en.wikipedia.org/wiki/Confusion_matrix - - >>> confusion_matrix([1,1,1,1,2,2,3,3], - ... [1,1,1,2,2,1,1,3], - ... order=[1,2,3]) # doctest: +NORMALIZE_WHITESPACE - [[3, 1, 0], - [1, 1, 0], - [1, 0, 1]] - """ - matrix = Counter(list(zip(target, predicted))) - return [[matrix.get((t, p), 0) for p in order] for t in order] - - -################################################################################ -# Auxiliaries -################################################################################ - - -def triangulate(xs): - """ - All initial segments of xs, concatenated. - - >>> ''.join(triangulate('abc')) - 'aababc' - >>> ''.join(triangulate('1234')) - '1121231234' - >>> list(triangulate([])) - [] - """ - for i, _ in enumerate(xs): - for x in xs[:i + 1]: - yield x - - -def interleave(xs, k): - """ - Put the elements of xs in k-tuples, with the one same distance between consecutive elements in every tuple. - - Does not support infinite xs. - - >>> interleave('abc123', 2) - [('a', '1'), ('b', '2'), ('c', '3')] - >>> interleave('abcdABCD1234', 3) - [('a', 'A', '1'), ('b', 'B', '2'), ('c', 'C', '3'), ('d', 'D', '4')] - """ - ts = [[] for _ in range(len(xs) // k)] # Changed to floor division in python3 - ts_iter = it.cycle(ts) - for x in xs: - next(ts_iter).append(x) - return [tuple(t) for t in ts] - - -def every(sep, generator, invert=False): - """ - Iterate over the generator, returning every sep element of it. 
- - >>> list(every(3, 'A B C D E F'.split())) - ['C', 'F'] - >>> ' '.join(every(3, 'A B C D E F'.split(), True)) - 'A B D E' - """ - for i, x in enumerate(generator): - if ((i + 1) % sep == 0) != invert: - yield x - - -def nub(xs): - """ - All unique elements from xs in order. - - >>> ''.join(nub('apabepa')) - 'apbe' - >>> import random - >>> spec = lambda xs: sorted(nub(xs)) == sorted(Counter(xs).keys()) - >>> all(spec(random.sample(range(200), i)) for i in range(100)) - True - """ - seen = set() - for x in xs: - if x not in seen: - seen.add(x) - yield x diff --git a/sparv/modules/word_align/word_align.py b/sparv/modules/word_align/word_align.py index fa0c718e..47fe0b74 100644 --- a/sparv/modules/word_align/word_align.py +++ b/sparv/modules/word_align/word_align.py @@ -1,7 +1,6 @@ """NB: Not adapted to Sparv v4 yet!""" -# -*- coding: utf-8 -*- -import sparv.util as util +from sparv.api import util def align_texts(word1, word2, linktok1, linktok2, link1, link2, linkref2, out_wordlink, out_sentences, outindex1, outindex2, delimiter="|", affix="|"): diff --git a/sparv/modules/wsd/wsd.py b/sparv/modules/wsd/wsd.py index 99bf6de5..6ab0cf1b 100644 --- a/sparv/modules/wsd/wsd.py +++ b/sparv/modules/wsd/wsd.py @@ -1,11 +1,8 @@ """Word sense disambiguation based on SALDO annotation.""" -import logging +from sparv.api import Annotation, Binary, Config, Model, ModelOutput, Output, annotator, modelbuilder, get_logger, util -import sparv.util as util -from sparv import Annotation, Binary, Config, Model, ModelOutput, Output, annotator, modelbuilder - -log = logging.getLogger(__name__) +logger = get_logger(__name__) SENT_SEP = "$SENT$" @@ -15,8 +12,8 @@ Config("wsd.context_model", default="wsd/lem_cbow0_s512_w10_NEW2_ctx.bin", description="Path to context model"), Config("wsd.default_prob", -1.0, description="Default value for unanalyzed senses"), Config("wsd.jar", default="wsd/saldowsd.jar", description="Path name of the executable .jar file"), - Config("wsd.prob_format", util.SCORESEP + "%.3f", description="Format string for how to print the " - "sense probability") + Config("wsd.prob_format", util.constants.SCORESEP + "%.3f", description="Format string for how to print the " + "sense probability") ]) def annotate(wsdjar: Binary = Binary("[wsd.jar]"), sense_model: Model = Model("[wsd.sense_model]"), @@ -25,14 +22,14 @@ def annotate(wsdjar: Binary = Binary("[wsd.jar]"), description="Sense disambiguated SALDO identifiers"), sentence: Annotation = Annotation(""), word: Annotation = Annotation(""), - ref: Annotation = Annotation(":misc.number_rel_"), + ref: Annotation = Annotation(""), lemgram: Annotation = Annotation(":saldo.lemgram"), saldo: Annotation = Annotation(":saldo.sense"), pos: Annotation = Annotation(""), token: Annotation = Annotation(""), prob_format: str = Config("wsd.prob_format"), default_prob: float = Config("wsd.default_prob"), - encoding: str = util.UTF8): + encoding: str = util.constants.UTF8): """Run the word sense disambiguation tool (saldowsd.jar) to add probabilities to the saldo annotation. Unanalyzed senses (e.g. multiword expressions) receive the probability value given by default_prob. 
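# Illustrative sketch (not taken from the Sparv codebase): how the disambiguated
# senses end up formatted, assuming ":" as score separator and "|" as set
# delimiter/affix (stand-ins for util.constants.SCORESEP/DELIM/AFFIX) and a
# simplified cwbset() instead of util.misc.cwbset. The sense names and
# probabilities are invented for illustration.
SCORESEP = ":"        # assumed value of util.constants.SCORESEP
DELIM = AFFIX = "|"   # assumed values of util.constants.DELIM / AFFIX


def cwbset(values):
    """Simplified stand-in for util.misc.cwbset: build a '|a|b|'-style set string."""
    return AFFIX + DELIM.join(values) + AFFIX if values else AFFIX


prob_format = SCORESEP + "%.3f"
senses = [("kasta..1", 0.917), ("kasta..2", 0.083)]  # (sense, probability), already sorted by probability
formatted = [sense + prob_format % prob for sense, prob in senses]
print(cwbset(formatted))  # -> |kasta..1:0.917|kasta..2:0.083|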
@@ -55,6 +52,8 @@ def annotate(wsdjar: Binary = Binary("[wsd.jar]"), sentences, orphans = sentence.get_children(token) sentences.append(orphans) + # Remove empty sentences + sentences = list(s for s in sentences if s) # Start WSD process process = wsd_start(wsdjar, sense_model.path, context_model.path, encoding) @@ -71,7 +70,7 @@ def annotate(wsdjar: Binary = Binary("[wsd.jar]"), # Problem is that regular messages "Reading sense vectors.." are also piped to stderr. if len(stderr) > 52: util.system.kill_process(process) - log.error(str(stderr)) + logger.error(str(stderr)) return if encoding: @@ -122,8 +121,8 @@ def build_input(sentences, word_annotation, ref_annotation, lemgram_annotation, word = word_annotation[token_index] ref = ref_annotation[token_index] pos = pos_annotation[token_index].lower() - saldo = saldo_annotation[token_index].strip(util.AFFIX) if saldo_annotation[ - token_index] != util.AFFIX else "_" + saldo = saldo_annotation[token_index].strip(util.constants.AFFIX) if saldo_annotation[ + token_index] != util.constants.AFFIX else "_" if "_" in saldo and len(saldo) > 1: mwe = True @@ -156,7 +155,7 @@ def process_output(word: Annotation, out: Output, stdout, in_sentences, saldo_an out_prob = out_tok.split("\t")[6] out_prob = [i for i in out_prob.split("|") if i != "_"] out_meanings = [i for i in out_tok.split("\t")[5].split("|") if i != "_"] - saldo = [i for i in saldo_annotation[in_tok].strip(util.AFFIX).split(util.DELIM) if i] + saldo = [i for i in saldo_annotation[in_tok].strip(util.constants.AFFIX).split(util.constants.DELIM) if i] new_saldo = [] if out_prob: @@ -173,15 +172,15 @@ def process_output(word: Annotation, out: Output, stdout, in_sentences, saldo_an new_saldo.sort(key=lambda x: (-x[1], x[0])) # Format probability according to prob_format new_saldo = [saldo + prob_format % prob if prob_format else saldo for saldo, prob in new_saldo] - out_annotation[in_tok] = util.cwbset(new_saldo) + out_annotation[in_tok] = util.misc.cwbset(new_saldo) out.write(out_annotation) def make_lemgram(lemgram, word, pos): """Construct lemgram and simple_lemgram format.""" - lemgram = lemgram.strip(util.AFFIX) if lemgram != util.AFFIX else "_" - simple_lemgram = util.DELIM.join(set((lem[:lem.rfind(".")] for lem in lemgram.split(util.DELIM)))) + lemgram = lemgram.strip(util.constants.AFFIX) if lemgram != util.constants.AFFIX else "_" + simple_lemgram = util.constants.DELIM.join(set((lem[:lem.rfind(".")] for lem in lemgram.split(util.constants.DELIM)))) # Fix simple lemgram for tokens without lemgram (word + pos) if not simple_lemgram: @@ -191,9 +190,9 @@ def make_lemgram(lemgram, word, pos): def remove_mwe(annotation): """For MWEs: strip unnecessary information.""" - annotation = annotation.split(util.DELIM) + annotation = annotation.split(util.constants.DELIM) annotation = [i for i in annotation if "_" not in i] if annotation: - return util.DELIM.join(annotation) + return util.constants.DELIM.join(annotation) else: return "_" diff --git a/sparv/modules/xml_export/preserved_format.py b/sparv/modules/xml_export/preserved_format.py index 54692156..db04c3e7 100644 --- a/sparv/modules/xml_export/preserved_format.py +++ b/sparv/modules/xml_export/preserved_format.py @@ -1,24 +1,23 @@ """Export annotated corpus data to format-preserved xml.""" -import logging import os import xml.etree.ElementTree as etree -import sparv.util as util -from sparv import (AnnotationData, Config, Document, Export, ExportAnnotations, SourceAnnotations, Text, exporter) +from sparv.api import (AnnotationData, Config, 
Export, ExportAnnotations, Namespaces, SourceAnnotations, SourceFilename, + SparvErrorMessage, Text, exporter, get_logger, util) from . import xml_utils -log = logging.getLogger(__name__) +logger = get_logger(__name__) @exporter("XML export preserving whitespaces from source file", config=[ - Config("xml_export.filename_formatted", default="{doc}_export.xml", - description="Filename pattern for resulting XML files, with '{doc}' representing the source name.") + Config("xml_export.filename_formatted", default="{file}_export.xml", + description="Filename pattern for resulting XML files, with '{file}' representing the source name.") ]) -def preserved_format(doc: Document = Document(), +def preserved_format(source_file: SourceFilename = SourceFilename(), text: Text = Text(), - docid: AnnotationData = AnnotationData(""), - out: Export = Export("xml_preserved_format/[xml_export.filename_formatted]"), + fileid: AnnotationData = AnnotationData(""), + out: Export = Export("xml_export.preserved_format/[xml_export.filename_formatted]"), annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"), source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"), header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"), @@ -29,14 +28,14 @@ def preserved_format(doc: Document = Document(), """Export annotations to XML in export_dir and keep whitespaces and indentation from original file. Args: - doc: Name of the original document. + source_file: Name of the source file. text: The corpus text. - docid: Annotation with document IDs. + fileid: Annotation with file IDs. out: Path and filename pattern for resulting file. annotations: List of elements:attributes (annotations) to include. - source_annotations: List of elements:attributes from the original document + source_annotations: List of elements:attributes from the source file to be kept. If not specified, everything will be kept. - header_annotations: List of header elements from the original document to include + header_annotations: List of header elements from the source file to include in the export. If not specified, all headers will be kept. remove_namespaces: Whether to remove module "namespaces" from element and attribute names. Disabled by default. @@ -47,26 +46,31 @@ def preserved_format(doc: Document = Document(), # Create export dir os.makedirs(os.path.dirname(out), exist_ok=True) - # Read corpus text and document ID + # Read corpus text, file ID and XML namespaces corpus_text = text.read() - docid = docid.read() + fileid = fileid.read() + xml_namespaces = Namespaces(source_file).read() # Get annotation spans, annotations list etc. 
- annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc, - remove_namespaces=remove_namespaces, - sparv_namespace=sparv_namespace, - source_namespace=source_namespace) - h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc) + annotation_list, _, export_names = util.export.get_annotation_names(annotations, source_annotations, source_file=source_file, + remove_namespaces=remove_namespaces, + sparv_namespace=sparv_namespace, + source_namespace=source_namespace, + xml_mode=True) + h_annotations, h_export_names = util.export.get_header_names(header_annotations, source_file=source_file) export_names.update(h_export_names) - span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations, doc=doc, - flatten=False, split_overlaps=True) + span_positions, annotation_dict = util.export.gather_annotations(annotation_list, export_names, h_annotations, + source_file=source_file, flatten=False, split_overlaps=True) sorted_positions = [(pos, span[0], span[1]) for pos, spans in sorted(span_positions.items()) for span in spans] # Root tag sanity check - if not xml_utils.valid_root(sorted_positions[0], sorted_positions[-1]): - raise util.SparvErrorMessage("Root tag is missing! If you have manually specified which elements to include, " - "make sure to include an element that encloses all other included elements and " - "text content.") + if not xml_utils.valid_root(sorted_positions[0], sorted_positions[-1], true_root=True): + raise SparvErrorMessage("Root tag is missing! If you have manually specified which elements to include, " + "make sure to include an element that encloses all other included elements and " + "text content (including whitespace characters such as newlines).") + + # Register XML namespaces + xml_utils.register_namespaces(xml_namespaces) # Create root node root_span = sorted_positions[0][2] @@ -86,7 +90,7 @@ def preserved_format(doc: Document = Document(), # Handle headers if span.is_header: - header = annotation_dict[span.name][util.HEADER_CONTENTS][span.index] + header = annotation_dict[span.name][util.constants.HEADER_CONTENTS][span.index] header_xml = etree.fromstring(header) header_xml.tag = span.export # Rename element if needed span.node = header_xml @@ -99,10 +103,10 @@ def preserved_format(doc: Document = Document(), include_empty_attributes) if span.overlap_id: if sparv_namespace: - span.node.set(f"{sparv_namespace}.{util.OVERLAP_ATTR}", f"{docid}-{span.overlap_id}") + span.node.set(f"{sparv_namespace}.{util.constants.OVERLAP_ATTR}", f"{fileid}-{span.overlap_id}") else: - span.node.set(f"{util.SPARV_DEFAULT_NAMESPACE}.{util.OVERLAP_ATTR}", - f"{docid}-{span.overlap_id}") + span.node.set(f"{util.constants.SPARV_DEFAULT_NAMESPACE}.{util.constants.OVERLAP_ATTR}", + f"{fileid}-{span.overlap_id}") node_stack.append(span) # Set text if there should be any between this node and the next one @@ -133,4 +137,4 @@ def preserved_format(doc: Document = Document(), # Write xml to file etree.ElementTree(root_span.node).write(out, encoding="unicode", method="xml", xml_declaration=True) - log.info("Exported: %s", out) + logger.info("Exported: %s", out) diff --git a/sparv/modules/xml_export/pretty.py b/sparv/modules/xml_export/pretty.py index 20bc887c..aaa49ab1 100644 --- a/sparv/modules/xml_export/pretty.py +++ b/sparv/modules/xml_export/pretty.py @@ -1,19 +1,19 @@ """Export annotated corpus data to pretty-printed xml.""" -import logging import os -import sparv.util as util -from 
sparv import (AllDocuments, Annotation, AnnotationData, Config, Corpus, Document, Export, ExportAnnotations, - ExportInput, OutputCommonData, SourceAnnotations, exporter, installer) +from sparv.api import (AllSourceFilenames, Annotation, AnnotationData, Config, Corpus, Export, ExportAnnotations, + ExportInput, Namespaces, OutputCommonData, SourceAnnotations, SourceFilename, exporter, + get_logger, installer, util) + from . import xml_utils -log = logging.getLogger(__name__) +logger = get_logger(__name__) @exporter("XML export with one token element per line", config=[ - Config("xml_export.filename", default="{doc}_export.xml", - description="Filename pattern for resulting XML files, with '{doc}' representing the source name."), + Config("xml_export.filename", default="{file}_export.xml", + description="Filename pattern for resulting XML files, with '{file}' representing the source name."), Config("xml_export.annotations", description="Sparv annotations to include."), Config("xml_export.source_annotations", description="List of annotations and attributes from the source data to include. Everything will be " @@ -23,9 +23,9 @@ Config("xml_export.include_empty_attributes", False, description="Whether to include attributes even when they are empty.") ]) -def pretty(doc: Document = Document(), - docid: AnnotationData = AnnotationData(""), - out: Export = Export("xml_pretty/[xml_export.filename]"), +def pretty(source_file: SourceFilename = SourceFilename(), + fileid: AnnotationData = AnnotationData(""), + out: Export = Export("xml_export.pretty/[xml_export.filename]"), token: Annotation = Annotation(""), word: Annotation = Annotation("[export.word]"), annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"), @@ -38,15 +38,15 @@ def pretty(doc: Document = Document(), """Export annotations to pretty XML in export_dir. Args: - doc: Name of the original document. - docid: Annotation with document IDs. + source_file: Name of the source file. + fileid: Annotation with file IDs. out: Path and filename pattern for resulting file. token: Annotation containing the token strings. word: Annotation containing the token strings. annotations: List of elements:attributes (annotations) to include. - source_annotations: List of elements:attributes from the original document + source_annotations: List of elements:attributes from the source file to be kept. If not specified, everything will be kept. - header_annotations: List of header elements from the original document to include + header_annotations: List of header elements from the source file to include in the export. If not specified, all headers will be kept. remove_namespaces: Whether to remove module "namespaces" from element and attribute names. Disabled by default. @@ -59,59 +59,71 @@ def pretty(doc: Document = Document(), token_name = token.name - # Read words and document ID + # Read words, file ID and XML namespaces word_annotation = list(word.read()) - docid_annotation = docid.read() + fileid_annotation = fileid.read() + xml_namespaces = Namespaces(source_file).read() # Get annotation spans, annotations list etc. 
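For reference, the renamed '{file}' placeholder in xml_export.filename is a per-source-file substitution. A rough sketch of how such a pattern expands; this helper is illustrative only, not Sparv's actual path resolution:

def resolve_export_name(pattern: str, source_file: str) -> str:
    """Fill the '{file}' placeholder with the name of the current source file."""
    return pattern.replace("{file}", source_file)

# Example: the default pattern "{file}_export.xml" for a source file called "document1".
print(resolve_export_name("xml_export.pretty/{file}_export.xml", "document1"))
# -> xml_export.pretty/document1_export.xml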
-    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
-                                                                  token_name=token_name,
-                                                                  remove_namespaces=remove_namespaces,
-                                                                  sparv_namespace=sparv_namespace,
-                                                                  source_namespace=source_namespace)
-    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
+    annotation_list, _, export_names = util.export.get_annotation_names(annotations, source_annotations, source_file=source_file,
+                                                                         token_name=token_name,
+                                                                         remove_namespaces=remove_namespaces,
+                                                                         sparv_namespace=sparv_namespace,
+                                                                         source_namespace=source_namespace,
+                                                                         xml_mode=True)
+    if token not in annotation_list:
+        logger.warning("The 'xml_export:pretty' export requires the annotation <token> for the output to include the "
+                       "source text. Make sure to add <token> to the list of export annotations.")
+    h_annotations, h_export_names = util.export.get_header_names(header_annotations, source_file=source_file)
     export_names.update(h_export_names)
-    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
-                                                              doc=doc, split_overlaps=True)
+    span_positions, annotation_dict = util.export.gather_annotations(annotation_list, export_names, h_annotations,
+                                                                     source_file=source_file, split_overlaps=True)
     xmlstr = xml_utils.make_pretty_xml(span_positions, annotation_dict, export_names, token_name, word_annotation,
-                                       docid_annotation, include_empty_attributes, sparv_namespace)
+                                       fileid_annotation, include_empty_attributes, sparv_namespace, xml_namespaces)
 
     # Write XML to file
-    with open(out, mode="w") as outfile:
+    with open(out, mode="w", encoding="utf-8") as outfile:
         outfile.write(xmlstr)
-    log.info("Exported: %s", out)
+    logger.info("Exported: %s", out)
 
 
 @exporter("Combined XML export (all results in one file)", config=[
     Config("xml_export.filename_combined", default="[metadata.id].xml",
-           description="Filename of resulting combined XML.")
+           description="Filename of resulting combined XML."),
+    Config("xml_export.include_version_info", default=True,
+           description="Whether to include annotation version info in the combined XML.")
 ])
 def combined(corpus: Corpus = Corpus(),
-             out: Export = Export("[xml_export.filename_combined]"),
-             docs: AllDocuments = AllDocuments(),
-             xml_input: ExportInput = ExportInput("xml_pretty/[xml_export.filename]", all_docs=True)):
+             out: Export = Export("xml_export.combined/[xml_export.filename_combined]"),
+             source_files: AllSourceFilenames = AllSourceFilenames(),
+             xml_input: ExportInput = ExportInput("xml_export.pretty/[xml_export.filename]", all_files=True),
+             version_info: ExportInput = ExportInput("version_info/info_[metadata.id].yaml"),
+             include_version_info: bool = Config("xml_export.include_version_info")):
     """Combine XML export files into a single XML file."""
-    xml_utils.combine(corpus, out, docs, xml_input)
+    if include_version_info:
+        xml_utils.combine(corpus, out, source_files, xml_input, version_info)
+    else:
+        xml_utils.combine(corpus, out, source_files, xml_input)
 
 
 @exporter("Compressed combined XML export", config=[
     Config("xml_export.filename_compressed", default="[metadata.id].xml.bz2",
            description="Filename of resulting compressed combined XML.")
 ])
-def compressed(out: Export = Export("[xml_export.filename_compressed]"),
-               xmlfile: ExportInput = ExportInput("[xml_export.filename_combined]")):
+def compressed(out: Export = Export("xml_export.combined/[xml_export.filename_compressed]"),
+               xmlfile: ExportInput = ExportInput("xml_export.combined/[xml_export.filename_combined]")):
     """Compress combined XML
export.""" xml_utils.compress(xmlfile, out) -@installer("Copy compressed unscrambled XML to remote host", config=[ - Config("xml_export.export_original_host", "", description="Remote host to copy XML export to."), - Config("xml_export.export_original_path", "", description="Path on remote host to copy XML export to.") +@installer("Copy compressed XML to remote host", config=[ + Config("xml_export.export_host", "", description="Remote host to copy XML export to."), + Config("xml_export.export_path", "", description="Path on remote host to copy XML export to.") ]) -def install_original(corpus: Corpus = Corpus(), - xmlfile: ExportInput = ExportInput("[xml_export.filename_compressed]"), - out: OutputCommonData = OutputCommonData("xml_export.install_export_pretty_marker"), - export_path: str = Config("xml_export.export_original_path"), - host: str = Config("xml_export.export_original_host")): - """Copy compressed combined unscrambled XML to remote host.""" - xml_utils.install_compressed_xml(corpus, xmlfile, out, export_path, host) +def install(corpus: Corpus = Corpus(), + bz2file: ExportInput = ExportInput("xml_export.combined/[xml_export.filename_compressed]"), + out: OutputCommonData = OutputCommonData("xml_export.install_export_pretty_marker"), + export_path: str = Config("xml_export.export_path"), + host: str = Config("xml_export.export_host")): + """Copy compressed combined XML to remote host.""" + xml_utils.install_compressed_xml(corpus, bz2file, out, export_path, host) diff --git a/sparv/modules/xml_export/scrambled.py b/sparv/modules/xml_export/scrambled.py index ed7182cc..fc9789c9 100644 --- a/sparv/modules/xml_export/scrambled.py +++ b/sparv/modules/xml_export/scrambled.py @@ -1,22 +1,21 @@ """Export annotated corpus data to scrambled xml.""" -import logging import os -import sparv.util as util -from sparv import (AllDocuments, Annotation, AnnotationData, Config, Corpus, Document, Export, ExportAnnotations, - ExportInput, OutputCommonData, SourceAnnotations, exporter, installer) +from sparv.api import (AllSourceFilenames, Annotation, AnnotationData, Config, Corpus, Export, ExportAnnotations, + ExportInput, Namespaces, OutputCommonData, SourceAnnotations, SourceFilename, SparvErrorMessage, + exporter, get_logger, installer, util) from . import xml_utils -log = logging.getLogger(__name__) +logger = get_logger(__name__) @exporter("Scrambled XML export", config=[ Config("xml_export.scramble_on", description="Annotation to use for scrambling.") ]) -def scrambled(doc: Document = Document(), - docid: AnnotationData = AnnotationData(""), - out: Export = Export("xml_scrambled/[xml_export.filename]"), +def scrambled(source_file: SourceFilename = SourceFilename(), + fileid: AnnotationData = AnnotationData(""), + out: Export = Export("xml_export.scrambled/[xml_export.filename]"), chunk: Annotation = Annotation("[xml_export.scramble_on]"), chunk_order: Annotation = Annotation("[xml_export.scramble_on]:misc.number_random"), token: Annotation = Annotation(""), @@ -28,63 +27,77 @@ def scrambled(doc: Document = Document(), source_namespace: str = Config("export.source_namespace"), include_empty_attributes: bool = Config("xml_export.include_empty_attributes")): """Export annotations to scrambled XML.""" + # Read words, file ID and XML namespaces + word_annotation = list(word.read()) + chunk_order = list(chunk_order.read()) + fileid_annotation = fileid.read() + xml_namespaces = Namespaces(source_file).read() + # Get annotation spans, annotations list etc. 
-    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
-                                                                  token_name=token.name,
-                                                                  remove_namespaces=remove_namespaces,
-                                                                  sparv_namespace=sparv_namespace,
-                                                                  source_namespace=source_namespace)
+    annotation_list, _, export_names = util.export.get_annotation_names(annotations, source_annotations,
+                                                                         source_file=source_file,
+                                                                         token_name=token.name,
+                                                                         remove_namespaces=remove_namespaces,
+                                                                         sparv_namespace=sparv_namespace,
+                                                                         source_namespace=source_namespace,
+                                                                         xml_mode=True)
+    if token not in annotation_list:
+        logger.warning("The 'xml_export:scrambled' export requires the annotation <token> for the output to include "
+                       "the source text. Make sure to add <token> to the list of export annotations.")
     if chunk not in annotation_list:
-        raise util.SparvErrorMessage(
+        raise SparvErrorMessage(
             "The annotation used for scrambling ({}) needs to be included in the output.".format(chunk))
-    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc,
-                                                              split_overlaps=True)
-
-    # Read words and document ID
-    word_annotation = list(word.read())
-    chunk_order = list(chunk_order.read())
-    docid_annotation = docid.read()
+    span_positions, annotation_dict = util.export.gather_annotations(annotation_list, export_names,
+                                                                     source_file=source_file, split_overlaps=True)
 
     # Reorder chunks
-    new_span_positions = util.scramble_spans(span_positions, chunk.name, chunk_order)
+    new_span_positions = util.export.scramble_spans(span_positions, chunk.name, chunk_order)
 
     # Construct XML string
     xmlstr = xml_utils.make_pretty_xml(new_span_positions, annotation_dict, export_names, token.name, word_annotation,
-                                       docid_annotation, include_empty_attributes, sparv_namespace)
+                                       fileid_annotation, include_empty_attributes, sparv_namespace, xml_namespaces)
 
     # Create export dir
     os.makedirs(os.path.dirname(out), exist_ok=True)
 
     # Write XML to file
-    with open(out, mode="w") as outfile:
+    with open(out, mode="w", encoding="utf-8") as outfile:
         outfile.write(xmlstr)
-    log.info("Exported: %s", out)
+    logger.info("Exported: %s", out)
 
 
 @exporter("Combined scrambled XML export")
 def combined_scrambled(corpus: Corpus = Corpus(),
-                       out: Export = Export("[metadata.id]_scrambled.xml"),
-                       docs: AllDocuments = AllDocuments(),
-                       xml_input: ExportInput = ExportInput("xml_scrambled/[xml_export.filename]", all_docs=True)):
+                       out: Export = Export("xml_export.combined_scrambled/[metadata.id]_scrambled.xml"),
+                       source_files: AllSourceFilenames = AllSourceFilenames(),
+                       xml_input: ExportInput = ExportInput("xml_export.scrambled/[xml_export.filename]",
+                                                            all_files=True),
+                       version_info: ExportInput = ExportInput("version_info/info_[metadata.id].yaml"),
+                       include_version_info: bool = Config("xml_export.include_version_info")):
     """Combine XML export files into a single XML file."""
-    xml_utils.combine(corpus, out, docs, xml_input)
+    if include_version_info:
+        xml_utils.combine(corpus, out, source_files, xml_input, version_info)
+    else:
+        xml_utils.combine(corpus, out, source_files, xml_input)
 
 
 @exporter("Compressed combined scrambled XML export")
-def compressed_scrambled(out: Export = Export("[metadata.id]_scrambled.xml.bz2"),
-                         xmlfile: ExportInput = ExportInput("[metadata.id]_scrambled.xml")):
+def compressed_scrambled(out: Export = Export("xml_export.combined_scrambled/[metadata.id]_scrambled.xml.bz2"),
+                         xmlfile: ExportInput = ExportInput(
+                             "xml_export.combined_scrambled/[metadata.id]_scrambled.xml")):
     """Compress combined XML export."""
     xml_utils.compress(xmlfile, out)
 
 
 @installer("Copy
compressed scrambled XML to remote host", config=[ - Config("xml_export.export_host", "", description="Remote host to copy scrambled XML export to"), - Config("xml_export.export_path", "", description="Path on remote host to copy scrambled XML export to") + Config("xml_export.export_scrambled_host", "", description="Remote host to copy scrambled XML export to"), + Config("xml_export.export_scrambled_path", "", description="Path on remote host to copy scrambled XML export to") ]) def install_scrambled(corpus: Corpus = Corpus(), - xmlfile: ExportInput = ExportInput("[metadata.id]_scrambled.xml"), + bz2file: ExportInput = ExportInput( + "xml_export.combined_scrambled/[metadata.id]_scrambled.xml.bz2"), out: OutputCommonData = OutputCommonData("xml_export.install_export_scrambled_marker"), - export_path: str = Config("xml_export.export_path"), - host: str = Config("xml_export.export_host")): + export_path: str = Config("xml_export.export_scrambled_path"), + host: str = Config("xml_export.export_scrambled_host")): """Copy compressed combined scrambled XML to remote host.""" - xml_utils.install_compressed_xml(corpus, xmlfile, out, export_path, host) + xml_utils.install_compressed_xml(corpus, bz2file, out, export_path, host) diff --git a/sparv/modules/xml_export/xml_utils.py b/sparv/modules/xml_export/xml_utils.py index cdd29963..cbe971b1 100644 --- a/sparv/modules/xml_export/xml_utils.py +++ b/sparv/modules/xml_export/xml_utils.py @@ -2,30 +2,31 @@ import bz2 import io -import logging import os import re import xml.etree.ElementTree as etree +from shutil import copyfileobj from typing import Optional -import sparv.util as util +from sparv.api import SparvErrorMessage, get_logger, util -log = logging.getLogger(__name__) +logger = get_logger(__name__) INDENTATION = " " -def make_pretty_xml(span_positions, annotation_dict, export_names, token_name: str, word_annotation, docid, - include_empty_attributes: bool, sparv_namespace: Optional[str] = None): +def make_pretty_xml(span_positions, annotation_dict, export_names, token_name: str, word_annotation, fileid, + include_empty_attributes: bool, sparv_namespace: Optional[str] = None, + xml_namespaces: Optional[dict] = None): """Create a pretty formatted XML string from span_positions. Used by pretty and sentence_scrambled. """ # Root tag sanity check if not valid_root(span_positions[0], span_positions[-1]): - raise util.SparvErrorMessage("Root tag is missing! If you have manually specified which elements to include, " - "make sure to include an element that encloses all other included elements and " - "text content.") + raise SparvErrorMessage("Root tag is missing! 
If you have manually specified which elements to include, " + "make sure to include an element that encloses all other included elements and " + "text content.") # Create root node root_span = span_positions[0][2] @@ -39,6 +40,8 @@ def make_pretty_xml(span_positions, annotation_dict, export_names, token_name: s last_node = None inside_token = False + register_namespaces(xml_namespaces) + def handle_subtoken_text(position, last_start_position, last_end_position, node, token_text): """Handle text for subtoken elements.""" if last_start_position < last_end_position < position: @@ -54,7 +57,7 @@ def handle_subtoken_text(position, last_start_position, last_end_position, node, # Handle headers if span.is_header: if instruction == "open": - header = annotation_dict[span.name][util.HEADER_CONTENTS][span.index] + header = annotation_dict[span.name][util.constants.HEADER_CONTENTS][span.index] # Replace any leading tabs with spaces header = re.sub(r"^\t+", lambda m: INDENTATION * len(m.group()), header, flags=re.MULTILINE) header_xml = etree.fromstring(header) @@ -69,9 +72,10 @@ def handle_subtoken_text(position, last_start_position, last_end_position, node, add_attrs(span.node, span.name, annotation_dict, export_names, span.index, include_empty_attributes) if span.overlap_id: if sparv_namespace: - span.node.set(f"{sparv_namespace}.{util.OVERLAP_ATTR}", f"{docid}-{span.overlap_id}") + span.node.set(f"{sparv_namespace}.{util.constants.OVERLAP_ATTR}", f"{fileid}-{span.overlap_id}") else: - span.node.set(f"{util.SPARV_DEFAULT_NAMESPACE}.{util.OVERLAP_ATTR}", f"{docid}-{span.overlap_id}") + span.node.set(f"{util.constants.SPARV_DEFAULT_NAMESPACE}.{util.constants.OVERLAP_ATTR}", + f"{fileid}-{span.overlap_id}") # Add text if this node is a token if span.name == token_name: @@ -97,12 +101,12 @@ def handle_subtoken_text(position, last_start_position, last_end_position, node, inside_token = False # Make sure closing node == top stack node - assert span == node_stack[-1], "Overlapping elements found: {}".format(node_stack[-2:]) + assert span == node_stack[-1], "Overlapping elements found. 
Expected {} but found {}".format(span, node_stack[-1]) # Pop stack and move on to next span node_stack.pop() # Pretty formatting of XML tree - util.indent_xml(root_span.node, indentation=INDENTATION) + util.misc.indent_xml(root_span.node, indentation=INDENTATION) # We use write() instead of tostring() here to be able to get an XML declaration stream = io.StringIO() @@ -110,12 +114,19 @@ def handle_subtoken_text(position, last_start_position, last_end_position, node, return stream.getvalue() -def valid_root(first_item, last_item): +def valid_root(first_item, last_item, true_root: bool = False): """Check the validity of the root tag.""" return (first_item[1] == "open" and last_item[1] == "close" and first_item[2].name == last_item[2].name - and first_item[2].index == last_item[2].index) + and first_item[2].index == last_item[2].index + and (not true_root or (first_item[0] == 0))) + + +def register_namespaces(xml_namespaces: dict): + """Register all namespace prefixes.""" + for prefix, uri in xml_namespaces.items(): + etree.register_namespace(prefix, uri) def add_attrs(node, annotation, annotation_dict, export_names, index, include_empty_attributes: bool): @@ -126,16 +137,22 @@ def add_attrs(node, annotation, annotation_dict, export_names, index, include_em node.set(export_name, attrib_values[index]) -def combine(corpus, out, docs, xml_input): +def combine(corpus, out, source_files, xml_input, version_info_file=None): """Combine xml_files into one single file.""" - xml_files = [xml_input.replace("{doc}", doc) for doc in docs] + xml_files = [xml_input.replace("{file}", file) for file in source_files] xml_files.sort() - with open(out, "w") as outf: + with open(out, "w", encoding="utf-8") as outf: print("", file=outf) + if version_info_file: + print("", file=outf) print('' % corpus.replace("&", "&").replace('"', """), file=outf) for infile in xml_files: - log.info("Read: %s", infile) - with open(infile) as inf: + logger.info("Read: %s", infile) + with open(infile, encoding="utf-8") as inf: for n, line in enumerate(inf): # Skip xml declaration if n == 0 and line.startswith("", file=outf) - log.info("Exported: %s" % out) + logger.info("Exported: %s" % out) def compress(xmlfile, out): - """Compress xmlfile to out.""" - with open(xmlfile) as f: - file_data = f.read() - compressed_data = bz2.compress(file_data.encode(util.UTF8)) - with open(out, "wb") as f: - f.write(compressed_data) + """Compress XML file using bzip2. + + Args: + xmlfile: Path to source file. + out: Path to target bz2 file. + """ + with open(xmlfile, "rb") as infile: + with bz2.BZ2File(out, "wb") as outfile: + copyfileobj(infile, outfile) -def install_compressed_xml(corpus, xmlfile, out, export_path, host): +def install_compressed_xml(corpus, bz2file, out, export_path, host): """Install xml file on remote server.""" if not host: raise Exception("No host provided! 
Export not installed.") filename = corpus + ".xml.bz2" remote_file_path = os.path.join(export_path, filename) - util.install_file(host, xmlfile, remote_file_path) + util.install.install_file(bz2file, host, remote_file_path) out.write("") diff --git a/sparv/modules/xml_import/xml_import.py b/sparv/modules/xml_import/xml_import.py index 5e0771e8..754a415d 100644 --- a/sparv/modules/xml_import/xml_import.py +++ b/sparv/modules/xml_import/xml_import.py @@ -1,16 +1,16 @@ """Parse XML source file.""" import copy -import logging +import re import unicodedata import xml.etree.ElementTree as etree from itertools import chain from typing import List -from sparv import Config, Document, Headers, Output, Source, SourceStructureParser, SourceStructure, Text, importer, \ - util +from sparv.api import (Config, Headers, Namespaces, Output, Source, SourceFilename, SourceStructure, + SourceStructureParser, SparvErrorMessage, Text, get_logger, importer, util) -log = logging.getLogger(__name__) +logger = get_logger(__name__) class XMLStructure(SourceStructureParser): @@ -46,7 +46,7 @@ def get_annotations(self, corpus_config: dict) -> List[str]: @importer("XML import", file_extension="xml", outputs=Config("xml_import.elements", []), config=[ - Config("xml_import.elements", [], description="List of elements and attributes in source document. Only needed for " + Config("xml_import.elements", [], description="List of elements and attributes in source file. Only needed for " "renaming or when used as input to other annotations, as everything " "is parsed whether listed or not."), Config("xml_import.skip", [], description="Elements and attributes to skip. " @@ -56,62 +56,69 @@ def get_annotations(self, corpus_config: dict) -> List[str]: Config("xml_import.header_data", [], description="List of header elements and attributes from which to extract " "metadata."), Config("xml_import.prefix", "", description="Optional prefix to add to annotation names."), - Config("xml_import.encoding", util.UTF8, description="Encoding of source document. Defaults to UTF-8."), + Config("xml_import.remove_namespaces", False, description="Remove XML namespaces upon import."), + Config("xml_import.encoding", util.constants.UTF8, description="Encoding of source file. Defaults to UTF-8."), Config("xml_import.keep_control_chars", False, description="Set to True if control characters should not be " "removed from the text."), Config("xml_import.normalize", "NFC", description="Normalize input using any of the following forms: " "'NFC', 'NFKC', 'NFD', and 'NFKD'.") ], structure=XMLStructure) -def parse(doc: Document = Document(), +def parse(filename: SourceFilename = SourceFilename(), source_dir: Source = Source(), elements: list = Config("xml_import.elements"), skip: list = Config("xml_import.skip"), header_elements: list = Config("xml_import.header_elements"), header_data: list = Config("xml_import.header_data"), prefix: str = Config("xml_import.prefix"), + remove_namespaces: bool = Config("xml_import.remove_namespaces"), encoding: str = Config("xml_import.encoding"), keep_control_chars: bool = Config("xml_import.keep_control_chars"), normalize: str = Config("xml_import.normalize")): """Parse XML source file and create annotation files. Args: - doc: Source document name. - source_dir: Directory containing source documents. - elements: List of elements and attributes in source document. Only needed for renaming, as everything is + filename: Source filename. + source_dir: Directory containing source files. 
+ elements: List of elements and attributes in source file. Only needed for renaming, as everything is parsed whether listed or not. skip: Elements and attributes to skip. Use elementname:@contents to skip contents as well. header_elements: Elements containing header metadata. Contents will not be included in corpus text. header_data: List of header elements and attributes from which to extract metadata. prefix: Optional prefix to add to annotations. - encoding: Encoding of source document. Defaults to UTF-8. + encoding: Encoding of source file. Defaults to UTF-8. keep_control_chars: Set to True to keep control characters in the text. normalize: Normalize input using any of the following forms: 'NFC', 'NFKC', 'NFD', and 'NFKD'. Defaults to 'NFC'. """ parser = SparvXMLParser(elements, skip, header_elements, header_data, source_dir, encoding, prefix, - keep_control_chars, normalize) - parser.parse(doc) + remove_namespaces, keep_control_chars, normalize) + parser.parse(filename) parser.save() class SparvXMLParser: """XML parser class for parsing XML.""" - def __init__(self, elements: list, skip: list, header_elements: list, headers: list, source_dir: Source, - encoding: str = util.UTF8, prefix: str = "", keep_control_chars: bool = True, normalize: str = "NFC"): + def __init__(self, elements: list, skip: list, header_elements: list, header_data: list, source_dir: Source, + encoding: str = util.constants.UTF8, prefix: str = "", remove_namespaces: bool = False, + keep_control_chars: bool = True, normalize: str = "NFC"): """Initialize XML parser.""" self.source_dir = source_dir self.encoding = encoding self.keep_control_chars = keep_control_chars self.normalize = normalize - self.doc = None + self.file = None self.prefix = prefix + self.remove_namespaces = remove_namespaces self.header_elements = header_elements - self.headers = {} + self.header_data = {} + self.unprocessed_header_data_elems = set() self.targets = {} # Index of elements and attributes that will be renamed during import self.data = {} # Metadata collected during parsing - self.text = [] # Text data of the document collected during parsing + self.text = [] # Text data of the source file collected during parsing + self.namespace_mapping = {} # Mapping of namespace prefix --> uri + self.namespace_mapping_reversed = {} # Mapping of uri --> namespace prefix # Parse elements argument @@ -126,7 +133,7 @@ def elsplit(elem): all_elems = set() renames = {} # Element list needs to be sorted to handle plain elements before attributes - for element, target in sorted(util.parse_annotation_list(elements)): + for element, target in sorted(util.misc.parse_annotation_list(elements)): element, attr = elsplit(element) all_elems.add((element, attr)) @@ -150,27 +157,28 @@ def elsplit(elem): if attr: self.data[element]["attrs"].add(attr) - for header in headers: + for header in header_data: header_source, _, header_target = header.partition(" as ") if not header_target: - raise util.SparvErrorMessage("The header '{}' needs to be bound to a target element.".format(header)) + raise SparvErrorMessage("The header '{}' needs to be bound to a target element.".format(header)) header_source, _, header_source_attrib = header_source.partition(":") header_source_root, _, header_source_rest = header_source.partition("/") - self.headers.setdefault(header_source_root, {}) - self.headers[header_source_root].setdefault(header_source_rest, []) - self.headers[header_source_root][header_source_rest].append({ + self.header_data.setdefault(header_source_root, {}) + 
self.header_data[header_source_root].setdefault(header_source_rest, []) + self.header_data[header_source_root][header_source_rest].append({ "source": header_source_attrib, "target": elsplit(header_target) }) + self.unprocessed_header_data_elems.add(header_source_root) self.skipped_elems = set(elsplit(elem) for elem in skip) assert self.skipped_elems.isdisjoint(all_elems), "skip and elements must be disjoint" - def parse(self, doc): + def parse(self, file): """Parse XML and build data structure.""" - self.doc = doc + self.file = file header_data = {} - source_file = self.source_dir.get_path(self.doc, ".xml") + source_file = self.source_dir.get_path(self.file, ".xml") def handle_element(element): """Handle element renaming, skipping and collection of data.""" @@ -182,7 +190,8 @@ def handle_element(element): if (name_orig, "*") in self.skipped_elems: attrs = {} for attr in attrs.copy(): - if (name_orig, attr) in self.skipped_elems: + attr_name = get_sparv_name(attr) + if (name_orig, attr_name) in self.skipped_elems: attrs.pop(attr) if name_orig in self.targets: @@ -190,13 +199,16 @@ def handle_element(element): name = self.targets[name_orig]["target"] attrs_tmp = {} for attr in attrs: - attrs_tmp[self.targets[name_orig]["attrs"].get(attr, attr)] = attrs[attr] + attr_name = get_sparv_name(attr) + attrs_tmp[self.targets[name_orig]["attrs"].get(attr_name, attr_name)] = attrs[attr] attrs = attrs_tmp else: name = name_orig + # Save attrs in data self.data.setdefault(name, {"attrs": set(), "elements": []}) - self.data[name]["attrs"].update(set(attrs.keys())) + attr_keys = [get_sparv_name(attr) for attr in attrs.keys()] + self.data[name]["attrs"].update(set(attr_keys)) # Add attribute data collected from header if name in header_data: @@ -204,56 +216,116 @@ def handle_element(element): self.data[name]["attrs"].update(set(header_data[name].keys())) del header_data[name] + attrs = {get_sparv_name(k): v for k, v in attrs.items()} self.data[name]["elements"].append( (start, start_subpos, end, end_subpos, name_orig, attrs) ) - def handle_raw_header(element: etree.Element, start_pos: int, start_subpos: int): + def handle_raw_header(element: etree.Element, tag_name: str, start_pos: int, start_subpos: int): """Save full header XML as string.""" # Save header as XML tmp_element = copy.deepcopy(element) tmp_element.tail = "" - self.data.setdefault(element.tag, {"attrs": {util.HEADER_CONTENTS}, "elements": []}) - self.data[element.tag]["elements"].append( - (start_pos, start_subpos, start_pos, start_subpos, element.tag, - {util.HEADER_CONTENTS: etree.tostring(tmp_element, method="xml", encoding="UTF-8").decode()}) + if self.remove_namespaces: + for e in tmp_element.iter(): + remove_namespaces(e) + self.data.setdefault(tag_name, {"attrs": {util.constants.HEADER_CONTENTS}, "elements": []}) + self.data[tag_name]["elements"].append( + (start_pos, start_subpos, start_pos, start_subpos, tag_name, + {util.constants.HEADER_CONTENTS: etree.tostring(tmp_element, method="xml", encoding="UTF-8").decode()}) ) + handle_header_data(element, tag_name) - handle_header_data(element) - - def handle_header_data(element: etree.Element): + def handle_header_data(element: etree.Element, tag_name: str = None): """Extract header metadata.""" - for header_path, header_sources in self.headers.get(element.tag, {}).items(): + if tag_name in self.unprocessed_header_data_elems: + self.unprocessed_header_data_elems.remove(tag_name) + for e in element.iter(): + if self.remove_namespaces: + remove_namespaces(e) + else: + # Extract and register 
all namespaces from the header and its children + get_sparv_name(e.tag) + for header_path, header_sources in self.header_data.get(tag_name, {}).items(): if not header_path: header_element = element else: - header_element = element.find(header_path) + xpath = annotation_to_xpath(header_path) + header_element = element.find(xpath) if header_element is not None: for header_source in header_sources: if header_source["source"]: - header_value = header_element.attrib.get(header_source["source"]) + source_name = annotation_to_xpath(header_source["source"]) + header_value = header_element.attrib.get(source_name) else: header_value = header_element.text.strip() if header_value: header_data.setdefault(header_source["target"][0], {}) header_data[header_source["target"][0]][header_source["target"][1]] = header_value + else: + logger.warning(f"Header data '{tag_name}/{header_path}' was not found in source data.") + + def iter_ns_declarations(): + """Iterate over namespace declarations in the source file.""" + for _, (prefix, uri) in etree.iterparse(source_file, events=["start-ns"]): + self.namespace_mapping[prefix] = uri + self.namespace_mapping_reversed[uri] = prefix + yield prefix, uri + + def get_sparv_name(xml_name: str): + """Get the sparv notation of a tag or attr name with regards to XML namespaces.""" + ns_uri, tag = get_namespace(xml_name) + if self.remove_namespaces: + return tag + tag_name = xml_name + if ns_uri: + ns_prefix = self.namespace_mapping_reversed.get(ns_uri, "") + if not ns_prefix: + for prefix, uri in iter_ns_declarations(): + if uri == ns_uri: + ns_prefix = prefix + break + tag_name = f"{ns_prefix}{util.constants.XML_NAMESPACE_SEP}{tag}" + return tag_name + + def annotation_to_xpath(path: str): + """Convert a sparv header path into a real xpath.""" + sep = re.escape(util.constants.XML_NAMESPACE_SEP) + m = re.finditer(fr"([^/+:]+){sep}", path) or [] + for i in m: + uri = "{" + self.namespace_mapping[i.group(1)] + "}" + path = re.sub(re.escape(i.group(0)), uri, path, count=1) + return path + + def remove_namespaces(element: etree.Element): + """Remove namespaces from element and its attributes.""" + uri, _ = get_namespace(element.tag) + if uri: + element.tag = element.tag[len("{" + uri + "}"):] + for k in list(element.attrib.keys()): + uri, _ = get_namespace(k) + if uri: + element.set(k[len("{" + uri + "}"):], element.attrib[k]) + element.attrib.pop(k) def iter_tree(element: etree.Element, start_pos: int = 0, start_subpos: int = 0): - """Walk though whole XML and handle elements and text data.""" - if (element.tag, "@contents") in self.skipped_elems: + """Walk through whole XML and handle elements and text data.""" + tag_name = get_sparv_name(element.tag) + + if (tag_name, "@contents") in self.skipped_elems: # Skip whole element and all its contents if element.tail: self.text.append(element.tail) return 0, len(element.tail or ""), 0 - elif element.tag in self.header_elements: + elif tag_name in self.header_elements: if element.tail: self.text.append(element.tail) - handle_raw_header(element, start_pos, start_subpos) + handle_raw_header(element, tag_name, start_pos, start_subpos) return 0, len(element.tail or ""), 0 - elif element.tag in self.headers: - handle_header_data(element) + elif tag_name in self.header_data: + handle_header_data(element, tag_name) element_length = 0 if element.text: element_length = len(element.text) @@ -271,31 +343,45 @@ def iter_tree(element: etree.Element, start_pos: int = 0, start_subpos: int = 0) end_subpos += 1 else: end_subpos = 0 - 
handle_element([start_pos, start_subpos, end_pos, end_subpos, element.tag, element.attrib]) + handle_element([start_pos, start_subpos, end_pos, end_subpos, tag_name, element.attrib]) if element.tail: self.text.append(element.tail) return element_length, len(element.tail or ""), end_subpos if self.keep_control_chars and not self.normalize: - tree = etree.parse(source_file) + try: + tree = etree.parse(source_file) + except Exception as e: + raise SparvErrorMessage(f"The XML input file could not be parsed. Error: {str(e)}") root = tree.getroot() else: - text = source_file.read_text() + text = source_file.read_text(encoding="utf-8") if not self.keep_control_chars: - text = util.remove_control_characters(text) + text = util.misc.remove_control_characters(text) if self.normalize: text = unicodedata.normalize(self.normalize, text) - root = etree.fromstring(text) + try: + root = etree.fromstring(text) + except Exception as e: + raise SparvErrorMessage(f"The XML input file could not be parsed. Error: {str(e)}") iter_tree(root) if header_data: - log.warning("Some header data could not be bound to target elements.") + logger.warning("Some header data could not be bound to target elements.") + + if self.unprocessed_header_data_elems: + logger.warning("{} header data element{} {} not found in source data: '{}'.".format( + "Some" if len(self.unprocessed_header_data_elems) > 1 else "One", + "s" if len(self.unprocessed_header_data_elems) > 1 else "", + "were" if len(self.unprocessed_header_data_elems) > 1 else "was", + "', '".join(self.unprocessed_header_data_elems) + )) def save(self): """Save text data and annotation files to disk.""" text = "".join(self.text) - Text(self.doc).write(text) + Text(self.file).write(text) structure = [] header_elements = [] @@ -318,42 +404,61 @@ def save(self): structure.append(full_element) # Sort spans and annotations by span position (required by Sparv) - if attributes: + if attributes and spans: attr_names, attr_values = list(zip(*attributes.items())) spans, *attr_values = list(zip(*sorted(zip(spans, *attr_values), key=lambda x: x[0]))) attributes = dict(zip(attr_names, attr_values)) else: spans.sort() - Output(full_element, doc=self.doc).write(spans) + Output(full_element, source_file=self.file).write(spans) for attr in attributes: full_attr = "{}.{}".format(self.prefix, attr) if self.prefix else attr - Output("{}:{}".format(full_element, full_attr), doc=self.doc).write(attributes[attr], - allow_newlines=is_header) + Output("{}:{}".format(full_element, full_attr), source_file=self.file).write(attributes[attr], + allow_newlines=is_header) if element not in self.header_elements: structure.append("{}:{}".format(full_element, full_attr)) # Save list of all elements and attributes to a file (needed for export) - SourceStructure(self.doc).write(structure) + SourceStructure(self.file).write(structure) if header_elements: # Save list of all header elements to a file - Headers(self.doc).write(header_elements) + Headers(self.file).write(header_elements) + + # Save namespace mapping (URI to prefix) + if self.namespace_mapping: + Namespaces(self.file).write(self.namespace_mapping) + + +def get_namespace(xml_name: str): + """Search for a namespace in tag and return a tuple (URI, tagname).""" + m = re.match(r"\{(.*)\}(.+)", xml_name) + return (m.group(1), m.group(2)) if m else ("", xml_name) def analyze_xml(source_file): """Analyze an XML file and return a list of elements and attributes.""" elements = set() - parser = etree.iterparse(source_file, events=("start", "end")) + parser = 
etree.iterparse(source_file, events=("start-ns", "start")) event, root = next(parser) + namespace_map = {} for event, element in chain([(event, root)], parser): - if event == "start": - elements.add(element.tag) + if event == "start-ns": + prefix, uri = element + namespace_map[uri] = prefix + elif event == "start": + tagname = element.tag + uri, tag = get_namespace(tagname) + if uri: + prefix = namespace_map[uri] + tagname = f"{prefix}{util.constants.XML_NAMESPACE_SEP}{tag}" + elements.add(tagname) for attr in element.attrib: - elements.add(f"{element.tag}:{attr}") + elements.add(f"{tagname}:{attr}") root.clear() return elements diff --git a/sparv/resources/config/config_default.yaml b/sparv/resources/config/config_default.yaml index 0623c2d5..9b25635c 100644 --- a/sparv/resources/config/config_default.yaml +++ b/sparv/resources/config/config_default.yaml @@ -3,7 +3,7 @@ #=============================================================================== metadata: - # Language of the input documents, specified as ISO 639-3 code + # Language of the input files, specified as ISO 639-3 code language: swe #=============================================================================== @@ -22,6 +22,7 @@ classes: sentence: segment.sentence token: segment.token "token:word": :misc.word + "token:ref": :misc.ref #=============================================================================== # Export Settings @@ -45,11 +46,11 @@ export: # Common settings for XML exporters # xml_export: - # Export hosts and paths (targets for install_export and install_export_original) + # Export hosts and paths (targets for install and install_scrambled) # export_host: "" # export_path: "" - # export_original_host: "" - # export_original_path: "" + # export_scrambled_host: "" + # export_scrambled_path: "" #=============================================================================== # Module-specific Settings @@ -64,17 +65,3 @@ segment: sentence_chunk: , # Chunk to use for automatic tokenisation token_chunk: - -# korp: - # Password protected corpus - # protected: false - - # Remote host name for installation of both corpus data and database - # remote_host: - - # Paths on remote server (targets for install_corpus) - # remote_cwb_datadir: - # remote_cwb_registry: - - # Database name for relations, lemgram index and timespan - # mysql_dbname: diff --git a/sparv/resources/config/presets/FREELING_FULL.yaml b/sparv/resources/config/presets/FREELING_FULL.yaml deleted file mode 100644 index e4e389b5..00000000 --- a/sparv/resources/config/presets/FREELING_FULL.yaml +++ /dev/null @@ -1,33 +0,0 @@ -#=============================================================================== -# FreeLing Annotation Presets with named entities -#=============================================================================== - -languages: - - cat - - deu - - eng - - spa - - por - - -classes: - sentence: freeling.sentence - token: freeling.token - "token:word": freeling.token:freeling.word - "token:pos": freeling.token:freeling.pos - "token:baseform": freeling.token:freeling.baseform - - -presets: - all: - - token - - sentence - - token: - - freeling.token:freeling.upos - - freeling.token:freeling.pos - - freeling.token:freeling.baseform - - freeling.token:freeling.ne_type - - sentence: - - freeling.sentence:misc.id diff --git a/sparv/resources/config/presets/MALT.yaml b/sparv/resources/config/presets/MALT.yaml index b8e16d1f..1b503079 100644 --- a/sparv/resources/config/presets/MALT.yaml +++ b/sparv/resources/config/presets/MALT.yaml @@ -8,9 
+8,10 @@ classes: "token:deprel": :malt.deprel "token:dephead": :malt.dephead "token:dephead_ref": :malt.dephead_ref + "token:ref": :malt.ref presets: all: - :malt.deprel - :malt.dephead_ref - - :misc.number_rel_ as ref + - :malt.ref diff --git a/sparv/resources/config/presets/SBX_1800.yaml b/sparv/resources/config/presets/SBX_1800.yaml new file mode 100644 index 00000000..293266ac --- /dev/null +++ b/sparv/resources/config/presets/SBX_1800.yaml @@ -0,0 +1,35 @@ +#=============================================================================== +# Språkbanken's Standard Annotation Presets for Swedish from the 1800's +#=============================================================================== +languages: + - swe-1800 + + +presets: + paragraph: + - SWE_1800.all + - DATETIME.all + - :misc.id as _id + - as lemma + - as lex + - as dephead + - :misc.tail as _tail + - :misc.head as _head + - :geo.geo_context as _geocontext + - :geo.geo_context as _geocontext + no-paragraph: + - paragraph + - not :geo.geo_context + paragraph-no-date: + - SWE_1800.all + - :misc.id as _id + - as lemma + - as lex + - as dephead + - :misc.tail as _tail + - :misc.head as _head + - :geo.geo_context as _geocontext + - :geo.geo_context as _geocontext + no-paragraph-no-date: + - paragraph-no-date + - not :geo.geo_context diff --git a/sparv/resources/config/presets/SB_DEFAULT.yaml b/sparv/resources/config/presets/SBX_DEFAULT.yaml similarity index 62% rename from sparv/resources/config/presets/SB_DEFAULT.yaml rename to sparv/resources/config/presets/SBX_DEFAULT.yaml index 1a5f33ce..006aaaf2 100644 --- a/sparv/resources/config/presets/SB_DEFAULT.yaml +++ b/sparv/resources/config/presets/SBX_DEFAULT.yaml @@ -4,24 +4,35 @@ languages: - swe +classes: + "token:msd": :stanza.msd_hunpos_backoff + "token:pos": :stanza.pos_hunpos_backoff presets: - paragraph: + standard-token: - SWE_DEFAULT.all - - DATETIME.all - :saldo.baseform2 as lemma - :saldo.lemgram as lex - as dephead + - not :stanza.msd + - not :stanza.pos + - as msd + - as pos + - :stanza.msd_hunpos_backoff_info as msd_annotator + - :misc.tail as _tail + - :misc.head as _head + paragraph: + - standard-token + - DATETIME.all + - :misc.id as _id - :geo.geo_context as _geocontext - :geo.geo_context as _geocontext no-paragraph: - paragraph - not :geo.geo_context paragraph-no-date: - - SWE_DEFAULT.all - - :saldo.baseform2 as lemma - - :saldo.lemgram as lex - - as dephead + - standard-token + - :misc.id as _id - :geo.geo_context as _geocontext - :geo.geo_context as _geocontext no-paragraph-no-date: diff --git a/sparv/resources/config/presets/FREELING.yaml b/sparv/resources/config/presets/SBX_FREELING.yaml similarity index 50% rename from sparv/resources/config/presets/FREELING.yaml rename to sparv/resources/config/presets/SBX_FREELING.yaml index 1d55c973..25b1d55a 100644 --- a/sparv/resources/config/presets/FREELING.yaml +++ b/sparv/resources/config/presets/SBX_FREELING.yaml @@ -13,11 +13,10 @@ languages: classes: - sentence: freeling.sentence - token: freeling.token - "token:word": freeling.token:freeling.word - "token:pos": freeling.token:freeling.pos - "token:baseform": freeling.token:freeling.baseform + sentence: sbx_freeling.sentence + token: sbx_freeling.token + "token:pos": sbx_freeling.token:sbx_freeling.pos + "token:baseform": sbx_freeling.token:sbx_freeling.baseform presets: @@ -26,9 +25,9 @@ presets: - sentence token: - - freeling.token:freeling.upos - - freeling.token:freeling.pos - - freeling.token:freeling.baseform + - sbx_freeling.token:sbx_freeling.upos + - 
sbx_freeling.token:sbx_freeling.pos + - sbx_freeling.token:sbx_freeling.baseform sentence: - - freeling.sentence:misc.id + - sbx_freeling.sentence:misc.id diff --git a/sparv/resources/config/presets/SBX_FREELING_FULL.yaml b/sparv/resources/config/presets/SBX_FREELING_FULL.yaml new file mode 100644 index 00000000..4410ca89 --- /dev/null +++ b/sparv/resources/config/presets/SBX_FREELING_FULL.yaml @@ -0,0 +1,32 @@ +#=============================================================================== +# FreeLing Annotation Presets with named entities +#=============================================================================== + +languages: + - cat + - deu + - eng + - spa + - por + + +classes: + sentence: sbx_freeling.sentence + token: sbx_freeling.token + "token:pos": sbx_freeling.token:sbx_freeling.pos + "token:baseform": sbx_freeling.token:sbx_freeling.baseform + + +presets: + all: + - token + - sentence + + token: + - sbx_freeling.token:sbx_freeling.upos + - sbx_freeling.token:sbx_freeling.pos + - sbx_freeling.token:sbx_freeling.baseform + - sbx_freeling.token:sbx_freeling.ne_type + + sentence: + - sbx_freeling.sentence:misc.id diff --git a/sparv/resources/config/presets/STANFORD.yaml b/sparv/resources/config/presets/STANFORD.yaml index 26ea366b..7ab1d30f 100644 --- a/sparv/resources/config/presets/STANFORD.yaml +++ b/sparv/resources/config/presets/STANFORD.yaml @@ -9,11 +9,11 @@ languages: classes: sentence: stanford.sentence token: stanford.token - "token:word": stanford.token:stanford.word "token:pos": stanford.token:stanford.pos "token:baseform": stanford.token:stanford.baseform "token:deprel": :stanford.deprel "token:dephead_ref": :stanford.dehead_ref + "token:ref": :stanford.ref presets: diff --git a/sparv/resources/config/presets/STANZA.yaml b/sparv/resources/config/presets/STANZA.yaml new file mode 100644 index 00000000..0b972c16 --- /dev/null +++ b/sparv/resources/config/presets/STANZA.yaml @@ -0,0 +1,42 @@ +#=============================================================================== +# Stanza Annotation Presets (not for Swedish) +#=============================================================================== + +languages: + - eng + + +classes: + sentence: stanza.sentence + named_entity: stanza.ne + token: stanza.token + "token:pos": :stanza.pos + "token:baseform": :stanza.baseform + "token:ufeats": :stanza.ufeats + "token:deprel": :stanza.deprel + "token:dephead": :stanza.dehead + "token:dephead_ref": :stanza.dehead_ref + "token:ref": :stanza.ref + + +presets: + all: + - sentence + - named-entity + - token + + token: + - :stanza.pos + - :stanza.upos + - :stanza.baseform + - :stanza.ufeats + - :stanza.dephead_ref + - :stanza.deprel + - :stanza.ref + + sentence: + - :misc.id + + named-entity: + - stanza.ne + - stanza.ne:stanza.ne_type diff --git a/sparv/resources/config/presets/SWE_1800.yaml b/sparv/resources/config/presets/SWE_1800.yaml new file mode 100644 index 00000000..e7daaf41 --- /dev/null +++ b/sparv/resources/config/presets/SWE_1800.yaml @@ -0,0 +1,51 @@ +#=============================================================================== +# Annotations for older Swedish from the 1800's (swe-1800) +#=============================================================================== +languages: + - swe-1800 + + +classes: + sentence: segment.sentence + token: segment.token + "token:word": :misc.word + "token:pos": :hunpos.pos + "token:msd": :hunpos.msd_hist + "token:baseform": :hist.baseform + "token:lemgram": :hist.combined_lemgrams + "token:sense": :hist.sense + 
"token:deprel": :stanza.deprel + "token:dephead": :stanza.dephead + "token:dephead_ref": :stanza.dephead_ref + "token:ref": :stanza.ref + +presets: + all: + - SWE_DEFAULT.swener + - SWE_DEFAULT.sentence + - SWE_DEFAULT.text-lexical-class + - SWE_DEFAULT.readability + - all-token-annotations + + #==================================== + # Token Annotations + #==================================== + + all-token-annotations: + - hist-token-annotations + - stanza-depparse + - SWE_DEFAULT.sentiment + - SWE_DEFAULT.token-lexical-class + + hist-token-annotations: + - :hunpos.pos + - :hunpos.msd_hist as msd + - :hist.baseform + - :hist.combined_lemgrams as lemgram + - :hist.sense + # - :hist.homograph_set + + stanza-depparse: + - :stanza.deprel + - :stanza.dephead_ref + - :stanza.ref diff --git a/sparv/resources/config/presets/SWE_DEFAULT.yaml b/sparv/resources/config/presets/SWE_DEFAULT.yaml index 6fbe8180..a7944937 100644 --- a/sparv/resources/config/presets/SWE_DEFAULT.yaml +++ b/sparv/resources/config/presets/SWE_DEFAULT.yaml @@ -12,11 +12,14 @@ classes: "token:word": :misc.word "token:pos": :stanza.pos "token:msd": :stanza.msd + "token:ufeats": :stanza.ufeats "token:baseform": :saldo.baseform2 "token:sense": :wsd.sense + "token:lemgram": :saldo.lemgram "token:deprel": :stanza.deprel "token:dephead": :stanza.dephead "token:dephead_ref": :stanza.dephead_ref + "token:ref": :stanza.ref presets: all: @@ -46,9 +49,10 @@ presets: stanza: - :stanza.msd - :stanza.pos + - :stanza.ufeats - :stanza.deprel - :stanza.dephead_ref - - :misc.number_rel_ as ref + - :stanza.ref sentiment: - :sensaldo.sentiment_score diff --git a/sparv/resources/config/presets/SWE_FSV.yaml b/sparv/resources/config/presets/SWE_FSV.yaml new file mode 100644 index 00000000..61effb38 --- /dev/null +++ b/sparv/resources/config/presets/SWE_FSV.yaml @@ -0,0 +1,31 @@ +#=============================================================================== +# Annotations for Old Swedish (swe-fsv) +#=============================================================================== +languages: + - swe-fsv + + +classes: + sentence: segment.sentence + token: segment.token + "token:word": :misc.word + "token:baseform": :hist.baseform + "token:lemgram": :hist.combined_lemgrams + +presets: + all: + - sentence + - token-annotations + + #==================================== + # Token Annotations + #==================================== + + token-annotations: + - :hist.spelling_variants + - :hist.baseform + - :hist.combined_lemgrams as lemgram + - :hist.homograph_set + + sentence: + - :misc.id diff --git a/sparv/util/__init__.py b/sparv/util/__init__.py deleted file mode 100644 index 630d4ae7..00000000 --- a/sparv/util/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from . import system, tagsets -from .constants import * -from .export import gather_annotations, get_annotation_names, get_header_names, scramble_spans -from .install import install_directory, install_file, install_mysql -from .misc import * -from .system import call_binary, call_java, clear_directory, find_binary, kill_process, rsync diff --git a/sparv/util/lmflexicon.py b/sparv/util/lmflexicon.py deleted file mode 100644 index 5fb985c3..00000000 --- a/sparv/util/lmflexicon.py +++ /dev/null @@ -1,172 +0,0 @@ -"""Parses an lmf-lexicon into the standard Saldo format. 
- -Does not handle msd-information well -Does not mark particles -Does handle multiwords expressions with gaps - -To pickle a file, run -lmflexicon.lmf_to_pickle("swedberg.xml", "swedberg.pickle", skip_multiword=False) -lmlexicon place in subversion: https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/dalinm -""" - -import logging -import re -import xml.etree.ElementTree as etree - -import sparv.util as util -from sparv.modules.saldo.saldo_model import HashableDict, SaldoLexicon - -log = logging.getLogger(__name__) - - -def lmf_to_pickle(xml, filename, annotation_elements=("writtenForm", "lemgram"), skip_multiword=False): - """Read an XML dictionary and save as a pickle file.""" - xml_lexicon = read_lmf(xml, annotation_elements, skip_multiword=skip_multiword) - SaldoLexicon.save_to_picklefile(filename, xml_lexicon) - - -# TODO: Can this be united with saldo.read_lmf ? -def read_lmf(xml, annotation_elements=("writtenForm", "lemgram"), tagset="SUC", verbose=True, skip_multiword=False, translate_tags=True): - """Read the XML version of a morphological lexicon in lmf format (dalinm.xml, swedbergm.xml). - - Return a lexicon dictionary, {wordform: {{annotation-type: annotation}: ( set(possible tags), set(tuples with following words) )}} - - annotation_element is the XML element for the annotation value, "writtenForm" for baseform, "lemgram" for lemgram - writtenForm is translated to "gf" and lemgram to "lem" (for compatability with Saldo) - - skip_multiword is a flag telling whether to make special entries for multiword expressions. Set this to False only if - the tool used for text annotation cannot handle this at all - """ - # assert annotation_element in ("writtenForm lemgram") "Invalid annotation element" - if verbose: - log.info("Reading XML lexicon") - lexicon = {} - tagmap = util.tagsets.mappings["saldo_to_" + tagset.lower()] - - context = etree.iterparse(xml, events=("start", "end")) # "start" needed to save reference to root element - context = iter(context) - event, root = next(context) - - for event, elem in context: - if event == "end": - if elem.tag == "LexicalEntry": - annotations = HashableDict() - - lem = elem.find("Lemma").find("FormRepresentation") - for a in annotation_elements: - if a == "writtenForm": - key = "gf" - elif a == "lemgram": - key = "lem" - annotations[key] = tuple([findval(lem, a)]) - - pos = findval(lem, "partOfSpeech") - inhs = findval(lem, "inherent") - if inhs == "-": - inhs = "" - inhs = inhs.split() - - # there may be several WordForms - for forms in elem.findall("WordForm"): - word = findval(forms, "writtenForm") - param = findval(forms, "msd") - - multiwords = [] - wordparts = word.split() - for i, word in enumerate(wordparts): - if (not skip_multiword) and len(wordparts) > 1: - - # Handle multi-word expressions - multiwords.append(word) - - # We don't use any particles or mwe:s with gaps since that information is not formally - # expressed in the historical lexicons - particle = False - mwe_gap = False # but keep the fields so that the file format matches the normal saldo-pickle format - - # is it the last word in the multi word expression? 
- if i == len(wordparts) - 1: - lexicon.setdefault(multiwords[0], {}).setdefault(annotations, (set(), set(), mwe_gap, particle))[1].add(tuple(multiwords[1:])) - multiwords = [] - else: - # Single word expressions - particle = False # we don't use any particles or mwe:s with gaps - mwe_gap = False # but keep the fields so that the file format match the normal saldo-pickle format - - if translate_tags: - tags = convert_default(pos, inhs, param, tagmap) - if tags: - lexicon.setdefault(word, {}).setdefault(annotations, (set(), set(), mwe_gap, particle))[0].update(tags) - else: - saldotag = " ".join([pos, param]) # this tag is rather useless, but at least gives some information - tags = tuple([saldotag]) - lexicon.setdefault(word, {}).setdefault(annotations, (set(), set(), mwe_gap, particle))[0].update(tags) - - # Done parsing section. Clear tree to save memory - if elem.tag in ["LexicalEntry", "frame", "resFrame"]: - root.clear() - if verbose: - testwords = ["äplebuske", - "stöpljus", - "katt"] - util.test_lexicon(lexicon, testwords) - log.info("OK, read") - return lexicon - - -################################################################################ -# Auxiliaries -################################################################################ - - -def convert_default(pos, inh, param, tagmap): - saldotag = " ".join(([pos] + inh + [param])) - tags = tagmap.get(saldotag) - if tags: - return tags - tags = try_translate(saldotag) - if tags: - tagmap[saldotag] = tags - return tags - tags = tagmap.get(pos) - if tags: - return tags - tags = [] - for t in list(tagmap.keys()): - if t.split()[0] == pos: - tags.extend(tagmap.get(t)) - return tags - - -def try_translate(params): - """Do some basic translations.""" - params_list = [params] - if " m " in params: - # masculine is translated into utrum - params_list.append(re.sub(" m ", " u ", params)) - if " f " in params: - # feminine is translated into utrum - params_list.append(re.sub(" f ", " u ", params)) - for params in params_list: - params = params.split() - # copied from util.tagsets.tagmappings._make_saldo_to_suc(), try to convert the tag - # but allow m (the match) to be None if the tag still can't be translated - paramstr = " ".join(util.tagsets.mappings["saldo_params_to_suc"].get(prm, prm.upper()) for prm in params) - for (pre, post) in util.tagsets.tagmappings._suc_tag_replacements: - m = re.match(pre, paramstr) - if m: - break - if m is not None: - sucfilter = m.expand(post).replace(" ", r"\.").replace("+", r"\+") - return set(suctag for suctag in util.tagsets.mappings["suc_tags"] if re.match(sucfilter, suctag)) - return [] - - -def findval(elems, key): - """Help function for looking up values in the lmf.""" - def iterfindval(): - for form in elems: - att = form.get("att", "") - if att == key: - yield form.get("val") - yield "" - - return next(iterfindval()) diff --git a/sparv/util/tagsets/tagmappings.py b/sparv/util/tagsets/tagmappings.py deleted file mode 100644 index 0c410255..00000000 --- a/sparv/util/tagsets/tagmappings.py +++ /dev/null @@ -1,1382 +0,0 @@ -"""This module contains translations between Saldo, SUC, Parole and Granska-ish tagsets. 
- -The Parole and SUC tags are described here: - http://spraakbanken.gu.se/parole/tags.phtml - -* Constants: - -TAGSEP = ".": a non-space separator between parts of POS/MSD attributes - -* Functions: - -split_tag: splits a SUC or Saldo tag into a pair (pos/part-of-speech, msd/morphology) -join_tag: joins a SUC or Saldo {pos:.., msd:..} record into a tag - -* Tagsets: - -simple_tags: the pos part of SUC tags -suc_tags: all SUC tags -parole_tags: all Parole tags -granska_tags: all Granska-ish tags -saldo_tags: all Saldo tags - -* Dictionaries with descriptions: - -suc_descriptions: 1-1 mapping between SUC tags and a Swedish description - -* Dictionaries for tag conversion: - -suc_to_simple: manu-1 mapping between SUC (msd) and SUC (pos) - -suc_to_parole: 1-1 mapping between SUC and Parole -parole_to_suc: 1-1 mapping between Parole and SUC - -granska_to_parole: many-1 mapping between Granska-ish and Parole -granska_to_suc: many-1 mapping between Granska-ish and SUC -parole_to_granska: 1-many mapping between Parole and Granska-ish -suc_to_granska: 1-many mapping between SUC and Granska-ish - -saldo_to_suc: 1-many mapping between Saldo and SUC -saldo_to_granska: 1-many mapping between Saldo and Granska-ish -saldo_to_parole: 1-many mapping between Saldo and Parole -saldo_to_saldo: 1-many identity mapping of Saldo tags -""" - -TAGSEP = "." - - -def split_tag(tag, sep=TAGSEP): - """Split a tag 'X.Y.Z' into a tuple ('X', 'Y.Z').""" - pos_msd = tag.split(sep, 1) - if len(pos_msd) == 1: - return pos_msd[0], "" - else: - return pos_msd - - -def join_tag(tag, sep=TAGSEP): - """Join a complex tag into a string. - - The tag can be a dict {'pos':pos, 'msd':msd} or a tuple (pos, msd). - """ - if isinstance(tag, dict): - pos, msd = tag['pos'], tag['msd'] - else: - pos, msd = tag - return pos + sep + msd if msd else pos - - -suc_descriptions = { - 'AB': u'adverb', - 'AB.AN': u'adverb förkortning', - 'AB.KOM': u'adverb komparativ', - 'AB.POS': u'adverb positiv', - 'AB.SMS': u'adverb sammansättningsform', - 'AB.SUV': u'adverb superlativ', - 'MAD': u'meningsskiljande interpunktion', - 'MID': u'interpunktion', - 'PAD': u'interpunktion', - 'DT.AN': u'determinerare förkortning', - 'DT.MAS.SIN.DEF': u'determinerare maskulinum singularis bestämd', - 'DT.MAS.SIN.IND': u'determinerare maskulinum singularis obestämd', - 'DT.NEU.SIN.DEF': u'determinerare neutrum singularis bestämd', - 'DT.NEU.SIN.IND': u'determinerare neutrum singularis obestämd', - 'DT.NEU.SIN.IND+DEF': u'determinerare neutrum singularis obestämd/bestämd', - 'DT.UTR.SIN.DEF': u'determinerare utrum singularis bestämd', - 'DT.UTR.SIN.IND': u'determinerare utrum singularis obestämd', - 'DT.UTR.SIN.IND+DEF': u'determinerare utrum singularis obestämd/bestämd', - 'DT.UTR+NEU.PLU.DEF': u'determinerare utrum/neutrum pluralis bestämd', - 'DT.UTR+NEU.PLU.IND': u'determinerare utrum/neutrum pluralis obestämd', - 'DT.UTR+NEU.PLU.IND+DEF': u'determinerare utrum/neutrum pluralis obestämd/bestämd', - 'DT.UTR+NEU.SIN.DEF': u'determinerare utrum/neutrum singularis bestämd', - 'DT.UTR+NEU.SIN.IND': u'determinerare utrum/neutrum singularis obestämd', - 'DT.UTR+NEU.SIN+PLU.IND': u'determinerare utrum/neutrum singularis/pluralis obestämd', - 'HA': u'frågande/relativt adverb', - 'HD.NEU.SIN.IND': u'frågande/relativ determinerare neutrum singularis obestämd', - 'HD.UTR.SIN.IND': u'frågande/relativ determinerare utrum singularis obestämd', - 'HD.UTR+NEU.PLU.IND': u'frågande/relativ determinerare utrum/neutrum pluralis obestämd', - 'HP.-.-.-': u'frågande/relativt pronomen', - 
'HP.NEU.SIN.IND': u'frågande/relativt pronomen neutrum singularis obestämd', - 'HP.NEU.SIN.IND.SMS': u'frågande/relativt pronomen neutrum singularis obestämd sammansättningsform', - 'HP.UTR.SIN.IND': u'frågande/relativt pronomen utrum singularis obestämd', - 'HP.UTR+NEU.PLU.IND': u'frågande/relativt pronomen utrum/neutrum pluralis obestämd', - 'HS.DEF': u'frågande/relativt possesivt pronomen bestämd', - 'IE': u'infinitivmärke', - 'IN': u'interjektion', - 'JJ.AN': u'adjektiv förkortning', - 'JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.GEN': u'adjektiv komparativ utrum/neutrum singularis/pluralis obestämd/bestämd genitiv', - 'JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.NOM': u'adjektiv komparativ utrum/neutrum singularis/pluralis obestämd/bestämd nominativ', - 'JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.SMS': u'adjektiv komparativ utrum/neutrum singularis/pluralis obestämd/bestämd sammansättningsform', - 'JJ.POS.MAS.SIN.DEF.GEN': u'adjektiv positiv maskulinum singularis bestämd genitiv', - 'JJ.POS.MAS.SIN.DEF.NOM': u'adjektiv positiv maskulinum singularis bestämd nominativ', - 'JJ.POS.NEU.SIN.IND.GEN': u'adjektiv positiv neutrum singularis obestämd genitiv', - 'JJ.POS.NEU.SIN.IND.NOM': u'adjektiv positiv neutrum singularis obestämd nominativ', - 'JJ.POS.NEU.SIN.IND+DEF.NOM': u'adjektiv positiv neutrum singularis obestämd/bestämd nominativ', - 'JJ.POS.UTR.-.-.SMS': u'adjektiv positiv utrum sammansättningsform', - 'JJ.POS.UTR.SIN.IND.GEN': u'adjektiv positiv utrum singularis obestämd genitiv', - 'JJ.POS.UTR.SIN.IND.NOM': u'adjektiv positiv utrum singularis obestämd nominativ', - 'JJ.POS.UTR.SIN.IND+DEF.NOM': u'adjektiv positiv utrum singularis obestämd/bestämd nominativ', - 'JJ.POS.UTR+NEU.-.-.SMS': u'adjektiv positiv utrum/neutrum sammansättningsform', - 'JJ.POS.UTR+NEU.PLU.IND.NOM': u'adjektiv positiv utrum/neutrum pluralis obestämd nominativ', - 'JJ.POS.UTR+NEU.PLU.IND+DEF.GEN': u'adjektiv positiv utrum/neutrum pluralis obestämd/bestämd genitiv', - 'JJ.POS.UTR+NEU.PLU.IND+DEF.NOM': u'adjektiv positiv utrum/neutrum pluralis obestämd/bestämd nominativ', - 'JJ.POS.UTR+NEU.SIN.DEF.GEN': u'adjektiv positiv utrum/neutrum singularis bestämd genitiv', - 'JJ.POS.UTR+NEU.SIN.DEF.NOM': u'adjektiv positiv utrum/neutrum singularis bestämd nominativ', - 'JJ.POS.UTR+NEU.SIN+PLU.IND.NOM': u'adjektiv positiv utrum/neutrum singularis/pluralis obestämd nominativ', - 'JJ.POS.UTR+NEU.SIN+PLU.IND+DEF.NOM': u'adjektiv positiv utrum/neutrum singularis/pluralis obestämd/bestämd nominativ', - 'JJ.SUV.MAS.SIN.DEF.GEN': u'adjektiv superlativ maskulinum singularis bestämd genitiv', - 'JJ.SUV.MAS.SIN.DEF.NOM': u'adjektiv superlativ maskulinum singularis bestämd nominativ', - 'JJ.SUV.UTR+NEU.PLU.DEF.NOM': u'adjektiv superlativ utrum/neutrum pluralis bestämd nominativ', - 'JJ.SUV.UTR+NEU.PLU.IND.NOM': u'adjektiv superlativ utrum/neutrum pluralis obestämd nominativ', - 'JJ.SUV.UTR+NEU.SIN+PLU.DEF.NOM': u'adjektiv superlativ utrum/neutrum singularis/pluralis bestämd nominativ', - 'JJ.SUV.UTR+NEU.SIN+PLU.IND.NOM': u'adjektiv superlativ utrum/neutrum singularis/pluralis obestämd nominativ', - 'KN': u'konjunktion', - 'KN.AN': u'konjunktion förkortning', - 'NN.-.-.-.-': u'substantiv', - 'NN.-.-.-.SMS': u'substantiv sammansättningsform', - 'NN.AN': u'substantiv förkortning', - 'NN.NEU.-.-.-': u'substantiv neutrum', - 'NN.NEU.-.-.SMS': u'substantiv neutrum sammansättningsform', - 'NN.NEU.PLU.DEF.GEN': u'substantiv neutrum pluralis bestämd genitiv', - 'NN.NEU.PLU.DEF.NOM': u'substantiv neutrum pluralis bestämd nominativ', - 'NN.NEU.PLU.IND.GEN': u'substantiv neutrum 
pluralis obestämd genitiv', - 'NN.NEU.PLU.IND.NOM': u'substantiv neutrum pluralis obestämd nominativ', - 'NN.NEU.SIN.DEF.GEN': u'substantiv neutrum singularis bestämd genitiv', - 'NN.NEU.SIN.DEF.NOM': u'substantiv neutrum singularis bestämd nominativ', - 'NN.NEU.SIN.IND.GEN': u'substantiv neutrum singularis obestämd genitiv', - 'NN.NEU.SIN.IND.NOM': u'substantiv neutrum singularis obestämd nominativ', - 'NN.UTR.-.-.-': u'substantiv utrum', - 'NN.UTR.-.-.SMS': u'substantiv utrum sammansättningsform', - 'NN.UTR.PLU.DEF.GEN': u'substantiv utrum pluralis bestämd genitiv', - 'NN.UTR.PLU.DEF.NOM': u'substantiv utrum pluralis bestämd nominativ', - 'NN.UTR.PLU.IND.GEN': u'substantiv utrum pluralis obestämd genitiv', - 'NN.UTR.PLU.IND.NOM': u'substantiv utrum pluralis obestämd nominativ', - 'NN.UTR.SIN.DEF.GEN': u'substantiv utrum singularis bestämd genitiv', - 'NN.UTR.SIN.DEF.NOM': u'substantiv utrum singularis bestämd nominativ', - 'NN.UTR.SIN.IND.GEN': u'substantiv utrum singularis obestämd genitiv', - 'NN.UTR.SIN.IND.NOM': u'substantiv utrum singularis obestämd nominativ', - 'PC.AN': u'particip förkortning', - 'PC.PRF.MAS.SIN.DEF.GEN': u'particip perfekt maskulinum singularis bestämd genitiv', - 'PC.PRF.MAS.SIN.DEF.NOM': u'particip perfekt maskulinum singularis bestämd nominativ', - 'PC.PRF.NEU.SIN.IND.NOM': u'particip perfekt neutrum singularis obestämd nominativ', - 'PC.PRF.UTR.SIN.IND.GEN': u'particip perfekt utrum singularis obestämd genitiv', - 'PC.PRF.UTR.SIN.IND.NOM': u'particip perfekt utrum singularis obestämd nominativ', - 'PC.PRF.UTR+NEU.PLU.IND+DEF.GEN': u'particip perfekt utrum/neutrum pluralis obestämd/bestämd genitiv', - 'PC.PRF.UTR+NEU.PLU.IND+DEF.NOM': u'particip perfekt utrum/neutrum pluralis obestämd/bestämd nominativ', - 'PC.PRF.UTR+NEU.SIN.DEF.GEN': u'particip perfekt utrum/neutrum singularis bestämd genitiv', - 'PC.PRF.UTR+NEU.SIN.DEF.NOM': u'particip perfekt utrum/neutrum singularis bestämd nominativ', - 'PC.PRS.UTR+NEU.SIN+PLU.IND+DEF.GEN': u'particip presens utrum/neutrum singularis/pluralis obestämd/bestämd genitiv', - 'PC.PRS.UTR+NEU.SIN+PLU.IND+DEF.NOM': u'particip presens utrum/neutrum singularis/pluralis obestämd/bestämd nominativ', - 'PL': u'partikel', - 'PL.SMS': u'partikel sammansättningsform', - 'PM.GEN': u'egennamn genitiv', - 'PM.NOM': u'egennamn nominativ', - 'PM.SMS': u'egennamn sammansättningsform', - 'PN.MAS.SIN.DEF.SUB+OBJ': u'pronomen maskulinum singularis bestämd subjektsform/objektsform', - 'PN.NEU.SIN.DEF.SUB+OBJ': u'pronomen neutrum singularis bestämd subjektsform/objektsform', - 'PN.NEU.SIN.IND.SUB+OBJ': u'pronomen neutrum singularis obestämd subjektsform/objektsform', - 'PN.UTR.PLU.DEF.OBJ': u'pronomen utrum pluralis bestämd objektsform', - 'PN.UTR.PLU.DEF.SUB': u'pronomen utrum pluralis bestämd subjektsform', - 'PN.UTR.SIN.DEF.OBJ': u'pronomen utrum singularis bestämd objektsform', - 'PN.UTR.SIN.DEF.SUB': u'pronomen utrum singularis bestämd subjektsform', - 'PN.UTR.SIN.DEF.SUB+OBJ': u'pronomen utrum singularis bestämd subjektsform/objektsform', - 'PN.UTR.SIN.IND.SUB': u'pronomen utrum singularis obestämd subjektsform', - 'PN.UTR.SIN.IND.SUB+OBJ': u'pronomen utrum singularis obestämd subjektsform/objektsform', - 'PN.UTR+NEU.PLU.DEF.OBJ': u'pronomen utrum/neutrum pluralis bestämd objektsform', - 'PN.UTR+NEU.PLU.DEF.SUB': u'pronomen utrum/neutrum pluralis bestämd subjektsform', - 'PN.UTR+NEU.PLU.DEF.SUB+OBJ': u'pronomen utrum/neutrum pluralis bestämd subjektsform/objektsform', - 'PN.UTR+NEU.PLU.IND.SUB+OBJ': u'pronomen utrum/neutrum pluralis 
obestämd subjektsform/objektsform', - 'PN.UTR+NEU.SIN+PLU.DEF.OBJ': u'pronomen utrum/neutrum singularis/pluralis bestämd objektsform', - 'PP': u'preposition', - 'PP.AN': u'preposition förkortning', - 'PP.SMS': u'preposition sammansättningsforms', - 'PS.AN': u'possesivt pronomen förkortning', - 'PS.NEU.SIN.DEF': u'possesivt pronomen neutrum singularis bestämd', - 'PS.UTR.SIN.DEF': u'possesivt pronomen utrum singularis bestämd', - 'PS.UTR+NEU.PLU.DEF': u'possesivt pronomen utrum/neutrum pluralis bestämd', - 'PS.UTR+NEU.SIN+PLU.DEF': u'possesivt pronomen utrum/neutrum singularis/pluralis bestämd', - 'RG.GEN': u'grundtal genitiv', - 'RG.MAS.SIN.DEF.NOM': u'grundtal singularis bestämd nominativ', - 'RG.NEU.SIN.IND.NOM': u'grundtal neutrum singularis obestämd nominativ', - 'RG.NOM': u'grundtal nominativ', - 'RG.SMS': u'grundtal sammansättningsform', - 'RG.UTR.SIN.IND.NOM': u'grundtal utrum singularis obestämd nominativ', - 'RG.UTR+NEU.SIN.DEF.NOM': u'grundtal utrum/neutrum singularis bestämd nominativ', - 'RO.GEN': u'ordningstal genitiv', - 'RO.MAS.SIN.IND+DEF.GEN': u'ordningstal maskulinum singularis obestämd/bestämd genitiv', - 'RO.MAS.SIN.IND+DEF.NOM': u'ordningstal maskulinum singularis obestämd/bestämd nominativ', - 'RO.NOM': u'ordningstal nominativ', - 'RO.UTR+NEU.SIN+PLU.IND+DEF.SMS': u'ordningstal utrum/neutrum singularis/pluralis obestämd/bestämd sammansättningsform', - 'SN': u'subjunktion', - 'UO': u'utländskt ord', - 'VB.AN': u'verb förkortning', - 'VB.IMP.AKT': u'verb imperativ aktiv', - 'VB.IMP.SFO': u'verb imperativ s-form', - 'VB.INF.AKT': u'verb infinitiv aktiv', - 'VB.INF.SFO': u'verb infinitiv s-form', - 'VB.KON.PRS.AKT': u'verb konjunktiv presens aktiv', - 'VB.KON.PRT.AKT': u'verb konjunktiv preteritum aktiv', - 'VB.KON.PRT.SFO': u'verb konjunktiv preteritum s-form', - 'VB.PRS.AKT': u'verb presens aktiv', - 'VB.PRS.SFO': u'verb presens s-form', - 'VB.PRT.AKT': u'verb preteritum aktiv', - 'VB.PRT.SFO': u'verb preteritum s-form', - 'VB.SMS': u'verb sammansättningsform', - 'VB.SUP.AKT': u'verb supinum aktiv', - 'VB.SUP.SFO': u'verb supinum s-form', -} - - -# This is automatically created from Saldo by saldo.saldo_model.extract_tags() -saldo_tags = set("""\ -ab c -ab invar -ab komp -ab pos -ab sms -ab super -aba invar -abh c -abh invar -abh sms -abm invar -al pl def -al pl indef -al sg n def -al sg n indef -al sg u def -al sg u indef -av c -av invar -av komp gen -av komp nom -av pos def pl gen -av pos def pl nom -av pos def sg masc gen -av pos def sg masc nom -av pos def sg no_masc gen -av pos def sg no_masc nom -av pos indef pl gen -av pos indef pl nom -av pos indef sg n gen -av pos indef sg n nom -av pos indef sg u gen -av pos indef sg u nom -av sms -av super def masc gen -av super def masc nom -av super def no_masc gen -av super def no_masc nom -av super indef gen -av super indef nom -ava c -ava invar -ava sms -avh c -avh komp gen -avh komp nom -avh pos def pl gen -avh pos def pl nom -avh pos def sg masc gen -avh pos def sg masc nom -avh pos def sg no_masc gen -avh pos def sg no_masc nom -avh pos indef pl gen -avh pos indef pl nom -avh pos indef sg n gen -avh pos indef sg n nom -avh pos indef sg u gen -avh pos indef sg u nom -avh sms -avh super def masc gen -avh super def masc nom -avh super def no_masc gen -avh super def no_masc nom -avh super indef gen -avh super indef nom -avm c -avm invar -avm komp nom -avm pos def pl gen -avm pos def pl nom -avm pos def sg masc gen -avm pos def sg masc nom -avm pos def sg no_masc gen -avm pos def sg no_masc nom -avm pos indef pl gen -avm 
pos indef pl nom -avm pos indef sg n gen -avm pos indef sg n nom -avm pos indef sg u gen -avm pos indef sg u nom -avm sms -avm super def masc nom -avm super def no_masc nom -avm super indef nom -in invar -inm invar -kn invar -kna c -kna invar -kna sms -mxc c -mxc sms -nl c -nl gen num n -nl gen num u -nl gen ord masc -nl gen ord no_masc -nl nom num n -nl nom num u -nl nom ord masc -nl nom ord no_masc -nlm c -nlm invar -nlm sms -nn n ci -nn n cm -nn n pl def gen -nn n pl def nom -nn n pl indef gen -nn n pl indef nom -nn n sg def gen -nn n sg def nom -nn n sg indef gen -nn n sg indef nom -nn n sms -nn p ci -nn p cm -nn p pl def gen -nn p pl def nom -nn p pl indef gen -nn p pl indef nom -nn p sms -nn u ci -nn u cm -nn u pl def gen -nn u pl def nom -nn u pl indef gen -nn u pl indef nom -nn u sg def gen -nn u sg def nom -nn u sg indef gen -nn u sg indef nom -nn u sms -nn v ci -nn v cm -nn v pl def gen -nn v pl def nom -nn v pl indef gen -nn v pl indef nom -nn v sg def gen -nn v sg def nom -nn v sg indef gen -nn v sg indef nom -nn v sms -nna n ci -nna n cm -nna n pl def gen -nna n pl def nom -nna n pl indef gen -nna n pl indef nom -nna n sg def gen -nna n sg def nom -nna n sg indef gen -nna n sg indef nom -nna n sms -nna u ci -nna u cm -nna u pl def gen -nna u pl def nom -nna u pl indef gen -nna u pl indef nom -nna u sg def gen -nna u sg def nom -nna u sg indef gen -nna u sg indef nom -nna u sms -nna v ci -nna v cm -nna v pl def gen -nna v pl def nom -nna v pl indef gen -nna v pl indef nom -nna v sg def gen -nna v sg def nom -nna v sg indef gen -nna v sg indef nom -nna v sms -nnh n sg def gen -nnh n sg def nom -nnh u ci -nnh u cm -nnh u pl def gen -nnh u pl def nom -nnh u pl indef gen -nnh u pl indef nom -nnh u sg def gen -nnh u sg def nom -nnh u sg indef gen -nnh u sg indef nom -nnh u sms -nnm n ci -nnm n cm -nnm n pl def gen -nnm n pl def nom -nnm n pl indef gen -nnm n pl indef nom -nnm n sg def gen -nnm n sg def nom -nnm n sg indef gen -nnm n sg indef nom -nnm n sms -nnm p pl def gen -nnm p pl def nom -nnm p pl indef gen -nnm p pl indef nom -nnm u ci -nnm u cm -nnm u pl def gen -nnm u pl def nom -nnm u pl indef gen -nnm u pl indef nom -nnm u sg def gen -nnm u sg def nom -nnm u sg indef gen -nnm u sg indef nom -nnm u sms -nnm v ci -nnm v cm -nnm v pl def gen -nnm v pl def nom -nnm v pl indef gen -nnm v pl indef nom -nnm v sg def gen -nnm v sg def nom -nnm v sg indef gen -nnm v sg indef nom -nnm v sms -pm f ph ci -pm f ph cm -pm f ph gen -pm f ph nom -pm f ph pl def gen -pm f ph pl def nom -pm f ph pl indef gen -pm f ph pl indef nom -pm f ph sg def gen -pm f ph sg def nom -pm f ph sg indef gen -pm f ph sg indef nom -pm f ph sms -pm f pm ci -pm f pm cm -pm f pm gen -pm f pm nom -pm f pm pl def gen -pm f pm pl def nom -pm f pm pl indef gen -pm f pm pl indef nom -pm f pm sg def gen -pm f pm sg def nom -pm f pm sg indef gen -pm f pm sg indef nom -pm f pm sms -pm h ph ci -pm h ph cm -pm h ph gen -pm h ph nom -pm h ph pl def gen -pm h ph pl def nom -pm h ph pl indef gen -pm h ph pl indef nom -pm h ph sg def gen -pm h ph sg def nom -pm h ph sg indef gen -pm h ph sg indef nom -pm h ph sms -pm m ph ci -pm m ph cm -pm m ph gen -pm m ph nom -pm m ph pl def gen -pm m ph pl def nom -pm m ph pl indef gen -pm m ph pl indef nom -pm m ph sg def gen -pm m ph sg def nom -pm m ph sg indef gen -pm m ph sg indef nom -pm m ph sms -pm m pm gen -pm m pm nom -pm n aa gen -pm n aa nom -pm n ac gen -pm n ac nom -pm n ap gen -pm n ap nom -pm n aw gen -pm n aw nom -pm n es gen -pm n es nom -pm n la gen -pm n la nom -pm n lf 
gen -pm n lf nom -pm n lg gen -pm n lg nom -pm n lp gen -pm n lp nom -pm n oa gen -pm n oa nom -pm n oc gen -pm n oc nom -pm n oe gen -pm n oe nom -pm n og gen -pm n og nom -pm n op gen -pm n op nom -pm n os gen -pm n os nom -pm n wm gen -pm n wm nom -pm n wp gen -pm n wp nom -pm p lg gen -pm p lg nom -pm p oc gen -pm p oc nom -pm u aa gen -pm u aa nom -pm u ae gen -pm u ae nom -pm u ag gen -pm u ag nom -pm u ap gen -pm u ap nom -pm u eh gen -pm u eh nom -pm u la gen -pm u la nom -pm u lf gen -pm u lf nom -pm u lg gen -pm u lg nom -pm u ls gen -pm u ls nom -pm u oc gen -pm u oc nom -pm u oe gen -pm u oe nom -pm u og gen -pm u og nom -pm u op gen -pm u op nom -pm u pa gen -pm u pa nom -pm u pc gen -pm u pc nom -pm u pm gen -pm u pm nom -pm u tz gen -pm u tz nom -pm u wa gen -pm u wa nom -pm u wb gen -pm u wb nom -pm u wc gen -pm u wc nom -pm u wn gen -pm u wn nom -pm v lf gen -pm v lf nom -pm v lg gen -pm v lg nom -pma h ph gen -pma h ph nom -pma n aa gen -pma n aa nom -pma n af gen -pma n af nom -pma n am gen -pma n am nom -pma n lp gen -pma n lp nom -pma n oa gen -pma n oa nom -pma n oe gen -pma n oe nom -pma n og gen -pma n og nom -pma n om gen -pma n om nom -pma n op gen -pma n op nom -pma n os gen -pma n os nom -pma n tm gen -pma n tm nom -pma n wb gen -pma n wb nom -pma u wn gen -pma u wn nom -pma w oc gen -pma w oc nom -pma w ph gen -pma w ph nom -pma w pm gen -pma w pm nom -pmm f ph gen -pmm f ph nom -pmm f pm gen -pmm f pm nom -pmm h ph gen -pmm h ph nom -pmm m pa gen -pmm m pa nom -pmm m ph gen -pmm m ph nom -pmm m pm gen -pmm m pm nom -pmm n eh gen -pmm n eh nom -pmm n lf gen -pmm n lf nom -pmm n lg gen -pmm n lg nom -pmm n lp gen -pmm n lp nom -pmm n oc gen -pmm n oc nom -pmm n oe gen -pmm n oe nom -pmm n og gen -pmm n og nom -pmm n op gen -pmm n op nom -pmm n wm gen -pmm n wm nom -pmm n wn gen -pmm n wn nom -pmm p ph gen -pmm p ph nom -pmm p pm gen -pmm p pm nom -pmm u aa gen -pmm u aa nom -pmm u ag gen -pmm u ag nom -pmm u aw gen -pmm u aw nom -pmm u ec gen -pmm u ec nom -pmm u eh gen -pmm u eh nom -pmm u en gen -pmm u en nom -pmm u er gen -pmm u er nom -pmm u es gen -pmm u es nom -pmm u la gen -pmm u la nom -pmm u lg gen -pmm u lg nom -pmm u ls gen -pmm u ls nom -pmm u oe gen -pmm u oe nom -pmm u og gen -pmm u og nom -pmm u op gen -pmm u op nom -pmm u tb gen -pmm u tb nom -pmm u tm gen -pmm u tm nom -pmm u wb gen -pmm u wb nom -pmm u wc gen -pmm u wc nom -pmm u wn gen -pmm u wn nom -pmm v lf gen -pmm v lf nom -pn ack -pn c -pn invar -pn komp gen -pn komp nom -pn nom -pn p1 pl ack -pn p1 pl nom -pn p1 pl poss pl -pn p1 pl poss sg n -pn p1 pl poss sg u -pn p1 sg ack -pn p1 sg nom -pn p1 sg poss pl -pn p1 sg poss sg n -pn p1 sg poss sg u -pn p2 pl ack -pn p2 pl nom -pn p2 pl poss pl -pn p2 pl poss sg n -pn p2 pl poss sg u -pn p2 sg ack -pn p2 sg nom -pn p2 sg poss pl -pn p2 sg poss sg n -pn p2 sg poss sg u -pn p3 pl ack -pn p3 pl nom -pn p3 pl poss pl -pn p3 pl poss sg n -pn p3 pl poss sg u -pn p3 sg ack -pn p3 sg nom -pn p3 sg poss pl -pn p3 sg poss sg n -pn p3 sg poss sg u -pn pl gen -pn pl nom -pn pos def pl gen -pn pos def pl nom -pn pos def sg masc gen -pn pos def sg masc nom -pn pos def sg no_masc gen -pn pos def sg no_masc nom -pn pos indef pl gen -pn pos indef pl nom -pn pos indef sg n gen -pn pos indef sg n nom -pn pos indef sg u gen -pn pos indef sg u nom -pn poss pl -pn poss sg n -pn poss sg u -pn sg n gen -pn sg n nom -pn sg u gen -pn sg u nom -pn sms -pn super def masc gen -pn super def masc nom -pn super def no_masc gen -pn super def no_masc nom -pn super indef gen 
-pn super indef nom -pnm gen -pnm invar -pnm nom -pp invar -ppa c -ppa invar -ppa sms -ppm c -ppm invar -ppm sms -sn invar -snm c -snm invar -snm sms -ssm c -ssm invar -ssm sms -sxc c -sxc sms -vb c -vb imper -vb inf aktiv -vb inf s-form -vb pres ind aktiv -vb pres ind s-form -vb pres konj aktiv -vb pres konj s-form -vb pres_part gen -vb pres_part nom -vb pret ind aktiv -vb pret ind s-form -vb pret konj aktiv -vb pret konj s-form -vb pret_part def pl gen -vb pret_part def pl nom -vb pret_part def sg masc gen -vb pret_part def sg masc nom -vb pret_part def sg no_masc gen -vb pret_part def sg no_masc nom -vb pret_part indef pl gen -vb pret_part indef pl nom -vb pret_part indef sg n gen -vb pret_part indef sg n nom -vb pret_part indef sg u gen -vb pret_part indef sg u nom -vb sms -vb sup aktiv -vb sup s-form -vba c -vba invar -vba sms -vbm imper -vbm inf aktiv -vbm inf s-form -vbm pres ind aktiv -vbm pres ind s-form -vbm pres konj aktiv -vbm pres konj s-form -vbm pres_part gen -vbm pres_part nom -vbm pret ind aktiv -vbm pret ind s-form -vbm pret konj aktiv -vbm pret konj s-form -vbm pret_part def pl gen -vbm pret_part def pl nom -vbm pret_part def sg masc gen -vbm pret_part def sg masc nom -vbm pret_part def sg no_masc gen -vbm pret_part def sg no_masc nom -vbm pret_part indef pl gen -vbm pret_part indef pl nom -vbm pret_part indef sg n gen -vbm pret_part indef sg n nom -vbm pret_part indef sg u gen -vbm pret_part indef sg u nom -vbm sup aktiv -vbm sup s-form -""".splitlines()) - - -suc_to_parole = { - 'AB': 'RG0S', - 'AB.AN': 'RG0A', - 'AB.KOM': 'RGCS', - 'AB.POS': 'RGPS', - 'AB.SMS': 'RG0C', - 'AB.SUV': 'RGSS', - 'MAD': 'FE', - 'MID': 'FI', - 'PAD': 'FP', - 'DT.AN': 'D0@00@A', - 'DT.MAS.SIN.DEF': 'DF@MS@S', - 'DT.MAS.SIN.IND': 'DI@MS@S', - 'DT.NEU.SIN.DEF': 'DF@NS@S', - 'DT.NEU.SIN.IND': 'DI@NS@S', - 'DT.NEU.SIN.IND+DEF': 'D0@NS@S', - 'DT.UTR.SIN.DEF': 'DF@US@S', - 'DT.UTR.SIN.IND': 'DI@US@S', - 'DT.UTR.SIN.IND+DEF': 'D0@US@S', - 'DT.UTR+NEU.PLU.DEF': 'DF@0P@S', - 'DT.UTR+NEU.PLU.IND': 'DI@0P@S', - 'DT.UTR+NEU.PLU.IND+DEF': 'D0@0P@S', - 'DT.UTR+NEU.SIN.DEF': 'DF@0S@S', - 'DT.UTR+NEU.SIN.IND': 'DI@0S@S', - 'DT.UTR+NEU.SIN+PLU.IND': 'DI@00@S', - 'HA': 'RH0S', - 'HD.NEU.SIN.IND': 'DH@NS@S', - 'HD.UTR.SIN.IND': 'DH@US@S', - 'HD.UTR+NEU.PLU.IND': 'DH@0P@S', - 'HP.-.-.-': 'PH@000@S', - 'HP.NEU.SIN.IND': 'PH@NS0@S', - 'HP.NEU.SIN.IND.SMS': 'PH@NS0@C', - 'HP.UTR.SIN.IND': 'PH@US0@S', - 'HP.UTR+NEU.PLU.IND': 'PH@0P0@S', - 'HS.DEF': 'PE@000@S', - 'IE': 'CIS', - 'IN': 'I', - 'JJ.AN': 'AQ00000A', - 'JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.GEN': 'AQC00G0S', - 'JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.NOM': 'AQC00N0S', - 'JJ.KOM.UTR+NEU.SIN+PLU.IND+DEF.SMS': 'AQC0000C', - 'JJ.POS.MAS.SIN.DEF.GEN': 'AQPMSGDS', - 'JJ.POS.MAS.SIN.DEF.NOM': 'AQPMSNDS', - 'JJ.POS.NEU.SIN.IND.GEN': 'AQPNSGIS', - 'JJ.POS.NEU.SIN.IND.NOM': 'AQPNSNIS', - 'JJ.POS.NEU.SIN.IND+DEF.NOM': 'AQPNSN0S', - 'JJ.POS.UTR.-.-.SMS': 'AQPU000C', - 'JJ.POS.UTR.SIN.IND.GEN': 'AQPUSGIS', - 'JJ.POS.UTR.SIN.IND.NOM': 'AQPUSNIS', - 'JJ.POS.UTR.SIN.IND+DEF.NOM': 'AQPUSN0S', - 'JJ.POS.UTR+NEU.-.-.SMS': 'AQP0000C', - 'JJ.POS.UTR+NEU.PLU.IND.NOM': 'AQP0PNIS', - 'JJ.POS.UTR+NEU.PLU.IND+DEF.GEN': 'AQP0PG0S', - 'JJ.POS.UTR+NEU.PLU.IND+DEF.NOM': 'AQP0PN0S', - 'JJ.POS.UTR+NEU.SIN.DEF.GEN': 'AQP0SGDS', - 'JJ.POS.UTR+NEU.SIN.DEF.NOM': 'AQP0SNDS', - 'JJ.POS.UTR+NEU.SIN+PLU.IND.NOM': 'AQP00NIS', - 'JJ.POS.UTR+NEU.SIN+PLU.IND+DEF.NOM': 'AQP00N0S', - 'JJ.SUV.MAS.SIN.DEF.GEN': 'AQSMSGDS', - 'JJ.SUV.MAS.SIN.DEF.NOM': 'AQSMSNDS', - 'JJ.SUV.UTR+NEU.PLU.DEF.NOM': 'AQS0PNDS', - 
'JJ.SUV.UTR+NEU.PLU.IND.NOM': 'AQS0PNIS', - 'JJ.SUV.UTR+NEU.SIN+PLU.DEF.NOM': 'AQS00NDS', - 'JJ.SUV.UTR+NEU.SIN+PLU.IND.NOM': 'AQS00NIS', - 'KN': 'CCS', - 'KN.AN': 'CCA', - 'NN.-.-.-.-': 'NC000@0S', - 'NN.-.-.-.SMS': 'NC000@0C', - 'NN.AN': 'NC000@0A', - 'NN.NEU.-.-.-': 'NCN00@0S', - 'NN.NEU.-.-.SMS': 'NCN00@0C', - 'NN.NEU.PLU.DEF.GEN': 'NCNPG@DS', - 'NN.NEU.PLU.DEF.NOM': 'NCNPN@DS', - 'NN.NEU.PLU.IND.GEN': 'NCNPG@IS', - 'NN.NEU.PLU.IND.NOM': 'NCNPN@IS', - 'NN.NEU.SIN.DEF.GEN': 'NCNSG@DS', - 'NN.NEU.SIN.DEF.NOM': 'NCNSN@DS', - 'NN.NEU.SIN.IND.GEN': 'NCNSG@IS', - 'NN.NEU.SIN.IND.NOM': 'NCNSN@IS', - 'NN.UTR.-.-.-': 'NCU00@0S', - 'NN.UTR.-.-.SMS': 'NCU00@0C', - 'NN.UTR.PLU.DEF.GEN': 'NCUPG@DS', - 'NN.UTR.PLU.DEF.NOM': 'NCUPN@DS', - 'NN.UTR.PLU.IND.GEN': 'NCUPG@IS', - 'NN.UTR.PLU.IND.NOM': 'NCUPN@IS', - 'NN.UTR.SIN.DEF.GEN': 'NCUSG@DS', - 'NN.UTR.SIN.DEF.NOM': 'NCUSN@DS', - 'NN.UTR.SIN.IND.GEN': 'NCUSG@IS', - 'NN.UTR.SIN.IND.NOM': 'NCUSN@IS', - 'PC.AN': 'AF00000A', - 'PC.PRF.MAS.SIN.DEF.GEN': 'AF0MSGDS', - 'PC.PRF.MAS.SIN.DEF.NOM': 'AF0MSNDS', - 'PC.PRF.NEU.SIN.IND.NOM': 'AF0NSNIS', - 'PC.PRF.UTR.SIN.IND.GEN': 'AF0USGIS', - 'PC.PRF.UTR.SIN.IND.NOM': 'AF0USNIS', - 'PC.PRF.UTR+NEU.PLU.IND+DEF.GEN': 'AF00PG0S', - 'PC.PRF.UTR+NEU.PLU.IND+DEF.NOM': 'AF00PN0S', - 'PC.PRF.UTR+NEU.SIN.DEF.GEN': 'AF00SGDS', - 'PC.PRF.UTR+NEU.SIN.DEF.NOM': 'AF00SNDS', - 'PC.PRS.UTR+NEU.SIN+PLU.IND+DEF.GEN': 'AP000G0S', - 'PC.PRS.UTR+NEU.SIN+PLU.IND+DEF.NOM': 'AP000N0S', - 'PL': 'QS', - 'PL.SMS': 'QC', - 'PM.GEN': 'NP00G@0S', - 'PM.NOM': 'NP00N@0S', - 'PM.SMS': 'NP000@0C', - 'PN.MAS.SIN.DEF.SUB+OBJ': 'PF@MS0@S', - 'PN.NEU.SIN.DEF.SUB+OBJ': 'PF@NS0@S', - 'PN.NEU.SIN.IND.SUB+OBJ': 'PI@NS0@S', - 'PN.UTR.PLU.DEF.OBJ': 'PF@UPO@S', - 'PN.UTR.PLU.DEF.SUB': 'PF@UPS@S', - 'PN.UTR.SIN.DEF.OBJ': 'PF@USO@S', - 'PN.UTR.SIN.DEF.SUB': 'PF@USS@S', - 'PN.UTR.SIN.DEF.SUB+OBJ': 'PF@US0@S', - 'PN.UTR.SIN.IND.SUB': 'PI@USS@S', - 'PN.UTR.SIN.IND.SUB+OBJ': 'PI@US0@S', - 'PN.UTR+NEU.PLU.DEF.OBJ': 'PF@0PO@S', - 'PN.UTR+NEU.PLU.DEF.SUB': 'PF@0PS@S', - 'PN.UTR+NEU.PLU.DEF.SUB+OBJ': 'PF@0P0@S', - 'PN.UTR+NEU.PLU.IND.SUB+OBJ': 'PI@0P0@S', - 'PN.UTR+NEU.SIN+PLU.DEF.OBJ': 'PF@00O@S', - 'PP': 'SPS', - 'PP.AN': 'SPA', - 'PP.SMS': 'SPC', - 'PS.AN': 'PS@000@A', - 'PS.NEU.SIN.DEF': 'PS@NS0@S', - 'PS.UTR.SIN.DEF': 'PS@US0@S', - 'PS.UTR+NEU.PLU.DEF': 'PS@0P0@S', - 'PS.UTR+NEU.SIN+PLU.DEF': 'PS@000@S', - 'RG.GEN': 'MC00G0S', - 'RG.MAS.SIN.DEF.NOM': 'MCMSNDS', - 'RG.NEU.SIN.IND.NOM': 'MCNSNIS', - 'RG.NOM': 'MC00N0S', - 'RG.SMS': 'MC0000C', - 'RG.UTR.SIN.IND.NOM': 'MCUSNIS', - 'RG.UTR+NEU.SIN.DEF.NOM': 'MC0SNDS', - 'RO.GEN': 'MO00G0S', - 'RO.MAS.SIN.IND+DEF.GEN': 'MOMSG0S', - 'RO.MAS.SIN.IND+DEF.NOM': 'MOMSN0S', - 'RO.NOM': 'MO00N0S', - 'RO.UTR+NEU.SIN+PLU.IND+DEF.SMS': 'MO0000C', - 'SN': 'CSS', - 'UO': 'XF', - 'VB.AN': 'V@000A', - 'VB.IMP.AKT': 'V@M0AS', - 'VB.IMP.SFO': 'V@M0SS', - 'VB.INF.AKT': 'V@N0AS', - 'VB.INF.SFO': 'V@N0SS', - 'VB.KON.PRS.AKT': 'V@SPAS', - 'VB.KON.PRT.AKT': 'V@SIAS', - 'VB.KON.PRT.SFO': 'V@SISS', - 'VB.PRS.AKT': 'V@IPAS', - 'VB.PRS.SFO': 'V@IPSS', - 'VB.PRT.AKT': 'V@IIAS', - 'VB.PRT.SFO': 'V@IISS', - 'VB.SMS': 'V@000C', - 'VB.SUP.AKT': 'V@IUAS', - 'VB.SUP.SFO': 'V@IUSS', -} - - -# This mapping, courtesy of Eva Forsbom -granska_to_parole = { - 'pc.an': 'AF00000A', - 'pc.prf.utr+neu.plu.ind+def.gen': 'AF00PG0S', - 'pc.prf.utr+neu.plu.ind+def.nom': 'AF00PN0S', - 'pc.prf.utr+neu.sin.def.gen': 'AF00SGDS', - 'pc.prf.utr+neu.sin.def.nom': 'AF00SNDS', - 'pc.prf.mas.sin.def.gen': 'AF0MSGDS', - 'pc.prf.mas.sin.def.nom': 'AF0MSNDS', - 
'pc.prf.neu.sin.ind.nom': 'AF0NSNIS', - 'pc.prf.utr.sin.ind.gen': 'AF0USGIS', - 'pc.prf.utr.sin.ind.nom': 'AF0USNIS', - 'pc.prs.utr+neu.sin+plu.ind+def.gen': 'AP000G0S', - 'pc.prs.utr+neu.sin+plu.ind+def.nom': 'AP000N0S', - 'jj.an': 'AQ00000A', - 'jj.kom.utr+neu.sin+plu.ind+def.sms': 'AQC0000C', - 'jj.kom.utr+neu.sin+plu.ind+def.gen': 'AQC00G0S', - 'jj.kom.utr+neu.sin+plu.ind+def.nom': 'AQC00N0S', - 'jj.pos.utr+neu.-.-.sms': 'AQP0000C', - 'jj.pos.utr+neu.sin+plu.ind+def.nom': 'AQP00N0S', - 'jj.pos.utr+neu.sin+plu.ind.nom': 'AQP00NIS', - 'jj.pos.utr+neu.plu.ind+def.gen': 'AQP0PG0S', - 'jj.pos.utr+neu.plu.ind+def.nom': 'AQP0PN0S', - 'jj.pos.utr+neu.plu.ind.nom': 'AQP0PNIS', - 'jj.pos.utr+neu.sin.def.gen': 'AQP0SGDS', - 'jj.pos.utr+neu.sin.def.nom': 'AQP0SNDS', - 'jj.pos.mas.sin.def.gen': 'AQPMSGDS', - 'jj.pos.mas.sin.def.nom': 'AQPMSNDS', - 'jj.pos.neu.sin.ind.gen': 'AQPNSGIS', - 'jj.pos.neu.sin.ind+def.nom': 'AQPNSN0S', - 'jj.pos.neu.sin.ind.nom': 'AQPNSNIS', - 'jj.pos.utr.-.-.sms': 'AQPU000C', - 'jj.pos.utr.sin.ind.gen': 'AQPUSGIS', - 'jj.pos.utr.sin.ind+def.nom': 'AQPUSN0S', - 'jj.pos.utr.sin.ind.nom': 'AQPUSNIS', - 'jj.suv.utr+neu.sin+plu.def.nom': 'AQS00NDS', - 'jj.suv.utr+neu.sin+plu.ind.nom': 'AQS00NIS', - 'jj.suv.utr+neu.plu.def.nom': 'AQS0PNDS', - 'jj.suv.utr+neu.plu.ind.nom': 'AQS0PNIS', - 'jj.suv.mas.sin.def.gen': 'AQSMSGDS', - 'jj.suv.mas.sin.def.nom': 'AQSMSNDS', - 'kn.an': 'CCA', - 'kn': 'CCS', - 'ie': 'CIS', - 'sn': 'CSS', - 'dt.an': 'D0@00@A', - 'dt.utr+neu.plu.ind+def': 'D0@0P@S', - 'dt.neu.sin.ind+def': 'D0@NS@S', - 'dt.utr.sin.ind+def': 'D0@US@S', - 'dt.utr+neu.plu.def': 'DF@0P@S', - 'dt.utr+neu.sin.def': 'DF@0S@S', - 'dt.mas.sin.def': 'DF@MS@S', - 'dt.neu.sin.def': 'DF@NS@S', - 'dt.utr.sin.def': 'DF@US@S', - 'hd.utr+neu.plu.ind': 'DH@0P@S', - 'hd.neu.sin.ind': 'DH@NS@S', - 'hd.utr.sin.ind': 'DH@US@S', - 'dt.utr+neu.sin+plu.ind': 'DI@00@S', - 'dt.utr+neu.plu.ind': 'DI@0P@S', - 'dt.utr+neu.sin.ind': 'DI@0S@S', - 'dt.mas.sin.ind': 'DI@MS@S', - 'dt.neu.sin.ind': 'DI@NS@S', - 'dt.utr.sin.ind': 'DI@US@S', - 'mad': 'FE', - 'mid': 'FI', - 'pad': 'FP', - 'in': 'I', - 'rg.sms': 'MC0000C', - 'rg.gen': 'MC00G0S', - 'rg.nom': 'MC00N0S', - 'rg.sin.nom': 'MC00N0S', - 'rg.neu.sin.ind.nom': 'MCNSNIS', - 'rg.utr.sin.ind.nom': 'MCUSNIS', - 'rg.mas.sin.def.nom': 'MCMSNDS', - 'rg utr.neu.sin.def.nom': 'MC0SNDS', - 'ro.sms': 'MO0000C', - 'ro.gen': 'MO00G0S', - 'ro.nom': 'MO00N0S', - 'ro.sin.nom': 'MO00N0S', - 'ro.mas.sin.ind+def.gen': 'MOMSG0S', - 'ro.mas.sin.ind+def.nom': 'MOMSN0S', - 'nn.an': 'NC000@0A', - 'nn.-.-.-.sms': 'NC000@0C', - 'nn.-.-.-.-': 'NC000@0S', - 'nn.neu.-.-.sms': 'NCN00@0C', - 'nn.neu.-.-.-': 'NCN00@0S', - 'nn.neu.plu.def.gen': 'NCNPG@DS', - 'nn.neu.plu.ind.gen': 'NCNPG@IS', - 'nn.neu.plu.def.nom': 'NCNPN@DS', - 'nn.neu.plu.ind.nom': 'NCNPN@IS', - 'nn.neu.sin.def.gen': 'NCNSG@DS', - 'nn.neu.sin.ind.gen': 'NCNSG@IS', - 'nn.neu.sin.def.nom': 'NCNSN@DS', - 'nn.neu.sin.ind.nom': 'NCNSN@IS', - 'nn.utr.-.-.sms': 'NCU00@0C', - 'nn.utr.-.-.-': 'NCU00@0S', - 'nn.utr.plu.def.gen': 'NCUPG@DS', - 'nn.utr.plu.ind.gen': 'NCUPG@IS', - 'nn.utr.plu.def.nom': 'NCUPN@DS', - 'nn.utr.plu.ind.nom': 'NCUPN@IS', - 'nn.utr.sin.def.gen': 'NCUSG@DS', - 'nn.utr.sin.ind.gen': 'NCUSG@IS', - 'nn.utr.sin.def.nom': 'NCUSN@DS', - 'nn.utr.sin.def.nom.dat': 'NCUSN@DS', - 'nn.utr.sin.ind.nom': 'NCUSN@IS', - 'nn.utr.sin.ind.nom.dat': 'NCUSN@IS', - 'pm.sms': 'NP000@0C', - 'pm.gen': 'NP00G@0S', - 'pm.nom': 'NP00N@0S', - 'pn.utr+neu.sin+plu.def.obj': 'PF@00O@S', - 'pn.utr+neu.plu.def.sub+obj': 'PF@0P0@S', - 
'pn.utr+neu.plu.def.obj': 'PF@0PO@S', - 'pn.utr+neu.plu.def.sub': 'PF@0PS@S', - 'pn.mas.sin.def.sub+obj': 'PF@MS0@S', - 'pn.neu.sin.def.sub+obj': 'PF@NS0@S', - 'pn.utr.plu.def.obj': 'PF@UPO@S', - 'pn.utr.plu.def.sub': 'PF@UPS@S', - 'pn.utr.sin.def.sub+obj': 'PF@US0@S', - 'pn.utr.sin.def.obj': 'PF@USO@S', - 'pn.utr.sin.def.sub': 'PF@USS@S', - 'hs.def': 'PE@000@S', - 'hp.-.-.-': 'PH@000@S', - 'hp.utr+neu.plu.ind': 'PH@0P0@S', - 'hp.neu.sin.ind.sms': 'PH@NS0@C', - 'hp.neu.sin.ind': 'PH@NS0@S', - 'hp.utr.sin.ind': 'PH@US0@S', - 'pn.utr+neu.plu.ind.sub+obj': 'PI@0P0@S', - 'pn.neu.sin.ind.sub+obj': 'PI@NS0@S', - 'pn.utr.sin.ind.sub+obj': 'PI@US0@S', - 'pn.utr.sin.ind.sub': 'PI@USS@S', - 'ps.an': 'PS@000@A', - 'ps.utr+neu.sin+plu.def': 'PS@000@S', - 'ps.utr+neu.plu.def': 'PS@0P0@S', - 'ps.neu.sin.def': 'PS@NS0@S', - 'ps.utr.sin.def': 'PS@US0@S', - 'pl': 'QS', - 'pl.sms': 'QC', - 'ab.an': 'RG0A', - 'ab.sms': 'RG0C', - 'ab': 'RG0S', - 'ab.kom': 'RGCS', - 'ab.pos': 'RGPS', - 'ab.suv': 'RGSS', - 'ha': 'RH0S', - 'pp.an': 'SPA', - 'pp.sms': 'SPC', - 'pp': 'SPS', - 'vb.an': 'V@000A', - 'vb.sms': 'V@000C', - 'vb.prt.akt': 'V@IIAS', - 'vb.prt.akt.aux': 'V@IIAS', - 'vb.prt.akt.kop': 'V@IIAS', - 'vb.prt.sfo': 'V@IISS', - 'vb.prt.sfo.kop': 'V@IISS', - 'vb.prs.akt': 'V@IPAS', - 'vb.prs.akt.aux': 'V@IPAS', - 'vb.prs.akt.kop': 'V@IPAS', - 'vb.prs.sfo': 'V@IPSS', - 'vb.prs.sfo.kop': 'V@IPSS', - 'vb.sup.akt': 'V@IUAS', - 'vb.sup.akt.kop': 'V@IUAS', - 'vb.sup.sfo': 'V@IUSS', - 'vb.imp.akt': 'V@M0AS', - 'vb.imp.akt.aux': 'V@M0AS', - 'vb.imp.akt.kop': 'V@M0AS', - 'vb.imp.sfo': 'V@M0SS', - 'vb.inf.akt': 'V@N0AS', - 'vb.inf.akt.aux': 'V@N0AS', - 'vb.inf.akt.kop': 'V@N0AS', - 'vb.inf.sfo': 'V@N0SS', - 'vb.kon.prt.akt': 'V@SIAS', - 'vb.kon.prt.sfo': 'V@SISS', - 'vb.kon.prs.akt': 'V@SPAS', - 'uo': 'XF', -} - -parole_to_suc = dict((parole, suc) for (suc, parole) in list(suc_to_parole.items())) - -granska_to_suc = dict((granska, parole_to_suc[parole]) for (granska, parole) in list(granska_to_parole.items())) - -parole_to_granska = {} -for granska, parole in list(granska_to_parole.items()): - parole_to_granska.setdefault(parole, set()).add(granska) - -suc_to_granska = dict((suc, parole_to_granska[parole]) for (suc, parole) in list(suc_to_parole.items())) - -suc_tags = set(suc_descriptions) - -suc_to_simple = dict((suc, split_tag(suc)[0]) for suc in suc_tags) - -simple_tags = set(suc_to_simple.values()) - -granska_tags = set(granska_to_parole) - -parole_tags = set(parole_to_suc) - - -assert suc_tags == set(suc_to_parole.keys()) -assert suc_tags == set(suc_to_granska.keys()) -assert suc_tags == set(parole_to_suc.values()) -assert suc_tags == set(granska_to_suc.values()) - -assert granska_tags == set(granska_to_parole.keys()) -assert granska_tags == set(granska_to_suc.keys()) -assert granska_tags == set().union(*list(parole_to_granska.values())) -assert granska_tags == set().union(*list(suc_to_granska.values())) - -assert parole_tags == set(parole_to_suc.keys()) -assert parole_tags == set(parole_to_granska.keys()) -assert parole_tags == set(suc_to_parole.values()) -assert parole_tags == set(granska_to_parole.values()) - - -###################################################################### -# Here we automatically create the 1-many dictionaries -# saldo_to_suc and saldo_to_parole - -saldo_params_to_suc = { - 'u': 'UTR', - 'n': 'NEU', - 'masc': 'MAS', - 'no_masc': 'UTR+NEU', - 'komp': 'KOM', - 'super': 'SUV', - 'pl': 'PLU', - 'sg': 'SIN', - 'indef': 'IND', - 'pres_part': 'PCPRS', - 'pret_part': 'PCPRT', - 'imper': 'IMP', - 
'aktiv': 'AKT', - 's-form': 'SFO', - 'ind': 'INDIKATIV', - 'konj': 'KON', - 'pres': 'PRS', - 'pret': 'PRT', -} - -# SALDO to SUC mapping -_suc_tag_replacements = [ - (r"(IN|KN|PP)", r"\1"), - (r"SN", r"(SN|IE)"), # ie doesn't exist in SALDO anymore - (r"(AB|KN|PP|VB)A", r"\1 AN"), - (r"[MS]XC", r"(NN|JJ|AB) .* SMS"), - - (r"ABH? INVAR", r"(AB|PL|HA)"), - (r"ABH? (KOM|POS|SMS|SUV)", r"AB \1"), - - (r"AL PLU (DEF|IND)", r"DT UTR+NEU PLU \1"), - (r"AL SIN (UTR|NEU) (DEF|IND)", r"DT \1 SIN \2"), - - (r"AV INVAR", r"(JJ POS|PC PRS) .* NOM"), - (r"AVH? POS IND SIN NEU NOM", r"(AB|AB POS|(JJ POS|PC PRF) NEU SIN IND NOM)"), # snabbt - (r"AVH? POS (DEF|IND) (SIN|PLU) (MAS|NEU|UTR|UTR\+NEU) (NOM|GEN)", r"(JJ POS|PC PRF) \3 \2 (\1|IND+DEF) \4"), # ind/def doesn't exist in SALDO - (r"AVH? POS (DEF|IND) PLU (NOM|GEN)", r"(JJ POS|PC PRF) UTR+NEU PLU (\1|IND+DEF) \2"), # ind/def doesn't exist in SALDO - # (r"AV POS .* (SIN|PLU) .*(NOM|GEN)", r"(JJ POS|PC PRF) .* \1 .* \2"), - (r"AVH? KOM NOM", r"(JJ KOM .* NOM|AB KOM)"), - (r"AVH? SUV IND NOM", r"(JJ SUV .* NOM|AB SUV)"), - (r"AVH? (KOM|SUV) .*(NOM|GEN)", r"JJ \1 .* \2"), - (r"AVH? SMS", r"JJ .* SMS"), - (r"AVA", r"AB AN"), - - (r"NL (NOM|GEN)", r"(RG|RO) .*\1"), - - (r"NN (V|P) (SIN|PLU) (IND|DEF) (NOM|GEN)", r"NN (UTR|NEU|-) (\2|-) (\3|-) (\4|-)"), - (r"NNH? (UTR|NEU) (SIN|PLU) (IND|DEF) (NOM|GEN)", r"NN (\1|-) (\2|-) (\3|-) (\4|-)"), - (r"NNH? .* SMS", r"NN .* SMS"), - (r"NNA .* SMS", r"(NN|PM) .* SMS"), - (r"NNA .* (SIN|PLU) (IND|DEF) (NOM|GEN)", r"NN (AN|.* \1 \2 \3)"), - - (r"PMA .* (NOM|GEN)", r"PM \1"), - (r"PM .* (NOM|GEN)", r"PM \1"), - (r"PM .* SMS", r"PM .* SMS"), - - (r"PN .*POSS", r"(PS|HS)"), - (r"PN KOM GEN", r"PS"), - (r"PN SUV (IND|DEF)", r"JJ SUV .* \1"), - (r"PN (P1|P2|P3) (SIN|PLU)", r"PN .* \2 DEF"), - (r"PN POS .*(SIN|PLU)", r"PN .* \1"), - (r"PN PLU NOM", r"(PN .* PLU|DT UTR+NEU PLU .*|JJ POS UTR+NEU PLU .* NOM)"), - (r"PN PLU GEN", r"(PN .* PLU|DT UTR+NEU PLU .*|PS UTR+NEU SIN+PLU DEF)"), - (r"PN SIN UTR NOM", r"(PN (UTR|MAS) SIN|DT UTR SIN .*|JJ POS UTR SIN IND NOM)"), - (r"PN SIN UTR GEN", r"(PN (UTR|MAS) SIN|DT UTR SIN .*|PS UTR+NEU SIN+PLU DEF)"), - (r"PN SIN NEU NOM", r"(PN NEU SIN|DT NEU SIN .*|JJ POS NEU SIN IND NOM)"), - (r"PN SIN NEU GEN", r"(PN NEU SIN|DT NEU SIN .*|PS UTR+NEU SIN+PLU DEF)"), - (r"PN (ACK|NOM|INVAR|KOM|SMS)", r"(PN|HP|HS)"), - - (r"VB (INF|SUP) (AKT|SFO)", r"VB \1 \2"), - (r"VB (PRS|PRT) .* (AKT|SFO)", r"VB .*\1 \2"), - (r"VB PCPRS (NOM|GEN)", r"PC PRS .* \1"), - (r"VB PCPRT .* (PLU|SIN) .*(NOM|GEN)", r"PC PRF .* \1 .* \2"), - (r"VB (IMP|SMS)", r"VB \1"), - - # Compounds - (r"ABH? C", r"AB"), - (r"AVH? C", r"JJ"), - (r"VB C", r"VB"), - (r"NNA? (UTR|NEU) (CI|CM)", r"NN (\1|-) - - -"), - (r"NNA? (V|P) (CI|CM)", r"NN (UTR|NEU|-) - - -"), - (r"NNH? 
(UTR|NEU) (CI|CM)", r"NN (\1|-) - - -"), - - (r"PM .* (CI|CM)", r"PM"), - (r"PN C", r"PN"), - (r"NL C", r"(RG|RO)"), -] - - -def _make_saldo_to_suc(compound=False): - import re - tagmap = {} - for saldotag in saldo_tags: - params = saldotag.split() - if not compound: - if saldotag.endswith((' c', ' ci', ' cm')) or not params or (len(params[0]) == 3 and params[0].endswith(('m', 'h'))): - # Skip multiword units and compound/end syllables - continue - else: - if not params or (len(params[0]) == 3 and params[0].endswith('m')): - # Skip multiword units - continue - paramstr = " ".join(saldo_params_to_suc.get(prm, prm.upper()) for prm in params) - for (pre, post) in _suc_tag_replacements: - m = re.match(pre, paramstr) - if m: - break - if m is None: - print(paramstr) - print() - sucfilter = m.expand(post).replace(" ", r"\.").replace("+", r"\+") - tagmap[saldotag] = set(suctag for suctag in suc_tags - if re.match(sucfilter, suctag)) - return tagmap - - -saldo_to_suc = _make_saldo_to_suc() -saldo_to_suc_compound = _make_saldo_to_suc(compound=True) # For use with the compound module - -saldo_to_parole = dict((saldotag, set(suc_to_parole[suctag] for suctag in suctags)) - for saldotag, suctags in list(saldo_to_suc.items())) - -saldo_to_granska = dict((saldotag, set().union(*(suc_to_granska[suctag] for suctag in suctags))) - for saldotag, suctags in list(saldo_to_suc.items())) - -saldo_to_saldo = dict((saldotag, {saldotag}) for saldotag in saldo_tags) - - -mappings = { - "granska_to_parole": granska_to_parole, - "granska_to_suc": granska_to_suc, - "parole_to_granska": parole_to_granska, - "parole_to_suc": parole_to_suc, - "saldo_to_granska": saldo_to_granska, - "saldo_to_parole": saldo_to_parole, - "saldo_to_saldo": saldo_to_saldo, - "saldo_to_suc_compound": saldo_to_suc_compound, - "saldo_to_suc": saldo_to_suc, - "suc_descriptions": suc_descriptions, - "suc_to_granska": suc_to_granska, - "suc_to_parole": suc_to_parole, - "suc_to_simple": suc_to_simple, - "saldo_params_to_suc": saldo_params_to_suc, -} - -tags = { - "granska_tags": granska_tags, - "parole_tags": parole_tags, - "saldo_tags": saldo_tags, - "simple_tags": simple_tags, - "suc_tags": suc_tags, -} diff --git a/tests/test_annotations.py b/tests/test_annotations.py index 059f19a5..255b03c7 100644 --- a/tests/test_annotations.py +++ b/tests/test_annotations.py @@ -4,7 +4,7 @@ import pytest -from sparv.util.system import find_binary +from sparv.api.util.system import find_binary from . 
import utils @@ -14,7 +14,7 @@ def test_mini_swe(tmp_path): gold_corpus_dir = pathlib.Path("tests/test_corpora/mini-swe") test_corpus_dir = utils.run_sparv(gold_corpus_dir, tmp_path) utils.cmp_workdir(gold_corpus_dir, test_corpus_dir) - utils.cmp_export(gold_corpus_dir, test_corpus_dir) + utils.cmp_export(gold_corpus_dir, test_corpus_dir, ignore=["version_info"]) @pytest.mark.swe @@ -45,6 +45,27 @@ def test_standard_swe(tmp_path): utils.cmp_export(gold_corpus_dir, test_corpus_dir) +@pytest.mark.swe +@pytest.mark.swehist +@pytest.mark.slow +def test_swe_1800(tmp_path): + """Run corpus swe-1800 and compare the annotations and exports to gold standard.""" + gold_corpus_dir = pathlib.Path("tests/test_corpora/swe-1800") + test_corpus_dir = utils.run_sparv(gold_corpus_dir, tmp_path) + utils.cmp_workdir(gold_corpus_dir, test_corpus_dir) + utils.cmp_export(gold_corpus_dir, test_corpus_dir) + + +@pytest.mark.swe +@pytest.mark.swehist +def test_swe_fsv(tmp_path): + """Run corpus swe-fsv and compare the annotations and exports to gold standard.""" + gold_corpus_dir = pathlib.Path("tests/test_corpora/swe-fsv") + test_corpus_dir = utils.run_sparv(gold_corpus_dir, tmp_path) + utils.cmp_workdir(gold_corpus_dir, test_corpus_dir) + utils.cmp_export(gold_corpus_dir, test_corpus_dir) + + @pytest.mark.freeling @pytest.mark.skipif(not find_binary("analyze"), reason="FreeLing is not installed") def test_freeling_eng_slevel(tmp_path): diff --git a/tests/test_corpora/freeling-eng-slevel/config.yaml b/tests/test_corpora/freeling-eng-slevel/config.yaml index aa9f2e5d..c02a7dbb 100644 --- a/tests/test_corpora/freeling-eng-slevel/config.yaml +++ b/tests/test_corpora/freeling-eng-slevel/config.yaml @@ -8,13 +8,13 @@ metadata: # Corpus name (human readable) name: eng: FreeLing test corpus - # Language of the input documents, specified as ISO 639-3 code + # Language of the source files, specified as ISO 639-3 code language: eng description: eng: | This test corpus includes: - - the FreeLing_FULL annotations from the FREELING preset + - the SBX_FreeLing_FULL annotations from the SBX_FREELING preset - annotations with slevel (meaning that FreeLing won't do sentence segmentation) - some exports @@ -23,8 +23,8 @@ metadata: #=============================================================================== import: - # The annotation representing one text document. Any text-level annotations will be attached to this annotation. - document_annotation: text + # The annotation representing one text. Any text-level annotations will be attached to this annotation. 
+ text_annotation: text xml_import: elements: @@ -42,7 +42,7 @@ classes: # Module Settings #=============================================================================== -freeling: +sbx_freeling: sentence_annotation: s # Description of the date/time input format @@ -59,16 +59,17 @@ export: # Exports to create by default when running 'sparv run' default: - csv_export:csv - - cwb:info - cwb:vrt - cwb:vrt_scrambled - korp:timespan_sql - - stats_export:freq_list_simple + - stats_export:sbx_freq_list_simple - xml_export:pretty - xml_export:preserved_format # Automatic annotations to be included in the export annotations: - :readability.lix - DATETIME.all - - s:misc.id - - FREELING_FULL.token + - SBX_FREELING_FULL.all + +sparv: + compression: none diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/csv/document1.csv b/tests/test_corpora/freeling-eng-slevel/gold_export/csv/document1.csv deleted file mode 100644 index 12f452a1..00000000 --- a/tests/test_corpora/freeling-eng-slevel/gold_export/csv/document1.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8386861e73a45369d17b1fc2f3eaf8c3bb2822af2b59f485f7c7bf95bab3764f -size 2454 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/csv_export/document1.csv b/tests/test_corpora/freeling-eng-slevel/gold_export/csv_export/document1.csv new file mode 100644 index 00000000..c1a7e60a --- /dev/null +++ b/tests/test_corpora/freeling-eng-slevel/gold_export/csv_export/document1.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cc4f44bc754e6445bcf9ac40fa1ddd3fe2420311b6c1e5b626719c653432a27 +size 2382 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/cwb.vrt/document1.vrt b/tests/test_corpora/freeling-eng-slevel/gold_export/cwb.vrt/document1.vrt new file mode 100644 index 00000000..8d672e76 --- /dev/null +++ b/tests/test_corpora/freeling-eng-slevel/gold_export/cwb.vrt/document1.vrt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a44d4832d7d90be76169806f746ad3d09c1e9804b451eaf16949fa5b22b56c99 +size 2311 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/cwb.vrt_scrambled/document1.vrt b/tests/test_corpora/freeling-eng-slevel/gold_export/cwb.vrt_scrambled/document1.vrt new file mode 100644 index 00000000..444b7e54 --- /dev/null +++ b/tests/test_corpora/freeling-eng-slevel/gold_export/cwb.vrt_scrambled/document1.vrt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68765d77780206c7ff4d2e616b10ae63f2986a67df29cbcc997e8f36b9862821 +size 2559 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/frequency_list/stats_freeling-eng-slevel.csv b/tests/test_corpora/freeling-eng-slevel/gold_export/frequency_list/stats_freeling-eng-slevel.csv deleted file mode 100644 index cb56833c..00000000 --- a/tests/test_corpora/freeling-eng-slevel/gold_export/frequency_list/stats_freeling-eng-slevel.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7f154c814667d43d8de67f336a6a1c036fb5e245ab322d24581899b23c06cfba -size 1517 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/korp_timespan/timespan.sql b/tests/test_corpora/freeling-eng-slevel/gold_export/korp.timespan/timespan.sql similarity index 100% rename from tests/test_corpora/freeling-eng-slevel/gold_export/korp_timespan/timespan.sql rename to tests/test_corpora/freeling-eng-slevel/gold_export/korp.timespan/timespan.sql diff --git 
a/tests/test_corpora/freeling-eng-slevel/gold_export/stats_export.frequency_list_sbx/stats_freeling-eng-slevel.csv b/tests/test_corpora/freeling-eng-slevel/gold_export/stats_export.frequency_list_sbx/stats_freeling-eng-slevel.csv new file mode 100644 index 00000000..cf3c658c --- /dev/null +++ b/tests/test_corpora/freeling-eng-slevel/gold_export/stats_export.frequency_list_sbx/stats_freeling-eng-slevel.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a3e7f92b017e2274af2586b8aa25c86472704353758216abc3d057369625b3c +size 1290 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/vrt/document1.vrt b/tests/test_corpora/freeling-eng-slevel/gold_export/vrt/document1.vrt deleted file mode 100644 index 90e67885..00000000 --- a/tests/test_corpora/freeling-eng-slevel/gold_export/vrt/document1.vrt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4922e63c5bb3fd0d2fe98f842c7196f3bc6b5d385b3eb25c0ee3a589fb436a1b -size 2383 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/vrt_scrambled/document1.vrt b/tests/test_corpora/freeling-eng-slevel/gold_export/vrt_scrambled/document1.vrt deleted file mode 100644 index 75ff2fa1..00000000 --- a/tests/test_corpora/freeling-eng-slevel/gold_export/vrt_scrambled/document1.vrt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3f33dfa372f88ccab159ca875411884e1476a67587e8d491f38b62289e3f1e9 -size 2631 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/xml_export.preserved_format/document1_export.xml b/tests/test_corpora/freeling-eng-slevel/gold_export/xml_export.preserved_format/document1_export.xml new file mode 100644 index 00000000..640ca083 --- /dev/null +++ b/tests/test_corpora/freeling-eng-slevel/gold_export/xml_export.preserved_format/document1_export.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b25af9dbab08bec180caf88c135a7949307098d05fce76a73c3435cebc4e9072 +size 5951 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/xml_export.pretty/document1_export.xml b/tests/test_corpora/freeling-eng-slevel/gold_export/xml_export.pretty/document1_export.xml new file mode 100644 index 00000000..98bad702 --- /dev/null +++ b/tests/test_corpora/freeling-eng-slevel/gold_export/xml_export.pretty/document1_export.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a71f1b15377c11ade27ae5a57c899f9ba75b02ddfff7e88586a0d26701e8c26 +size 6596 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/xml_preserved_format/document1_export.xml b/tests/test_corpora/freeling-eng-slevel/gold_export/xml_preserved_format/document1_export.xml deleted file mode 100644 index 809e957a..00000000 --- a/tests/test_corpora/freeling-eng-slevel/gold_export/xml_preserved_format/document1_export.xml +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:626aa1070efc6fbebdd701e41913a4229dd470744efa7435bd7ec208f9e9a802 -size 6023 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_export/xml_pretty/document1_export.xml b/tests/test_corpora/freeling-eng-slevel/gold_export/xml_pretty/document1_export.xml deleted file mode 100644 index 2ffb02da..00000000 --- a/tests/test_corpora/freeling-eng-slevel/gold_export/xml_pretty/document1_export.xml +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a684d26914280d9851c3aaaffcd6a47c7d6f985673c2bb883b0ffdd639cc22da -size 6668 diff --git 
a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/cwb.datefirst b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/cwb.datefirst deleted file mode 100644 index 14d8058c..00000000 --- a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/cwb.datefirst +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5722206db6822dd01d112d0fc33e67439e27f9338329d8fb2d70f0b13d3a2d18 -size 19 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/cwb.datelast b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/cwb.datelast deleted file mode 100644 index bf647868..00000000 --- a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/cwb.datelast +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5429e82a539bd860189ab3ccf5c832125ba999fd32bf0143910975ce44e62c65 -size 19 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/dateformat.resolution b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/dateformat.resolution deleted file mode 100644 index a4c1de80..00000000 --- a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/dateformat.resolution +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:545090164d18915eaef2b742d3725095c612850780ceb07f249b91a168f1c9ca -size 3 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/misc.docid b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/misc.fileid similarity index 100% rename from tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/misc.docid rename to tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/misc.fileid diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/s/misc.id b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/s/misc.id deleted file mode 100644 index 1bee7ac4..00000000 --- a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/s/misc.id +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f5b6e0c2d9e72052a8b0fea876ead74228bd1af179635e32169a0a8fc1fd3d54 -size 32 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.sentence/@span b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.sentence/@span similarity index 100% rename from tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.sentence/@span rename to tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.sentence/@span diff --git a/docs/docsify/.nojekyll b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.sentence/misc.id similarity index 100% rename from docs/docsify/.nojekyll rename to tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.sentence/misc.id diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/@span b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/@span similarity index 100% rename from tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/@span rename to tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/@span diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/freeling.word 
b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/misc.word similarity index 100% rename from tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/freeling.word rename to tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/misc.word diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/freeling.baseform b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/sbx_freeling.baseform similarity index 100% rename from tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/freeling.baseform rename to tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/sbx_freeling.baseform diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/freeling.ne_type b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/sbx_freeling.ne_type similarity index 100% rename from tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/freeling.ne_type rename to tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/sbx_freeling.ne_type diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/freeling.pos b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/sbx_freeling.pos similarity index 100% rename from tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/freeling.pos rename to tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/sbx_freeling.pos diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/freeling.upos b/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/sbx_freeling.upos similarity index 100% rename from tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/freeling.token/freeling.upos rename to tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/document1/sbx_freeling.token/sbx_freeling.upos diff --git a/tests/test_corpora/freeling-fra-txt/config.yaml b/tests/test_corpora/freeling-fra-txt/config.yaml index 4b1b6872..60810367 100644 --- a/tests/test_corpora/freeling-fra-txt/config.yaml +++ b/tests/test_corpora/freeling-fra-txt/config.yaml @@ -8,14 +8,14 @@ metadata: # Corpus name (human readable) name: eng: French FreeLing test corpus - # Language of the input documents, specified as ISO 639-3 code + # Language of the source files, specified as ISO 639-3 code language: fra description: eng: | This test corpus includes: - standard Sparv paragraph segmentation - - the FreeLing annotations from the FREELING preset + - the FreeLing annotations from the SBX_FREELING preset - input files in txt format - some exports @@ -31,7 +31,7 @@ import: # Module Settings #=============================================================================== -freeling: +sbx_freeling: sentence_chunk: #=============================================================================== @@ -42,15 +42,17 @@ export: # Exports to create by default when running 'sparv run' default: - korp:timespan_sql - - cwb:info - csv_export:csv - cwb:vrt - cwb:vrt_scrambled - - stats_export:freq_list_simple + - stats_export:sbx_freq_list_simple - xml_export:pretty - xml_export:preserved_format # Automatic annotations to be included in the export annotations: 
- PARAGRAPH.all - :misc.id - - FREELING.all + - SBX_FREELING.all + +sparv: + compression: none diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/csv/wikipedia.csv b/tests/test_corpora/freeling-fra-txt/gold_export/csv/wikipedia.csv deleted file mode 100644 index dac65b29..00000000 --- a/tests/test_corpora/freeling-fra-txt/gold_export/csv/wikipedia.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1f0d145bdb98181d5030c00450c8165cef7822d25c543fca401b11aeffc09ee5 -size 9868 diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/csv_export/wikipedia.csv b/tests/test_corpora/freeling-fra-txt/gold_export/csv_export/wikipedia.csv new file mode 100644 index 00000000..f091eae0 --- /dev/null +++ b/tests/test_corpora/freeling-fra-txt/gold_export/csv_export/wikipedia.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e60530b2ce5ff621965cc9f6987777b8c4a788cb26228de358e8d72d121cbfd +size 9868 diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/cwb.vrt/wikipedia.vrt b/tests/test_corpora/freeling-fra-txt/gold_export/cwb.vrt/wikipedia.vrt new file mode 100644 index 00000000..ea54fe27 --- /dev/null +++ b/tests/test_corpora/freeling-fra-txt/gold_export/cwb.vrt/wikipedia.vrt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:074609bfef8a4936cf5a7ed3500641dbee904db450b253d567d9cd45b9429a3b +size 10178 diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/cwb.vrt_scrambled/wikipedia.vrt b/tests/test_corpora/freeling-fra-txt/gold_export/cwb.vrt_scrambled/wikipedia.vrt new file mode 100644 index 00000000..019db489 --- /dev/null +++ b/tests/test_corpora/freeling-fra-txt/gold_export/cwb.vrt_scrambled/wikipedia.vrt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0b2db79331117fd9442e9902c25a7cd07c237b1ca4540174e4e83c2ee511793 +size 10484 diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/frequency_list/stats_freeling-fra-txt.csv b/tests/test_corpora/freeling-fra-txt/gold_export/frequency_list/stats_freeling-fra-txt.csv deleted file mode 100644 index 345b3ed0..00000000 --- a/tests/test_corpora/freeling-fra-txt/gold_export/frequency_list/stats_freeling-fra-txt.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce5e8c83db4b11aeb4c3381b13e39953a4e112f4392d69fcd283a68ea674a109 -size 5980 diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/korp_timespan/timespan.sql b/tests/test_corpora/freeling-fra-txt/gold_export/korp.timespan/timespan.sql similarity index 100% rename from tests/test_corpora/freeling-fra-txt/gold_export/korp_timespan/timespan.sql rename to tests/test_corpora/freeling-fra-txt/gold_export/korp.timespan/timespan.sql diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/stats_export.frequency_list_sbx/stats_freeling-fra-txt.csv b/tests/test_corpora/freeling-fra-txt/gold_export/stats_export.frequency_list_sbx/stats_freeling-fra-txt.csv new file mode 100644 index 00000000..bd137c3f --- /dev/null +++ b/tests/test_corpora/freeling-fra-txt/gold_export/stats_export.frequency_list_sbx/stats_freeling-fra-txt.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a25bb68b3ef3ca9831611b1d03cf05044720bcc51d669c6fe5fd5d17c4e01e3 +size 5312 diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/vrt/wikipedia.vrt b/tests/test_corpora/freeling-fra-txt/gold_export/vrt/wikipedia.vrt deleted file mode 100644 index ba2f3241..00000000 --- 
a/tests/test_corpora/freeling-fra-txt/gold_export/vrt/wikipedia.vrt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:62d838cfb01ee78f6b535f07aa547373a934f889af15eebb4797f4b225c7805b -size 10178 diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/vrt_scrambled/wikipedia.vrt b/tests/test_corpora/freeling-fra-txt/gold_export/vrt_scrambled/wikipedia.vrt deleted file mode 100644 index b2452127..00000000 --- a/tests/test_corpora/freeling-fra-txt/gold_export/vrt_scrambled/wikipedia.vrt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:879b8f8fa2bf99d181129d5204d10214247cca0c2a267dd7bf4b8183da193bf8 -size 10484 diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/xml_export.preserved_format/wikipedia_export.xml b/tests/test_corpora/freeling-fra-txt/gold_export/xml_export.preserved_format/wikipedia_export.xml new file mode 100644 index 00000000..685c5b25 --- /dev/null +++ b/tests/test_corpora/freeling-fra-txt/gold_export/xml_export.preserved_format/wikipedia_export.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e15a7356b50d4c32f03e388f50fff6dbcf3f06aff44a38a37b939a6490ed39e1 +size 25801 diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/xml_export.pretty/wikipedia_export.xml b/tests/test_corpora/freeling-fra-txt/gold_export/xml_export.pretty/wikipedia_export.xml new file mode 100644 index 00000000..f092da2e --- /dev/null +++ b/tests/test_corpora/freeling-fra-txt/gold_export/xml_export.pretty/wikipedia_export.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f181c856cf9826abb7d84e09d44a9ff5956f66dc8aacc82c301c9f78d031f283 +size 28516 diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/xml_preserved_format/wikipedia_export.xml b/tests/test_corpora/freeling-fra-txt/gold_export/xml_preserved_format/wikipedia_export.xml deleted file mode 100644 index 30cfcced..00000000 --- a/tests/test_corpora/freeling-fra-txt/gold_export/xml_preserved_format/wikipedia_export.xml +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:375e5e15de51398bfd67a6f2e15197aefc7973d1c036376fd3e62f84bd4eb12b -size 25801 diff --git a/tests/test_corpora/freeling-fra-txt/gold_export/xml_pretty/wikipedia_export.xml b/tests/test_corpora/freeling-fra-txt/gold_export/xml_pretty/wikipedia_export.xml deleted file mode 100644 index 54120903..00000000 --- a/tests/test_corpora/freeling-fra-txt/gold_export/xml_pretty/wikipedia_export.xml +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ded9d342481d91ba6af6bedc1c82f7514dae7e082b79a9aee3dc590b38798ac -size 28516 diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/cwb.datefirst b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/cwb.datefirst deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/cwb.datelast b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/cwb.datelast deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/dateformat.resolution b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/dateformat.resolution deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/misc.freeling.sentence_count b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/misc.freeling.sentence_count deleted file mode 100644 index 0a476c24..00000000 --- 
a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/misc.freeling.sentence_count +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9400f1b21cb527d7fa3d3eabba93557a18ebe7a2ca4e471cfe5e4c5b4ca7f767 -size 2 diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.sentence/misc.id b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.sentence/misc.id deleted file mode 100644 index 65ac9a73..00000000 --- a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.sentence/misc.id +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4c79dfb2cc1ef7736e32258cf2b2f329bbab5e4db8319bb948e4cd63ee909b18 -size 76 diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/misc.docid b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/misc.fileid similarity index 100% rename from tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/misc.docid rename to tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/misc.fileid diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.sentence/@span b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.sentence/@span similarity index 100% rename from tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.sentence/@span rename to tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.sentence/@span diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.sentence/misc.id b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.sentence/misc.id new file mode 100644 index 00000000..164bbb37 --- /dev/null +++ b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.sentence/misc.id @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9780a79a056f409c2402f5b5d550504e6136e7be763f86c3d583584d2f7f38d6 +size 76 diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.sentence/misc.number_random b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.sentence/misc.number_random similarity index 100% rename from tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.sentence/misc.number_random rename to tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.sentence/misc.number_random diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.token/@span b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.token/@span similarity index 100% rename from tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.token/@span rename to tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.token/@span diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.token/freeling.word b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.token/misc.word similarity index 100% rename from tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.token/freeling.word rename to tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.token/misc.word diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.token/freeling.baseform 
b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.token/sbx_freeling.baseform similarity index 100% rename from tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.token/freeling.baseform rename to tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.token/sbx_freeling.baseform diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.token/freeling.pos b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.token/sbx_freeling.pos similarity index 100% rename from tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.token/freeling.pos rename to tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.token/sbx_freeling.pos diff --git a/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.token/freeling.upos b/tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.token/sbx_freeling.upos similarity index 100% rename from tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/freeling.token/freeling.upos rename to tests/test_corpora/freeling-fra-txt/gold_sparv-workdir/wikipedia/sbx_freeling.token/sbx_freeling.upos diff --git a/tests/test_corpora/mini-swe/config.yaml b/tests/test_corpora/mini-swe/config.yaml index 7991fe63..8bb25de8 100644 --- a/tests/test_corpora/mini-swe/config.yaml +++ b/tests/test_corpora/mini-swe/config.yaml @@ -21,8 +21,8 @@ metadata: #=============================================================================== import: - # The annotation representing one text document. Any text-level annotations will be attached to this annotation. - document_annotation: text + # The annotation representing one text. Any text-level annotations will be attached to this annotation. + text_annotation: text #=============================================================================== # Annotation Class Settings @@ -32,6 +32,7 @@ classes: "token:pos": :stanza.pos "token:msd": :stanza.msd "token:baseform": :saldo.baseform + "token:ref": :stanza.ref #=============================================================================== # Export Settings @@ -41,11 +42,10 @@ export: # Exports to create by default when running 'sparv run' default: - csv_export:csv - - cwb:info - cwb:vrt - cwb:vrt_scrambled - korp:timespan_sql - - stats_export:freq_list_simple + - stats_export:sbx_freq_list_simple_swe - xml_export:pretty - xml_export:preserved_format - xml_export:scrambled @@ -56,15 +56,21 @@ export: - - - :sensaldo.sentiment_label - # Annotations from original documents to include in the output. If nothing is specified, everything is kept. + # Annotations from source files to include in the output. If nothing is specified, everything is kept. source_annotations: - dokument as document - text:forfattare as author - ... 
+xml_export: + include_version_info: false + csv_export: # Overriding export.source_annotations source_annotations: - text:id - text:date - text:forfattare as author + +sparv: + compression: none diff --git a/tests/test_corpora/mini-swe/gold_export/csv/dokument1.csv b/tests/test_corpora/mini-swe/gold_export/csv/dokument1.csv deleted file mode 100644 index c4367732..00000000 --- a/tests/test_corpora/mini-swe/gold_export/csv/dokument1.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:365861fe1a31a52fe2a3280b7bce6dfca9b3c75b3fda7706f0d2cab506536c6c -size 4343 diff --git a/tests/test_corpora/mini-swe/gold_export/csv_export/dokument1.csv b/tests/test_corpora/mini-swe/gold_export/csv_export/dokument1.csv new file mode 100644 index 00000000..7e86c4c6 --- /dev/null +++ b/tests/test_corpora/mini-swe/gold_export/csv_export/dokument1.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e8b9166381d6882e48f948ad695430ab744f8a3ab1cd9747f387d7e82b4986a +size 4311 diff --git a/tests/test_corpora/mini-swe/gold_export/csv/dokument2.csv b/tests/test_corpora/mini-swe/gold_export/csv_export/dokument2.csv similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/csv/dokument2.csv rename to tests/test_corpora/mini-swe/gold_export/csv_export/dokument2.csv diff --git a/tests/test_corpora/mini-swe/gold_export/vrt/dokument1.vrt b/tests/test_corpora/mini-swe/gold_export/cwb.vrt/dokument1.vrt similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/vrt/dokument1.vrt rename to tests/test_corpora/mini-swe/gold_export/cwb.vrt/dokument1.vrt diff --git a/tests/test_corpora/mini-swe/gold_export/vrt/dokument2.vrt b/tests/test_corpora/mini-swe/gold_export/cwb.vrt/dokument2.vrt similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/vrt/dokument2.vrt rename to tests/test_corpora/mini-swe/gold_export/cwb.vrt/dokument2.vrt diff --git a/tests/test_corpora/mini-swe/gold_export/vrt_scrambled/dokument1.vrt b/tests/test_corpora/mini-swe/gold_export/cwb.vrt_scrambled/dokument1.vrt similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/vrt_scrambled/dokument1.vrt rename to tests/test_corpora/mini-swe/gold_export/cwb.vrt_scrambled/dokument1.vrt diff --git a/tests/test_corpora/mini-swe/gold_export/vrt_scrambled/dokument2.vrt b/tests/test_corpora/mini-swe/gold_export/cwb.vrt_scrambled/dokument2.vrt similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/vrt_scrambled/dokument2.vrt rename to tests/test_corpora/mini-swe/gold_export/cwb.vrt_scrambled/dokument2.vrt diff --git a/tests/test_corpora/mini-swe/gold_export/frequency_list/stats_mini-swe.csv b/tests/test_corpora/mini-swe/gold_export/frequency_list/stats_mini-swe.csv deleted file mode 100644 index 9371c453..00000000 --- a/tests/test_corpora/mini-swe/gold_export/frequency_list/stats_mini-swe.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c3816781b5312b0b4f14a7f2296012b7970815c611db14570b9567d614f2bc2 -size 3275 diff --git a/tests/test_corpora/mini-swe/gold_export/korp_timespan/timespan.sql b/tests/test_corpora/mini-swe/gold_export/korp.timespan/timespan.sql similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/korp_timespan/timespan.sql rename to tests/test_corpora/mini-swe/gold_export/korp.timespan/timespan.sql diff --git a/tests/test_corpora/mini-swe/gold_export/stats_export.frequency_list_sbx/stats_mini-swe.csv 
b/tests/test_corpora/mini-swe/gold_export/stats_export.frequency_list_sbx/stats_mini-swe.csv new file mode 100644 index 00000000..6fe61151 --- /dev/null +++ b/tests/test_corpora/mini-swe/gold_export/stats_export.frequency_list_sbx/stats_mini-swe.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb94e774fd9f8bc648ba72af240a354bc5cca8ca29b05b739f7c6342beb8187c +size 2826 diff --git a/tests/test_corpora/mini-swe/gold_export/mini-swe.xml b/tests/test_corpora/mini-swe/gold_export/xml_export.combined/mini-swe.xml similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/mini-swe.xml rename to tests/test_corpora/mini-swe/gold_export/xml_export.combined/mini-swe.xml diff --git a/tests/test_corpora/mini-swe/gold_export/xml_preserved_format/dokument1_export.xml b/tests/test_corpora/mini-swe/gold_export/xml_export.preserved_format/dokument1_export.xml similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/xml_preserved_format/dokument1_export.xml rename to tests/test_corpora/mini-swe/gold_export/xml_export.preserved_format/dokument1_export.xml diff --git a/tests/test_corpora/mini-swe/gold_export/xml_preserved_format/dokument2_export.xml b/tests/test_corpora/mini-swe/gold_export/xml_export.preserved_format/dokument2_export.xml similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/xml_preserved_format/dokument2_export.xml rename to tests/test_corpora/mini-swe/gold_export/xml_export.preserved_format/dokument2_export.xml diff --git a/tests/test_corpora/mini-swe/gold_export/xml_pretty/dokument1_export.xml b/tests/test_corpora/mini-swe/gold_export/xml_export.pretty/dokument1_export.xml similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/xml_pretty/dokument1_export.xml rename to tests/test_corpora/mini-swe/gold_export/xml_export.pretty/dokument1_export.xml diff --git a/tests/test_corpora/mini-swe/gold_export/xml_pretty/dokument2_export.xml b/tests/test_corpora/mini-swe/gold_export/xml_export.pretty/dokument2_export.xml similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/xml_pretty/dokument2_export.xml rename to tests/test_corpora/mini-swe/gold_export/xml_export.pretty/dokument2_export.xml diff --git a/tests/test_corpora/mini-swe/gold_export/xml_scrambled/dokument1_export.xml b/tests/test_corpora/mini-swe/gold_export/xml_export.scrambled/dokument1_export.xml similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/xml_scrambled/dokument1_export.xml rename to tests/test_corpora/mini-swe/gold_export/xml_export.scrambled/dokument1_export.xml diff --git a/tests/test_corpora/mini-swe/gold_export/xml_scrambled/dokument2_export.xml b/tests/test_corpora/mini-swe/gold_export/xml_export.scrambled/dokument2_export.xml similarity index 100% rename from tests/test_corpora/mini-swe/gold_export/xml_scrambled/dokument2_export.xml rename to tests/test_corpora/mini-swe/gold_export/xml_export.scrambled/dokument2_export.xml diff --git a/tests/test_corpora/mini-swe/gold_sparv-workdir/cwb.datefirst b/tests/test_corpora/mini-swe/gold_sparv-workdir/cwb.datefirst deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/mini-swe/gold_sparv-workdir/cwb.datelast b/tests/test_corpora/mini-swe/gold_sparv-workdir/cwb.datelast deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/mini-swe/gold_sparv-workdir/dateformat.resolution b/tests/test_corpora/mini-swe/gold_sparv-workdir/dateformat.resolution deleted file mode 100644 index e69de29b..00000000 
diff --git a/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument1/misc.docid b/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument1/misc.fileid similarity index 100% rename from tests/test_corpora/mini-swe/gold_sparv-workdir/dokument1/misc.docid rename to tests/test_corpora/mini-swe/gold_sparv-workdir/dokument1/misc.fileid diff --git a/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument1/segment.token/misc.number_rel_segment.sentence b/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument1/segment.token/stanza.ref similarity index 100% rename from tests/test_corpora/mini-swe/gold_sparv-workdir/dokument1/segment.token/misc.number_rel_segment.sentence rename to tests/test_corpora/mini-swe/gold_sparv-workdir/dokument1/segment.token/stanza.ref diff --git a/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument1/segment.token/stats_export.baseform_first b/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument1/segment.token/stats_export.baseform_first new file mode 100644 index 00000000..198e1cad --- /dev/null +++ b/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument1/segment.token/stats_export.baseform_first @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70a91eb21f90556125e4212d76a16e0848060c3cd9568da41612835db13269aa +size 998 diff --git a/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument2/misc.docid b/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument2/misc.fileid similarity index 100% rename from tests/test_corpora/mini-swe/gold_sparv-workdir/dokument2/misc.docid rename to tests/test_corpora/mini-swe/gold_sparv-workdir/dokument2/misc.fileid diff --git a/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument2/segment.token/misc.number_rel_segment.sentence b/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument2/segment.token/stanza.ref similarity index 100% rename from tests/test_corpora/mini-swe/gold_sparv-workdir/dokument2/segment.token/misc.number_rel_segment.sentence rename to tests/test_corpora/mini-swe/gold_sparv-workdir/dokument2/segment.token/stanza.ref diff --git a/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument2/segment.token/stats_export.baseform_first b/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument2/segment.token/stats_export.baseform_first new file mode 100644 index 00000000..904e5349 --- /dev/null +++ b/tests/test_corpora/mini-swe/gold_sparv-workdir/dokument2/segment.token/stats_export.baseform_first @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdc8609dd77f8993678b75896cb0c5495b1bd292d08b4e297a3fcf47ea20fdb5 +size 88 diff --git a/tests/test_corpora/mini-swe/gold_sparv-workdir/misc.segment.sentence_count b/tests/test_corpora/mini-swe/gold_sparv-workdir/misc.segment.sentence_count deleted file mode 100644 index 6ad44159..00000000 --- a/tests/test_corpora/mini-swe/gold_sparv-workdir/misc.segment.sentence_count +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4523540f1504cd17100c4835e85b7eefd49911580f8efff0599a8f283be6b9e3 -size 2 diff --git a/tests/test_corpora/special-swe/config.yaml b/tests/test_corpora/special-swe/config.yaml index 02c01c34..e0b2747e 100644 --- a/tests/test_corpora/special-swe/config.yaml +++ b/tests/test_corpora/special-swe/config.yaml @@ -9,6 +9,7 @@ metadata: eng: | This test corpus includes: - XML headers + - XML namespaces - re-naming of elements and attributes on import - special source_annotations syntax - sub-token source annotations @@ -18,28 +19,31 @@ metadata: 
#=============================================================================== import: - # The annotation representing one text document. Any text-level annotations will be attached to this annotation. - document_annotation: text + # The annotation representing one text. Any text-level annotations will be attached to this annotation. + text_annotation: text xml_import: # Elements that should be treated as headers, i.e. their contents will not be part of the corpus text header_elements: - - header + - s+header - another-header + - nested:s+n as sparv+id # Header elements and attributes from which we want to extract metadata header_data: - - header/författare as text:author - - header/författare:birth as text:author-birth - - header/författare:death as text:author-death - - header/title/main-title as text:title - - header/title/sub-title as text:subtitle - - header/date as text:date + - s+header/m+författare as text:m+author + - s+header/m+författare:birth as text:author-birth + - s+header/m+författare:death as text:author-death + - s+header/title/m+main-title as text:title + - s+header/title/m+sub-title as text:subtitle + - s+header/m+date as text:date + - s+header/location as text:location elements: - paragraph:n as id - paragraph as p + - text:location skip: - - i - - u:@contents + - sparv+i + - sparv+u:@contents #=============================================================================== # Export Settings @@ -49,21 +53,31 @@ export: # Exports to create by default when running 'sparv run' default: - csv_export:csv - - cwb:info - cwb:vrt - cwb:vrt_scrambled - xml_export:pretty - xml_export:preserved_format - xml_export:scrambled - # Annotations from original documents to include in the output. If nothing is specified, everything is kept. + # Annotations from source files to include in the output. If nothing is specified, everything is kept. source_annotations: + - sparv+b as b + - sparv+x as s+x - dokument as document - not text:subtitle - ... 
- # Headers from the original documents that we want to include in the output - header_annotations: - - header # Automatic annotations to be included in the export annotations: - :misc.id - + - :geo.geo_metadata + +geo: + metadata_source: text:location + +xml_export: + # Headers from the source files that we want to include in the output + header_annotations: + - s+header + +sparv: + compression: none diff --git a/tests/test_corpora/special-swe/gold_export/csv/dokument.csv b/tests/test_corpora/special-swe/gold_export/csv/dokument.csv deleted file mode 100644 index 9bd30a41..00000000 --- a/tests/test_corpora/special-swe/gold_export/csv/dokument.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:04270437721ee41db146941bd082704fa8d71ec4c2086b26a002bf0de42b6c0d -size 619 diff --git a/tests/test_corpora/special-swe/gold_export/csv_export/dokument.csv b/tests/test_corpora/special-swe/gold_export/csv_export/dokument.csv new file mode 100644 index 00000000..dd9a488c --- /dev/null +++ b/tests/test_corpora/special-swe/gold_export/csv_export/dokument.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d64d23098f9b1a8ea39ff4be2772bd2dca0eddee5ae25a60b0902cbe7598d13 +size 650 diff --git a/tests/test_corpora/special-swe/gold_export/cwb.vrt/dokument.vrt b/tests/test_corpora/special-swe/gold_export/cwb.vrt/dokument.vrt new file mode 100644 index 00000000..6c8b7b16 --- /dev/null +++ b/tests/test_corpora/special-swe/gold_export/cwb.vrt/dokument.vrt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6e335f8c0a355552c88999003f143bd049d817b00c13b62e12ded980dd4378 +size 803 diff --git a/tests/test_corpora/special-swe/gold_export/cwb.vrt_scrambled/dokument.vrt b/tests/test_corpora/special-swe/gold_export/cwb.vrt_scrambled/dokument.vrt new file mode 100644 index 00000000..371432b2 --- /dev/null +++ b/tests/test_corpora/special-swe/gold_export/cwb.vrt_scrambled/dokument.vrt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1d42882a96ff112bc36846128043fed97e564bab2404d843bda97b2aa5be27d +size 1000 diff --git a/tests/test_corpora/special-swe/gold_export/vrt/dokument.vrt b/tests/test_corpora/special-swe/gold_export/vrt/dokument.vrt deleted file mode 100644 index f1231e8e..00000000 --- a/tests/test_corpora/special-swe/gold_export/vrt/dokument.vrt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:244a433cafb4940fb946c7270f0534860eb711ccdef50702f69bd258ef8403f8 -size 671 diff --git a/tests/test_corpora/special-swe/gold_export/vrt_scrambled/dokument.vrt b/tests/test_corpora/special-swe/gold_export/vrt_scrambled/dokument.vrt deleted file mode 100644 index 86208a51..00000000 --- a/tests/test_corpora/special-swe/gold_export/vrt_scrambled/dokument.vrt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0eaf8bd16cbadebdb05271f079528e5e6c099e2caee9c3a6c352016b8fc12636 -size 791 diff --git a/tests/test_corpora/special-swe/gold_export/xml_export.preserved_format/dokument_export.xml b/tests/test_corpora/special-swe/gold_export/xml_export.preserved_format/dokument_export.xml new file mode 100644 index 00000000..03c98a82 --- /dev/null +++ b/tests/test_corpora/special-swe/gold_export/xml_export.preserved_format/dokument_export.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b914b0403872f302bd8e72966377bd69d9f0af6d3812320e9d4ea3aee129555 +size 1690 diff --git 
a/tests/test_corpora/special-swe/gold_export/xml_export.pretty/dokument_export.xml b/tests/test_corpora/special-swe/gold_export/xml_export.pretty/dokument_export.xml new file mode 100644 index 00000000..ca250706 --- /dev/null +++ b/tests/test_corpora/special-swe/gold_export/xml_export.pretty/dokument_export.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0dc3bea21f279284888f41ee39fd832d58034439c6fbf96a63e03e1f986e017 +size 2070 diff --git a/tests/test_corpora/special-swe/gold_export/xml_export.scrambled/dokument_export.xml b/tests/test_corpora/special-swe/gold_export/xml_export.scrambled/dokument_export.xml new file mode 100644 index 00000000..9016960c --- /dev/null +++ b/tests/test_corpora/special-swe/gold_export/xml_export.scrambled/dokument_export.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d984dda11e21eef185e0cef1cb72273c77b0f69fe2881c8e19ef4610e8412eb +size 1948 diff --git a/tests/test_corpora/special-swe/gold_export/xml_preserved_format/dokument_export.xml b/tests/test_corpora/special-swe/gold_export/xml_preserved_format/dokument_export.xml deleted file mode 100644 index 7f19ab99..00000000 --- a/tests/test_corpora/special-swe/gold_export/xml_preserved_format/dokument_export.xml +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2f526c080cb793d70074b6742f76d4b9de2a9234789d247696440d0749deb4b3 -size 1354 diff --git a/tests/test_corpora/special-swe/gold_export/xml_pretty/dokument_export.xml b/tests/test_corpora/special-swe/gold_export/xml_pretty/dokument_export.xml deleted file mode 100644 index 20f137d8..00000000 --- a/tests/test_corpora/special-swe/gold_export/xml_pretty/dokument_export.xml +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89f42a5673e2c7df8f1b7bded4a07e0296fd6f687d477876653e259dedd92c9c -size 1729 diff --git a/tests/test_corpora/special-swe/gold_export/xml_scrambled/dokument_export.xml b/tests/test_corpora/special-swe/gold_export/xml_scrambled/dokument_export.xml deleted file mode 100644 index 2dc3c83a..00000000 --- a/tests/test_corpora/special-swe/gold_export/xml_scrambled/dokument_export.xml +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b7c33211c330b8b763cade2532580a1f0be9f566de9a818cfade630b671517c0 -size 1589 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/cwb.datefirst b/tests/test_corpora/special-swe/gold_sparv-workdir/cwb.datefirst deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/cwb.datelast b/tests/test_corpora/special-swe/gold_sparv-workdir/cwb.datelast deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dateformat.resolution b/tests/test_corpora/special-swe/gold_sparv-workdir/dateformat.resolution deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/@headers b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/@headers index e82c79e7..3da1030b 100644 --- a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/@headers +++ b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/@headers @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b354742cc2b6c5502b0b94c1245250ef1e41afd07b77fd43c8cc4e6cbbb64a9a -size 21 +oid sha256:d158c8e03530c7fbed9bb676417ed978ed4b0948052c5b350b8c3fa569089775 +size 23 diff --git 
a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/@namespaces b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/@namespaces new file mode 100644 index 00000000..8ccdb68e --- /dev/null +++ b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/@namespaces @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ccac48dc84bc27421f5066397128b11b93a762827dd9b814a95ebde4e20597a +size 151 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/@structure b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/@structure index ef4b3391..11f3a09a 100644 --- a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/@structure +++ b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/@structure @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2b2213e030d6339e9f87cb301795c6480eae265af14b73c662be7f4a775268d -size 133 +oid sha256:b893c9582e176cb6f64cb2f66e2cafd4a34339048acd05105383a1fe270376e7 +size 178 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/header/contents b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/header/contents deleted file mode 100644 index 8eef0e05..00000000 --- a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/header/contents +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6c9e45b19e1561fd2fa2777e2c33c4cbd720f39e5d11d3412c70beb02181527 -size 229 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/misc.docid b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/misc.fileid similarity index 100% rename from tests/test_corpora/special-swe/gold_sparv-workdir/dokument/misc.docid rename to tests/test_corpora/special-swe/gold_sparv-workdir/dokument/misc.fileid diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/nested/n b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/nested/n index 76b7ceb2..4d357ee2 100644 --- a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/nested/n +++ b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/nested/n @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6e2b7a040683432de03a18fd8a1939a2fdf82585b364bfc874bdd4095c4cae1 -size 4 +oid sha256:ac01d162df437d8a5943a8439e00c15b7294f28ac05e06421ee0bcd878f61079 +size 3 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/nested/s+n b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/nested/s+n new file mode 100644 index 00000000..66b5d533 --- /dev/null +++ b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/nested/s+n @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a0a2b76858bf8cc147613f783d84cd22c0047d142ba0d9ce13fcedf953b57ed +size 3 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/header/@span b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/s+header/@span similarity index 100% rename from tests/test_corpora/special-swe/gold_sparv-workdir/dokument/header/@span rename to tests/test_corpora/special-swe/gold_sparv-workdir/dokument/s+header/@span diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/s+header/contents b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/s+header/contents new file mode 100644 index 00000000..b4d2f05e --- /dev/null +++ b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/s+header/contents @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:341d8de044ab189fcca86c0d57e3460311fe28472a25226d8b94f04289af0a46 +size 429 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/b/@span b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/sparv+b/@span similarity index 100% rename from tests/test_corpora/special-swe/gold_sparv-workdir/dokument/b/@span rename to tests/test_corpora/special-swe/gold_sparv-workdir/dokument/sparv+b/@span diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/x/@span b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/sparv+x/@span similarity index 100% rename from tests/test_corpora/special-swe/gold_sparv-workdir/dokument/x/@span rename to tests/test_corpora/special-swe/gold_sparv-workdir/dokument/sparv+x/@span diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/geo.geo_metadata b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/geo.geo_metadata new file mode 100644 index 00000000..220e9361 --- /dev/null +++ b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/geo.geo_metadata @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb41b5ca8eb59c47dfc5b306641df801bc4d819be81c2432a8d85f5047a21d0b +size 35 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/location b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/location new file mode 100644 index 00000000..8a0a2b01 --- /dev/null +++ b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/location @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1d089df90e2c8f57acdd0c67b076765415c87bf29cd48f54aba7e82b057cb89 +size 11 diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/author b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/m+author similarity index 100% rename from tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/author rename to tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/m+author diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/id b/tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/sparv+id similarity index 100% rename from tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/id rename to tests/test_corpora/special-swe/gold_sparv-workdir/dokument/text/sparv+id diff --git a/tests/test_corpora/special-swe/gold_sparv-workdir/misc.segment.sentence_count b/tests/test_corpora/special-swe/gold_sparv-workdir/misc.segment.sentence_count deleted file mode 100644 index 0d5c17b2..00000000 --- a/tests/test_corpora/special-swe/gold_sparv-workdir/misc.segment.sentence_count +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d -size 1 diff --git a/tests/test_corpora/special-swe/source/dokument.xml b/tests/test_corpora/special-swe/source/dokument.xml index dcba203b..baa390d3 100644 --- a/tests/test_corpora/special-swe/source/dokument.xml +++ b/tests/test_corpora/special-swe/source/dokument.xml @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0eaca44259997fe88f8c2bc573225c3838f0a2f67a71cab7b71cb6374522ada -size 680 +oid sha256:8e1b2590322ef9ff239574f4ee450650c69776d94a4fc3af64d724533a7b9423 +size 1033 diff --git a/tests/test_corpora/standard-swe/config.yaml b/tests/test_corpora/standard-swe/config.yaml index 154f35c5..29d79ff3 100644 --- a/tests/test_corpora/standard-swe/config.yaml +++ 
b/tests/test_corpora/standard-swe/config.yaml @@ -15,6 +15,7 @@ metadata: - all Swedish standard token and text-level annotations except wsd - custom annotations - all kinds of exports except the combined and compressed variants + - removal of namespaces upon import #=============================================================================== @@ -22,13 +23,16 @@ metadata: #=============================================================================== import: - # The annotation representing one text document. Any text-level annotations will be attached to this annotation. - document_annotation: text + # The annotation representing one text. Any text-level annotations will be attached to this annotation. + text_annotation: text xml_import: + # Remove namespaces upon import + remove_namespaces: true # Elements and attributes from the source XML that we want to be available as input for other annotations elements: - text:date + - document:name #=============================================================================== # Annotation Class Settings @@ -61,7 +65,6 @@ export: default: - conll_export:conllu - csv_export:csv - - cwb:info - cwb:vrt - cwb:vrt_scrambled - korp:relations_sql @@ -76,18 +79,29 @@ export: - PARAGRAPH_SWE.all - SWE_DEFAULT.all - not :wsd.sense - - :misc.word.affixed + - :misc.affixed - :custom.convert.upper +stats_export: + annotations: + - + - :custom.convert.upper + source_annotations: + - document:name + + #=============================================================================== # Custom Annotations #=============================================================================== custom_annotations: - annotator: misc:affix params: - out: :misc.word.affixed + out: :misc.affixed chunk: prefix: "|" suffix: "|" - annotator: custom.convert:uppercase + +sparv: + compression: none diff --git a/tests/test_corpora/standard-swe/convert.py b/tests/test_corpora/standard-swe/convert.py index e575f187..d2ed9793 100644 --- a/tests/test_corpora/standard-swe/convert.py +++ b/tests/test_corpora/standard-swe/convert.py @@ -1,6 +1,6 @@ """Example for a custom annotator.""" -from sparv import Annotation, Output, annotator +from sparv.api import Annotation, Output, annotator @annotator("Convert every word to uppercase") diff --git "a/tests/test_corpora/standard-swe/gold_export/conll/r\303\244var.conllu" "b/tests/test_corpora/standard-swe/gold_export/conll_export/r\303\244var.conllu" similarity index 100% rename from "tests/test_corpora/standard-swe/gold_export/conll/r\303\244var.conllu" rename to "tests/test_corpora/standard-swe/gold_export/conll_export/r\303\244var.conllu" diff --git "a/tests/test_corpora/standard-swe/gold_export/csv/r\303\244var.csv" "b/tests/test_corpora/standard-swe/gold_export/csv/r\303\244var.csv" deleted file mode 100644 index c6f6db33..00000000 --- "a/tests/test_corpora/standard-swe/gold_export/csv/r\303\244var.csv" +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e7eac06f3c2ab88b63a6bd4aca8eaf9d5bd9be8135be9be722ab7a510834ec5 -size 63901 diff --git "a/tests/test_corpora/standard-swe/gold_export/csv_export/r\303\244var.csv" "b/tests/test_corpora/standard-swe/gold_export/csv_export/r\303\244var.csv" new file mode 100644 index 00000000..5c83f307 --- /dev/null +++ "b/tests/test_corpora/standard-swe/gold_export/csv_export/r\303\244var.csv" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59566c61cf0f4367df1dc484b01863320a630806c2d414f26e85621e34745125 +size 78501 diff --git 
"a/tests/test_corpora/standard-swe/gold_export/cwb.vrt/r\303\244var.vrt" "b/tests/test_corpora/standard-swe/gold_export/cwb.vrt/r\303\244var.vrt" new file mode 100644 index 00000000..6a4593e5 --- /dev/null +++ "b/tests/test_corpora/standard-swe/gold_export/cwb.vrt/r\303\244var.vrt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb9fd6ee6616de333efcb039e46318965e2cdaed57c2343d67d9b75c6c65c44 +size 77904 diff --git "a/tests/test_corpora/standard-swe/gold_export/cwb.vrt_scrambled/r\303\244var.vrt" "b/tests/test_corpora/standard-swe/gold_export/cwb.vrt_scrambled/r\303\244var.vrt" new file mode 100644 index 00000000..e565921f --- /dev/null +++ "b/tests/test_corpora/standard-swe/gold_export/cwb.vrt_scrambled/r\303\244var.vrt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e809f1a9beb436ec07bf7ff78fb83370eeb85294aeb4ef2abcff851013e8eae0 +size 86864 diff --git a/tests/test_corpora/standard-swe/gold_export/frequency_list/stats_standard-swe.csv b/tests/test_corpora/standard-swe/gold_export/frequency_list/stats_standard-swe.csv deleted file mode 100644 index 04c0fa70..00000000 --- a/tests/test_corpora/standard-swe/gold_export/frequency_list/stats_standard-swe.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:28924e00c31eeb7902282cbbf3937270269d69e55f6eceab7fed9d584d9d1c4e -size 15223 diff --git a/tests/test_corpora/standard-swe/gold_export/korp_timespan/timespan.sql b/tests/test_corpora/standard-swe/gold_export/korp.timespan/timespan.sql similarity index 100% rename from tests/test_corpora/standard-swe/gold_export/korp_timespan/timespan.sql rename to tests/test_corpora/standard-swe/gold_export/korp.timespan/timespan.sql diff --git a/tests/test_corpora/standard-swe/gold_export/korp_wordpicture/relations.sql b/tests/test_corpora/standard-swe/gold_export/korp.wordpicture/relations.sql similarity index 100% rename from tests/test_corpora/standard-swe/gold_export/korp_wordpicture/relations.sql rename to tests/test_corpora/standard-swe/gold_export/korp.wordpicture/relations.sql diff --git a/tests/test_corpora/standard-swe/gold_export/stats_export.frequency_list/stats_standard-swe.csv b/tests/test_corpora/standard-swe/gold_export/stats_export.frequency_list/stats_standard-swe.csv new file mode 100644 index 00000000..08593243 --- /dev/null +++ b/tests/test_corpora/standard-swe/gold_export/stats_export.frequency_list/stats_standard-swe.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4ffec4e1f3151749d722e50a8c482b3a48530058d3f87266a5b7a80eb02a7df +size 7898 diff --git "a/tests/test_corpora/standard-swe/gold_export/vrt/r\303\244var.vrt" "b/tests/test_corpora/standard-swe/gold_export/vrt/r\303\244var.vrt" deleted file mode 100644 index ba928425..00000000 --- "a/tests/test_corpora/standard-swe/gold_export/vrt/r\303\244var.vrt" +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb2889071d785d57384bbbd2d4fbeaf328e7ca86e2d6eade2a86c799931fe04f -size 63311 diff --git "a/tests/test_corpora/standard-swe/gold_export/vrt_scrambled/r\303\244var.vrt" "b/tests/test_corpora/standard-swe/gold_export/vrt_scrambled/r\303\244var.vrt" deleted file mode 100644 index 9888d11f..00000000 --- "a/tests/test_corpora/standard-swe/gold_export/vrt_scrambled/r\303\244var.vrt" +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e8e530969746bce8fab4628ce5a58d33c9b7fb938c6d6a9dfe4b0fc214720b91 -size 72251 diff --git 
"a/tests/test_corpora/standard-swe/gold_export/xml_export.preserved_format/r\303\244var_export.xml" "b/tests/test_corpora/standard-swe/gold_export/xml_export.preserved_format/r\303\244var_export.xml" new file mode 100644 index 00000000..ddc000bb --- /dev/null +++ "b/tests/test_corpora/standard-swe/gold_export/xml_export.preserved_format/r\303\244var_export.xml" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1efed855daf7a56bd7be0de4b395882172bde57902014d289d8e26c1c946ce42 +size 155490 diff --git "a/tests/test_corpora/standard-swe/gold_export/xml_export.pretty/r\303\244var_export.xml" "b/tests/test_corpora/standard-swe/gold_export/xml_export.pretty/r\303\244var_export.xml" new file mode 100644 index 00000000..d24fa482 --- /dev/null +++ "b/tests/test_corpora/standard-swe/gold_export/xml_export.pretty/r\303\244var_export.xml" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55da678047891e7f3075b1bdf7637f23efbacc6d648a8198c34e8ceb9bbd3f93 +size 160491 diff --git "a/tests/test_corpora/standard-swe/gold_export/xml_export.scrambled/r\303\244var_export.xml" "b/tests/test_corpora/standard-swe/gold_export/xml_export.scrambled/r\303\244var_export.xml" new file mode 100644 index 00000000..190782f5 --- /dev/null +++ "b/tests/test_corpora/standard-swe/gold_export/xml_export.scrambled/r\303\244var_export.xml" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3842496a369f2c3ea4023a93367b53d70909620746e861fd78a6ac5073a45e1 +size 169699 diff --git "a/tests/test_corpora/standard-swe/gold_export/xml_preserved_format/r\303\244var_export.xml" "b/tests/test_corpora/standard-swe/gold_export/xml_preserved_format/r\303\244var_export.xml" deleted file mode 100644 index f2b21039..00000000 --- "a/tests/test_corpora/standard-swe/gold_export/xml_preserved_format/r\303\244var_export.xml" +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe0b44c48db349a3fe5b4aa5a195e23007fc8e0dde0fe2f9fb2e1092d3030acd -size 136411 diff --git "a/tests/test_corpora/standard-swe/gold_export/xml_pretty/r\303\244var_export.xml" "b/tests/test_corpora/standard-swe/gold_export/xml_pretty/r\303\244var_export.xml" deleted file mode 100644 index c34c5243..00000000 --- "a/tests/test_corpora/standard-swe/gold_export/xml_pretty/r\303\244var_export.xml" +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a650546e5faaa0df8c815a27a9334782a5472b1093fadf169e80270fc6231c2b -size 141412 diff --git "a/tests/test_corpora/standard-swe/gold_export/xml_scrambled/r\303\244var_export.xml" "b/tests/test_corpora/standard-swe/gold_export/xml_scrambled/r\303\244var_export.xml" deleted file mode 100644 index cc18fcfe..00000000 --- "a/tests/test_corpora/standard-swe/gold_export/xml_scrambled/r\303\244var_export.xml" +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3a5621e525431276e23a85955dd6b24dbea53fd64c893482d126ec2f979949f5 -size 150460 diff --git a/tests/test_corpora/standard-swe/gold_sparv-workdir/cwb.datefirst b/tests/test_corpora/standard-swe/gold_sparv-workdir/cwb.datefirst deleted file mode 100644 index 096b1b7d..00000000 --- a/tests/test_corpora/standard-swe/gold_sparv-workdir/cwb.datefirst +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:451315ca45a6af259a28dab197c1df50637bd28037f6e8a848b5f8bad013cf29 -size 19 diff --git a/tests/test_corpora/standard-swe/gold_sparv-workdir/cwb.datelast b/tests/test_corpora/standard-swe/gold_sparv-workdir/cwb.datelast 
deleted file mode 100644 index c22b9e16..00000000 --- a/tests/test_corpora/standard-swe/gold_sparv-workdir/cwb.datelast +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:819bb57984c05e5e370a32a43612dc879e08478433caf51be6325951846afb04 -size 19 diff --git a/tests/test_corpora/standard-swe/gold_sparv-workdir/dateformat.resolution b/tests/test_corpora/standard-swe/gold_sparv-workdir/dateformat.resolution deleted file mode 100644 index a4c1de80..00000000 --- a/tests/test_corpora/standard-swe/gold_sparv-workdir/dateformat.resolution +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:545090164d18915eaef2b742d3725095c612850780ceb07f249b91a168f1c9ca -size 3 diff --git a/tests/test_corpora/standard-swe/gold_sparv-workdir/misc.segment.sentence_count b/tests/test_corpora/standard-swe/gold_sparv-workdir/misc.segment.sentence_count deleted file mode 100644 index 76f682f1..00000000 --- a/tests/test_corpora/standard-swe/gold_sparv-workdir/misc.segment.sentence_count +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb1e33e8a81b697b75855af6bfcdbcbf7cbbde9f94962ceaec1ed8af21f5a50f -size 2 diff --git "a/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/misc.docid" "b/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/misc.fileid" similarity index 100% rename from "tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/misc.docid" rename to "tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/misc.fileid" diff --git "a/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.paragraph/geo.geo_context" "b/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.paragraph/geo.geo_context" index 559d2086..c30fd759 100644 --- "a/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.paragraph/geo.geo_context" +++ "b/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.paragraph/geo.geo_context" @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccc9876f3b9fac885817cb49817891e496c04cb00fb19f4638ad6253ca880fce -size 9 +oid sha256:22558efb5d77fecb12ddd802231f223976f7c3fffb8a35585312098da0ec533c +size 16 diff --git "a/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.token/misc.word.affixed" "b/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.token/misc.affixed" similarity index 100% rename from "tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.token/misc.word.affixed" rename to "tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.token/misc.affixed" diff --git "a/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.token/misc.ufeats" "b/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.token/misc.ufeats" deleted file mode 100644 index d44bc051..00000000 --- "a/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.token/misc.ufeats" +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0bc4d0e6466c7c53aadfbe4921402b49c007630454d11f886558ce2f176465b4 -size 14586 diff --git "a/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.token/misc.number_rel_segment.sentence" "b/tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.token/stanza.ref" similarity index 100% rename from 
"tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.token/misc.number_rel_segment.sentence" rename to "tests/test_corpora/standard-swe/gold_sparv-workdir/r\303\244var/segment.token/stanza.ref" diff --git "a/tests/test_corpora/standard-swe/source/r\303\244var.xml" "b/tests/test_corpora/standard-swe/source/r\303\244var.xml" index 53e9131f..bd5c1cec 100644 --- "a/tests/test_corpora/standard-swe/source/r\303\244var.xml" +++ "b/tests/test_corpora/standard-swe/source/r\303\244var.xml" @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ed52bc7ea74602aaec87570b27138f01d39ccc608b087d819e4accba0c82d61 -size 3248 +oid sha256:2a55835b7d4cf3e30ac5725355dbd65f92ea6c63c35d5ef5cd00777ebb708da0 +size 3351 diff --git a/tests/test_corpora/stanford-eng/config.yaml b/tests/test_corpora/stanford-eng/config.yaml index 3d2bd185..df7db2cd 100644 --- a/tests/test_corpora/stanford-eng/config.yaml +++ b/tests/test_corpora/stanford-eng/config.yaml @@ -8,7 +8,7 @@ metadata: # Corpus name (human readable) name: eng: Stanford Parser test corpus - # Language of the input documents, specified as ISO 639-3 code + # Language of the source files, specified as ISO 639-3 code language: eng description: @@ -22,8 +22,8 @@ metadata: #=============================================================================== import: - # The annotation representing one text document. Any text-level annotations will be attached to this annotation. - document_annotation: text + # The annotation representing one text. Any text-level annotations will be attached to this annotation. + text_annotation: text #=============================================================================== @@ -34,13 +34,15 @@ export: # Exports to create by default when running 'sparv run' default: - korp:timespan_sql - - cwb:info - csv_export:csv - cwb:vrt - cwb:vrt_scrambled - - stats_export:freq_list_simple + - stats_export:sbx_freq_list_simple - xml_export:pretty - xml_export:preserved_format # Automatic annotations to be included in the export annotations: - STANFORD.all + +sparv: + compression: none diff --git a/tests/test_corpora/stanford-eng/gold_export/csv/wikipedia.csv b/tests/test_corpora/stanford-eng/gold_export/csv_export/wikipedia.csv similarity index 100% rename from tests/test_corpora/stanford-eng/gold_export/csv/wikipedia.csv rename to tests/test_corpora/stanford-eng/gold_export/csv_export/wikipedia.csv diff --git a/tests/test_corpora/stanford-eng/gold_export/vrt/wikipedia.vrt b/tests/test_corpora/stanford-eng/gold_export/cwb.vrt/wikipedia.vrt similarity index 100% rename from tests/test_corpora/stanford-eng/gold_export/vrt/wikipedia.vrt rename to tests/test_corpora/stanford-eng/gold_export/cwb.vrt/wikipedia.vrt diff --git a/tests/test_corpora/stanford-eng/gold_export/vrt_scrambled/wikipedia.vrt b/tests/test_corpora/stanford-eng/gold_export/cwb.vrt_scrambled/wikipedia.vrt similarity index 100% rename from tests/test_corpora/stanford-eng/gold_export/vrt_scrambled/wikipedia.vrt rename to tests/test_corpora/stanford-eng/gold_export/cwb.vrt_scrambled/wikipedia.vrt diff --git a/tests/test_corpora/stanford-eng/gold_export/frequency_list/stats_stanford-eng.csv b/tests/test_corpora/stanford-eng/gold_export/frequency_list/stats_stanford-eng.csv deleted file mode 100644 index 4da14467..00000000 --- a/tests/test_corpora/stanford-eng/gold_export/frequency_list/stats_stanford-eng.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:d50ca3a03d81bfde67dedd8eabc1977049e3218c2bcd383702900aec103b89c0 -size 5654 diff --git a/tests/test_corpora/stanford-eng/gold_export/korp_timespan/timespan.sql b/tests/test_corpora/stanford-eng/gold_export/korp.timespan/timespan.sql similarity index 100% rename from tests/test_corpora/stanford-eng/gold_export/korp_timespan/timespan.sql rename to tests/test_corpora/stanford-eng/gold_export/korp.timespan/timespan.sql diff --git a/tests/test_corpora/stanford-eng/gold_export/stats_export.frequency_list_sbx/stats_stanford-eng.csv b/tests/test_corpora/stanford-eng/gold_export/stats_export.frequency_list_sbx/stats_stanford-eng.csv new file mode 100644 index 00000000..2971b9f7 --- /dev/null +++ b/tests/test_corpora/stanford-eng/gold_export/stats_export.frequency_list_sbx/stats_stanford-eng.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5e2fe8f6d57cfd1901f8bea98c80ffdf93e284c42b063b5105152d105b10f06 +size 4920 diff --git a/tests/test_corpora/stanford-eng/gold_export/xml_preserved_format/wikipedia_export.xml b/tests/test_corpora/stanford-eng/gold_export/xml_export.preserved_format/wikipedia_export.xml similarity index 100% rename from tests/test_corpora/stanford-eng/gold_export/xml_preserved_format/wikipedia_export.xml rename to tests/test_corpora/stanford-eng/gold_export/xml_export.preserved_format/wikipedia_export.xml diff --git a/tests/test_corpora/stanford-eng/gold_export/xml_pretty/wikipedia_export.xml b/tests/test_corpora/stanford-eng/gold_export/xml_export.pretty/wikipedia_export.xml similarity index 100% rename from tests/test_corpora/stanford-eng/gold_export/xml_pretty/wikipedia_export.xml rename to tests/test_corpora/stanford-eng/gold_export/xml_export.pretty/wikipedia_export.xml diff --git a/tests/test_corpora/stanford-eng/gold_sparv-workdir/cwb.datefirst b/tests/test_corpora/stanford-eng/gold_sparv-workdir/cwb.datefirst deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/stanford-eng/gold_sparv-workdir/cwb.datelast b/tests/test_corpora/stanford-eng/gold_sparv-workdir/cwb.datelast deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/stanford-eng/gold_sparv-workdir/dateformat.resolution b/tests/test_corpora/stanford-eng/gold_sparv-workdir/dateformat.resolution deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/stanford-eng/gold_sparv-workdir/misc.stanford.sentence_count b/tests/test_corpora/stanford-eng/gold_sparv-workdir/misc.stanford.sentence_count deleted file mode 100644 index 7a47a698..00000000 --- a/tests/test_corpora/stanford-eng/gold_sparv-workdir/misc.stanford.sentence_count +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:785f3ec7eb32f30b90cd0fcf3657d388b5ff4297f2f9716ff66e9b69c05ddd09 -size 2 diff --git a/tests/test_corpora/stanford-eng/gold_sparv-workdir/wikipedia/misc.docid b/tests/test_corpora/stanford-eng/gold_sparv-workdir/wikipedia/misc.fileid similarity index 100% rename from tests/test_corpora/stanford-eng/gold_sparv-workdir/wikipedia/misc.docid rename to tests/test_corpora/stanford-eng/gold_sparv-workdir/wikipedia/misc.fileid diff --git a/tests/test_corpora/stanford-eng/gold_sparv-workdir/wikipedia/stanford.token/stanford.word b/tests/test_corpora/stanford-eng/gold_sparv-workdir/wikipedia/stanford.token/misc.word similarity index 100% rename from tests/test_corpora/stanford-eng/gold_sparv-workdir/wikipedia/stanford.token/stanford.word rename to 
tests/test_corpora/stanford-eng/gold_sparv-workdir/wikipedia/stanford.token/misc.word diff --git a/tests/test_corpora/swe-1800/config.yaml b/tests/test_corpora/swe-1800/config.yaml new file mode 100644 index 00000000..b549cf6a --- /dev/null +++ b/tests/test_corpora/swe-1800/config.yaml @@ -0,0 +1,43 @@ +#=============================================================================== +# Meta Data +#=============================================================================== +metadata: + id: swe-1800 + language: swe + variety: "1800" + description: + eng: | + This test corpus is for testing the standard annotations for Swedish + from the 1800's. + +#=============================================================================== +# Import Settings +#=============================================================================== +import: + text_annotation: text + +#=============================================================================== +# Export Settings +#=============================================================================== +export: + # Use the classes and annotations from the SWE_1800 preset + annotations: + - SWE_1800.all + # Exports to create by default when running 'sparv run' + default: + - csv_export:csv + - conll_export:conllu + - cwb:vrt + - stats_export:sbx_freq_list_simple_swe + - xml_export:pretty + - xml_export:preserved_format + +#=============================================================================== +# Module Settings +#=============================================================================== +segment: + # Sentences are pre-segmented by linebreaks + sentence_segmenter: linebreaks + +sparv: + compression: none diff --git a/tests/test_corpora/swe-1800/gold_export/conll_export/boaorm.conllu b/tests/test_corpora/swe-1800/gold_export/conll_export/boaorm.conllu new file mode 100644 index 00000000..53e6792d --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_export/conll_export/boaorm.conllu @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bf8a5d015bd732b0589191f3312ff337ae023a6215c810ffe40b29f740d5f10 +size 15878 diff --git a/tests/test_corpora/swe-1800/gold_export/csv_export/boaorm.csv b/tests/test_corpora/swe-1800/gold_export/csv_export/boaorm.csv new file mode 100644 index 00000000..f3712be3 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_export/csv_export/boaorm.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:523ccbfe8ba51f3c52b0619bf322f8a7a575300f7155f6dc2af2d7762091dca0 +size 28327 diff --git a/tests/test_corpora/swe-1800/gold_export/cwb.vrt/boaorm.vrt b/tests/test_corpora/swe-1800/gold_export/cwb.vrt/boaorm.vrt new file mode 100644 index 00000000..9c9c03b4 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_export/cwb.vrt/boaorm.vrt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4162d59fa60bdfd119c57a9d20f02c92c6a3439089c51d68b0416545766d7721 +size 28112 diff --git a/tests/test_corpora/swe-1800/gold_export/stats_export.frequency_list_sbx/stats_swe-1800.csv b/tests/test_corpora/swe-1800/gold_export/stats_export.frequency_list_sbx/stats_swe-1800.csv new file mode 100644 index 00000000..85d85b5d --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_export/stats_export.frequency_list_sbx/stats_swe-1800.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68d625518ffa859fa5fa33296af1886eddaf369dadb24e68081c85fa4cd4e792 +size 2951 diff --git a/tests/test_corpora/swe-1800/gold_export/xml_export.preserved_format/boaorm_export.xml 
b/tests/test_corpora/swe-1800/gold_export/xml_export.preserved_format/boaorm_export.xml new file mode 100644 index 00000000..8adecb5b --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_export/xml_export.preserved_format/boaorm_export.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f60068d8e2b6754f4683b439715ea8e5eeeea154cbe359e34f3fe030793efd6f +size 53143 diff --git a/tests/test_corpora/swe-1800/gold_export/xml_export.pretty/boaorm_export.xml b/tests/test_corpora/swe-1800/gold_export/xml_export.pretty/boaorm_export.xml new file mode 100644 index 00000000..dc96d84a --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_export/xml_export.pretty/boaorm_export.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e4b23c62f5f7b9848d304ea9976db4abf0182d95ea1b4de9b41e64c0a8ab7af +size 54207 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/@structure b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/@structure new file mode 100644 index 00000000..e241c2c6 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/@structure @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38aaa4cd25ba6c0f4cac3e69d4f32d717b12f89853a097f69e4bc8469b57a9bc +size 25 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/@text b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/@text new file mode 100644 index 00000000..c2f72edc --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/@text @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:608dd0c06f8cc09bcaecb80be638b66fc1ed14cf52102689c2ec33e5c551f370 +size 1362 diff --git a/tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/misc.s_count b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/misc.fileid similarity index 100% rename from tests/test_corpora/freeling-eng-slevel/gold_sparv-workdir/misc.s_count rename to tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/misc.fileid diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.sentence/@span b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.sentence/@span new file mode 100644 index 00000000..58c35486 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.sentence/@span @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:448e8172b0c928ef7852d3b5f0d9a6079250a0a3c9a0d090a5858038cfd1cff5 +size 89 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.sentence/geo.geo_context b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.sentence/geo.geo_context new file mode 100644 index 00000000..78a05221 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.sentence/geo.geo_context @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee9747704eaa831b8c20010f676ac3448d7cd12c2d1824768dc677dd35796fad +size 72 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.sentence/misc.id b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.sentence/misc.id new file mode 100644 index 00000000..e29c376f --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.sentence/misc.id @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f59cb864df8bdfc983d2c79a62de69f635d48af79c080ee2c9004e91395cc26b +size 44 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/@span 
b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/@span new file mode 100644 index 00000000..9b34c258 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/@span @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b861266264deb3d416e8fe9ce23c875279e495dedfd91ec118229d6b1dde7f9 +size 1857 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.baseform b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.baseform new file mode 100644 index 00000000..a2f9be34 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.baseform @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f7c31460e519cd752940fa0a1bd28fb2948a755a258a8623fe6ef7d2d61bbdb +size 3707 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.combined_lemgrams b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.combined_lemgrams new file mode 100644 index 00000000..f03fff0f --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.combined_lemgrams @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26a62e9d2464779dcd694125a5960a5bef394f015b1d9aeaa4b1d55889cd3e01 +size 6123 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.diapivot b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.diapivot new file mode 100644 index 00000000..b00cc61d --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.diapivot @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6f653a7d55eece3d1153ecb9bccf242c66326fee406e00060b4e3b0b22e14f1 +size 1277 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.lemgram b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.lemgram new file mode 100644 index 00000000..167dd833 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.lemgram @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2ff78ad0ab80bc76d4e9f03b23b1ae34d578e7fcd63d17d104b287d239056e7 +size 6005 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.sense b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.sense new file mode 100644 index 00000000..c12d0a82 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hist.sense @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6ec4aabd78eb22bbfaab97cf45ed91463fd7025114b9f711aafb379b1fcb45a +size 2843 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hunpos.msd_hist b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hunpos.msd_hist new file mode 100644 index 00000000..b3cee657 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hunpos.msd_hist @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74a274391d4c10f4465f78a245f324fabacdafb3f032be3f6c8138c7261e6ab6 +size 2882 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hunpos.pos b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hunpos.pos new file mode 100644 index 00000000..5c63cba7 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/hunpos.pos 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f49a734ec857629813f48f06c50930685a86c55c66740940221aa46aa4b9f45 +size 673 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/lexical_classes.blingbring b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/lexical_classes.blingbring new file mode 100644 index 00000000..05eb67d9 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/lexical_classes.blingbring @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e515d49ef32c60b95d54525729430295f8399a5daccfd17c54d010391cc84d1 +size 4995 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/lexical_classes.swefn b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/lexical_classes.swefn new file mode 100644 index 00000000..a21910c6 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/lexical_classes.swefn @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d63aeb5bdf7d5533a8b3c2d038439ff89cc4039abe174b11359d288310cfcfae +size 1582 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/misc.word b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/misc.word new file mode 100644 index 00000000..0bf04bb3 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/misc.word @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:746a8f2286570884ff7f4ded2696ae34d463a36b2a2d2310c78b0e4124addec1 +size 1362 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/sensaldo.sentiment_label b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/sensaldo.sentiment_label new file mode 100644 index 00000000..389e9929 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/sensaldo.sentiment_label @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ba35b5e90ee47c329d50cc16ba6ad29c83033e198b31ec8af02e160c2b8039f +size 579 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/sensaldo.sentiment_score b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/sensaldo.sentiment_score new file mode 100644 index 00000000..68a582d7 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/sensaldo.sentiment_score @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02c7591f2131e48daca327ee769d8254f0ae34f0031fe7cd290eda091d991bad +size 275 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.baseform b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.baseform new file mode 100644 index 00000000..63fb99f1 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.baseform @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3757b15418bda8570308b413319f67255587a3fa5d4b576af594fd6abe67c53 +size 1285 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.dephead b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.dephead new file mode 100644 index 00000000..666082ca --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.dephead @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:baeeda158b87f33f1c005e991884a583bd8fc9178083bf947a7df77d2b47c7c3 +size 769 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.dephead_ref b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.dephead_ref new file mode 100644 index 00000000..69f98d9e --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.dephead_ref @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56f5e84ac90e51e0e061060727b0ad98febabf8d4f7c0a53a029f341e2b92468 +size 564 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.deprel b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.deprel new file mode 100644 index 00000000..02f73666 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.deprel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a01c4d75c5f4d9f2d6b01089ac084500fcb3d15baa9f95c33bae2a28c28a3a13 +size 694 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.msd b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.msd new file mode 100644 index 00000000..a3e8b918 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.msd @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33388f10b295b7c2a31c677c9b294adc39c997b642e321c52a40e6d9cefef0ca +size 2875 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.pos b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.pos new file mode 100644 index 00000000..160c11fc --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.pos @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59d75d3e17f3a9527236eca408d0c3e364ce49707390ffffee9ce34e046acf70 +size 674 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.ref b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.ref new file mode 100644 index 00000000..6580414f --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.ref @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f125d491fbf5a9927874aa5928d784a82731d8a74bf3f45c0d64504faf348090 +size 587 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.ufeats b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.ufeats new file mode 100644 index 00000000..27d90a87 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stanza.ufeats @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ec1e6c67410135e2795115e268cdaa810b215c6c1ff82a8c12dbca9e9835504 +size 6891 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stats_export.baseform_first b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stats_export.baseform_first new file mode 100644 index 00000000..1ea9e92a --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/segment.token/stats_export.baseform_first @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7167beb3c65e6b894f18e8b7e741218c4635a633950da85fc43a413f851554f8 +size 1300 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/@span 
b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/@span new file mode 100644 index 00000000..f4260a5c --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/@span @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd3abf63fe4ff71ae1a9344e6ea177aa336a4ee4698f0c31f466b542bfedc0b +size 63 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.ex b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.ex new file mode 100644 index 00000000..de998705 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.ex @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05a9af0c3507de450816631e988dfea0bbcb99f3536f822a4d370f807630d8cd +size 55 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.name b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.name new file mode 100644 index 00000000..98972612 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.name @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55287fa37c25dd4ae472e6fc89224cfa8e4ce297d700877cc8f71e619bfa43e3 +size 69 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.subtype b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.subtype new file mode 100644 index 00000000..1d1e1a29 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.subtype @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:920d0f1d10c5340a3913c9016928935b40b829ffa38d76111bfd1974c9740c64 +size 32 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.type b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.type new file mode 100644 index 00000000..86714c85 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/swener.ne/swener.type @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f96af8a49ae3e2aaa0af6df88fbf557c63ce84e2e3144692c3865333a35f3c3 +size 32 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/@span b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/@span new file mode 100644 index 00000000..37c01f7d --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/@span @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11569fdf520ee1685bea0cf761d78bc235dc3e2d3ccbd3bb289325c71792d08d +size 11 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/date b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/date new file mode 100644 index 00000000..c8b354c3 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/date @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:299d1648c44d74a812551129e3076ab7c3bc934017b1f0c1a18c6b46aa759121 +size 11 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/lexical_classes.blingbring b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/lexical_classes.blingbring new file mode 100644 index 00000000..45f61089 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/lexical_classes.blingbring @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:823d2060528921f4793970ab02722c742800b252fca53b95c64b4dcb151011f6 +size 61 diff --git 
a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/lexical_classes.swefn b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/lexical_classes.swefn new file mode 100644 index 00000000..866c76b3 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/lexical_classes.swefn @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3b84406e6f1bf1ba0cb0d9d39045de7a487edd4096dd1523bb25fa45ea4a967 +size 71 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/readability.lix b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/readability.lix new file mode 100644 index 00000000..89d80e3c --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/readability.lix @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbb2ec124df46f54aad6bc60dc7d8364b7bea23ebea134445849d44aeb78bcb4 +size 6 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/readability.nk b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/readability.nk new file mode 100644 index 00000000..f25430a0 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/readability.nk @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b279feaf74c88392db20f72ccdbe579e0c97727fde1a8f261ac59ede8951ef8 +size 5 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/readability.ovix b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/readability.ovix new file mode 100644 index 00000000..11497649 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/readability.ovix @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29b230002e78f8f9cdafad1aa6197f3d20fc66af02baac07802cda5eea304273 +size 6 diff --git a/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/title b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/title new file mode 100644 index 00000000..96841db9 --- /dev/null +++ b/tests/test_corpora/swe-1800/gold_sparv-workdir/boaorm/text/title @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42cecbf3c3e8f8c916f3baa053feaae9cca659d4241e1dc82755d60d5207d6d8 +size 20 diff --git a/tests/test_corpora/swe-1800/source/boaorm.xml b/tests/test_corpora/swe-1800/source/boaorm.xml new file mode 100644 index 00000000..973901c9 --- /dev/null +++ b/tests/test_corpora/swe-1800/source/boaorm.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:052c31d87f663b0c28a1f46cdd764546514f5baeb0810a98d141b90b99fa1bd3 +size 1421 diff --git a/tests/test_corpora/swe-fsv/config.yaml b/tests/test_corpora/swe-fsv/config.yaml new file mode 100644 index 00000000..3ca4b3d0 --- /dev/null +++ b/tests/test_corpora/swe-fsv/config.yaml @@ -0,0 +1,35 @@ +#=============================================================================== +# Meta Data +#=============================================================================== +metadata: + id: swe-fsv + language: swe + variety: fsv + description: + eng: | + This test corpus is for testing the standard annotations for Old + Swedish. 
+ +#=============================================================================== +# Import Settings +#=============================================================================== +import: + text_annotation: text + +#=============================================================================== +# Export Settings +#=============================================================================== +export: + # Use the classes and annotations from the SWE_FSV preset + annotations: + - SWE_FSV.all + # Exports to create by default when running 'sparv run' + default: + - csv_export:csv + - cwb:vrt + - stats_export:sbx_freq_list_fsv + - xml_export:pretty + - xml_export:preserved_format + +sparv: + compression: none diff --git a/tests/test_corpora/swe-fsv/gold_export/csv_export/fsv.csv b/tests/test_corpora/swe-fsv/gold_export/csv_export/fsv.csv new file mode 100644 index 00000000..bd71a752 --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_export/csv_export/fsv.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa6780f6d3713bc0f8f60134484d5593617eb4a05c617fff3c53fe9a1a46081e +size 20004 diff --git a/tests/test_corpora/swe-fsv/gold_export/cwb.vrt/fsv.vrt b/tests/test_corpora/swe-fsv/gold_export/cwb.vrt/fsv.vrt new file mode 100644 index 00000000..6c5ef07d --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_export/cwb.vrt/fsv.vrt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d7318d7c08e9b2d7dcf0f73bbcb68be39ac35ad77f0740e33c5505f7a5aed3d +size 20155 diff --git a/tests/test_corpora/swe-fsv/gold_export/stats_export.frequency_list_sbx/stats_swe-fsv.csv b/tests/test_corpora/swe-fsv/gold_export/stats_export.frequency_list_sbx/stats_swe-fsv.csv new file mode 100644 index 00000000..3739da9b --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_export/stats_export.frequency_list_sbx/stats_swe-fsv.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b3437607d8967b50fb5e24028cd61257e36ed80b0f5b45b77568d5a8651bb3c +size 10655 diff --git a/tests/test_corpora/swe-fsv/gold_export/xml_export.preserved_format/fsv_export.xml b/tests/test_corpora/swe-fsv/gold_export/xml_export.preserved_format/fsv_export.xml new file mode 100644 index 00000000..94336e93 --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_export/xml_export.preserved_format/fsv_export.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7b9ca23c2e10c78a7de225218d389ae434e5ab076b59bfe5fc9cfc4a8a3e814 +size 30795 diff --git a/tests/test_corpora/swe-fsv/gold_export/xml_export.pretty/fsv_export.xml b/tests/test_corpora/swe-fsv/gold_export/xml_export.pretty/fsv_export.xml new file mode 100644 index 00000000..8bd529d1 --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_export/xml_export.pretty/fsv_export.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8b7a42af1fd0470eda3878f0d434d5559528f7e2ade6f6f8fca450febc65e2c +size 31515 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/@structure b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/@structure new file mode 100644 index 00000000..bc5ed5d7 --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/@structure @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1802ac5411eb8a9aa339b2282defd2337b96b8577f2d82f932eff8614456f8c +size 15 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/@text b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/@text new file mode 100644 index 00000000..7dc6f58a --- /dev/null +++
b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/@text @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8732463cc10257021a4b0329bc9605d4c25490b340dc46453231ceb6804c9365 +size 755 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/misc.fileid b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/misc.fileid new file mode 100644 index 00000000..8066cd7c --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/misc.fileid @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19581e27de7ced00ff1ce50b2047e7a567c76b1cbaebabe5ef03f7c3017bb5b7 +size 1 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.sentence/@span b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.sentence/@span new file mode 100644 index 00000000..73db8a11 --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.sentence/@span @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c3467711d1132fedf168ca99c27f7ec6b96ab56da3aab93e12421f67c749be6 +size 140 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.sentence/misc.id b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.sentence/misc.id new file mode 100644 index 00000000..2f7dbeb2 --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.sentence/misc.id @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db4059c2cb213a218f0ab7b9641963aeab52daef866c687cc73621133ad19f80 +size 72 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/@span b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/@span new file mode 100644 index 00000000..1fbc3a58 --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/@span @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d683ef11b4ac34dc58c4cfca62a12b6adaf36fd23b2259fe0ab0ca3b0770418 +size 1144 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.all_spelling_variants b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.all_spelling_variants new file mode 100644 index 00000000..b72a4f9b --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.all_spelling_variants @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27136fbeb819ae858416ae0a40c765a6fcc73cea439efd18590c2da8140e5275 +size 2891 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.baseform b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.baseform new file mode 100644 index 00000000..ea031409 --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.baseform @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a521031c4ad1b0683d7fd490426a6feaad0ce318b04c8a6c107aac0387e52cd7 +size 2214 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.combined_lemgrams b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.combined_lemgrams new file mode 100644 index 00000000..6b1c2c7f --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.combined_lemgrams @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f59861f894c25891bd8422b7212442454b30fe7cd6065289dea77d6e1269a0cb +size 13398 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.diapivot 
b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.diapivot new file mode 100644 index 00000000..b970823e --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.diapivot @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42c21dba83da3eec44994c931c03a0a9eef0697b57eabd4b7a20991abb93d1bd +size 4018 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.homograph_set b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.homograph_set new file mode 100644 index 00000000..136392dd --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.homograph_set @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adecad8f57614912d35b673ae984d7306e70167c0a2cff603ecf750e0843069a +size 1034 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.lemgram b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.lemgram new file mode 100644 index 00000000..81c5838b --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.lemgram @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3ff7d55e99a9f38a5e8dda3c809c90dd09a7ae5c585857d656ca6f097096c42 +size 9676 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.sense b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.sense new file mode 100644 index 00000000..48372de6 --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.sense @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d9cfa8a5adeb0a0fd21e48a5420f95b402ee055f78aa35f35e9749ab3204d82 +size 296 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.spelling_variants b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.spelling_variants new file mode 100644 index 00000000..16c6e28e --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/hist.spelling_variants @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25997cf3d7469f5c59c3f6f974d474326ea8ff0b60c36394e795a22a3151adc6 +size 2118 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/misc.ref b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/misc.ref new file mode 100644 index 00000000..5682dceb --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/misc.ref @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:032a768231494e0da4a8936d7f1c31c0eea0f1b052acc2b4021ce87b955ffbab +size 327 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/misc.word b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/misc.word new file mode 100644 index 00000000..1026edfc --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/segment.token/misc.word @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5139e186b1339e97a48c533c172b1097ece9892e8f84292ecaf07e5452872703 +size 773 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/text/@span b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/text/@span new file mode 100644 index 00000000..e3bc032b --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/text/@span @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b480bca7c8bc4091e6cb495603813bc43fb6d2d1f70f5d0a1da0c5a61844827 
+size 10 diff --git a/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/text/title b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/text/title new file mode 100644 index 00000000..98512cb9 --- /dev/null +++ b/tests/test_corpora/swe-fsv/gold_sparv-workdir/fsv/text/title @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55e978332f1969b9b18d95582022456c5fd80bc0a728a867b530a9762cb804aa +size 20 diff --git a/tests/test_corpora/swe-fsv/source/fsv.xml b/tests/test_corpora/swe-fsv/source/fsv.xml new file mode 100644 index 00000000..76d4a9c2 --- /dev/null +++ b/tests/test_corpora/swe-fsv/source/fsv.xml @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e4980ab8837f324c7ec00dae9f7576894161ad037bf06e837b8590f2e6330d0 +size 797 diff --git a/tests/test_corpora/treetagger-nld/config.yaml b/tests/test_corpora/treetagger-nld/config.yaml index ae93c1ef..c7328beb 100644 --- a/tests/test_corpora/treetagger-nld/config.yaml +++ b/tests/test_corpora/treetagger-nld/config.yaml @@ -8,7 +8,7 @@ metadata: # Corpus name (human readable) name: eng: TreeTagger test corpus - # Language of the input documents, specified as ISO 639-3 code + # Language of the source files, specified as ISO 639-3 code language: nld description: @@ -22,8 +22,8 @@ metadata: #=============================================================================== import: - # The annotation representing one text document. Any text-level annotations will be attached to this annotation. - document_annotation: text + # The annotation representing one text. Any text-level annotations will be attached to this annotation. + text_annotation: text #=============================================================================== @@ -34,11 +34,10 @@ export: # Exports to create by default when running 'sparv run' default: - korp:timespan_sql - - cwb:info - csv_export:csv - cwb:vrt - cwb:vrt_scrambled - - stats_export:freq_list_simple + - stats_export:sbx_freq_list_simple - xml_export:pretty - xml_export:preserved_format # Automatic annotations to be included in the export @@ -46,3 +45,6 @@ export: - :misc.id - text:readability.lix - TREETAGGER.all + +sparv: + compression: none diff --git a/tests/test_corpora/treetagger-nld/gold_export/csv/document1.csv b/tests/test_corpora/treetagger-nld/gold_export/csv_export/document1.csv similarity index 100% rename from tests/test_corpora/treetagger-nld/gold_export/csv/document1.csv rename to tests/test_corpora/treetagger-nld/gold_export/csv_export/document1.csv diff --git a/tests/test_corpora/treetagger-nld/gold_export/vrt/document1.vrt b/tests/test_corpora/treetagger-nld/gold_export/cwb.vrt/document1.vrt similarity index 100% rename from tests/test_corpora/treetagger-nld/gold_export/vrt/document1.vrt rename to tests/test_corpora/treetagger-nld/gold_export/cwb.vrt/document1.vrt diff --git a/tests/test_corpora/treetagger-nld/gold_export/vrt_scrambled/document1.vrt b/tests/test_corpora/treetagger-nld/gold_export/cwb.vrt_scrambled/document1.vrt similarity index 100% rename from tests/test_corpora/treetagger-nld/gold_export/vrt_scrambled/document1.vrt rename to tests/test_corpora/treetagger-nld/gold_export/cwb.vrt_scrambled/document1.vrt diff --git a/tests/test_corpora/treetagger-nld/gold_export/frequency_list/stats_treetagger-nld.csv b/tests/test_corpora/treetagger-nld/gold_export/frequency_list/stats_treetagger-nld.csv deleted file mode 100644 index 7783b325..00000000 --- a/tests/test_corpora/treetagger-nld/gold_export/frequency_list/stats_treetagger-nld.csv +++ /dev/null @@ -1,3 
+0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:be0b44751298213f3e6ae5bef882ce91566ab48c77df808010728e3e868b74ee -size 2112 diff --git a/tests/test_corpora/treetagger-nld/gold_export/korp_timespan/timespan.sql b/tests/test_corpora/treetagger-nld/gold_export/korp.timespan/timespan.sql similarity index 100% rename from tests/test_corpora/treetagger-nld/gold_export/korp_timespan/timespan.sql rename to tests/test_corpora/treetagger-nld/gold_export/korp.timespan/timespan.sql diff --git a/tests/test_corpora/treetagger-nld/gold_export/stats_export.frequency_list_sbx/stats_treetagger-nld.csv b/tests/test_corpora/treetagger-nld/gold_export/stats_export.frequency_list_sbx/stats_treetagger-nld.csv new file mode 100644 index 00000000..d7a58da5 --- /dev/null +++ b/tests/test_corpora/treetagger-nld/gold_export/stats_export.frequency_list_sbx/stats_treetagger-nld.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d0095e10ad9d407308a9edfb70566eb22ae48730b171d39303881ae69f0abaf +size 1855 diff --git a/tests/test_corpora/treetagger-nld/gold_export/xml_preserved_format/document1_export.xml b/tests/test_corpora/treetagger-nld/gold_export/xml_export.preserved_format/document1_export.xml similarity index 100% rename from tests/test_corpora/treetagger-nld/gold_export/xml_preserved_format/document1_export.xml rename to tests/test_corpora/treetagger-nld/gold_export/xml_export.preserved_format/document1_export.xml diff --git a/tests/test_corpora/treetagger-nld/gold_export/xml_pretty/document1_export.xml b/tests/test_corpora/treetagger-nld/gold_export/xml_export.pretty/document1_export.xml similarity index 100% rename from tests/test_corpora/treetagger-nld/gold_export/xml_pretty/document1_export.xml rename to tests/test_corpora/treetagger-nld/gold_export/xml_export.pretty/document1_export.xml diff --git a/tests/test_corpora/treetagger-nld/gold_sparv-workdir/cwb.datefirst b/tests/test_corpora/treetagger-nld/gold_sparv-workdir/cwb.datefirst deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/treetagger-nld/gold_sparv-workdir/cwb.datelast b/tests/test_corpora/treetagger-nld/gold_sparv-workdir/cwb.datelast deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/treetagger-nld/gold_sparv-workdir/dateformat.resolution b/tests/test_corpora/treetagger-nld/gold_sparv-workdir/dateformat.resolution deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_corpora/treetagger-nld/gold_sparv-workdir/document1/misc.docid b/tests/test_corpora/treetagger-nld/gold_sparv-workdir/document1/misc.fileid similarity index 100% rename from tests/test_corpora/treetagger-nld/gold_sparv-workdir/document1/misc.docid rename to tests/test_corpora/treetagger-nld/gold_sparv-workdir/document1/misc.fileid diff --git a/tests/test_corpora/treetagger-nld/gold_sparv-workdir/misc.segment.sentence_count b/tests/test_corpora/treetagger-nld/gold_sparv-workdir/misc.segment.sentence_count deleted file mode 100644 index 73fdfd27..00000000 --- a/tests/test_corpora/treetagger-nld/gold_sparv-workdir/misc.segment.sentence_count +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328cb08b5531fcacdabf8a -size 1 diff --git a/tests/test_corpora/txt-swe/config.yaml b/tests/test_corpora/txt-swe/config.yaml index 4e68720d..6eaa8e17 100644 --- a/tests/test_corpora/txt-swe/config.yaml +++ b/tests/test_corpora/txt-swe/config.yaml @@ -10,3 +10,6 @@ xml_export: - - - :stanza.pos + +sparv: + 
compression: none diff --git a/tests/test_corpora/txt-swe/gold_export/xml_pretty/dokument_export.xml b/tests/test_corpora/txt-swe/gold_export/xml_export.pretty/dokument_export.xml similarity index 100% rename from tests/test_corpora/txt-swe/gold_export/xml_pretty/dokument_export.xml rename to tests/test_corpora/txt-swe/gold_export/xml_export.pretty/dokument_export.xml diff --git a/tests/test_corpora/txt-swe/gold_sparv-workdir/dokument/misc.docid b/tests/test_corpora/txt-swe/gold_sparv-workdir/dokument/misc.fileid similarity index 100% rename from tests/test_corpora/txt-swe/gold_sparv-workdir/dokument/misc.docid rename to tests/test_corpora/txt-swe/gold_sparv-workdir/dokument/misc.fileid diff --git a/tests/utils.py b/tests/utils.py index ee1ee98e..761a634b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -3,6 +3,7 @@ import difflib import filecmp import pathlib +import re import shutil import subprocess import xml.etree.ElementTree as etree @@ -30,8 +31,7 @@ def run_sparv(gold_corpus_dir: pathlib.Path, args = ["sparv", "-d", str(new_corpus_dir), "run", *targets] process = subprocess.run(args, capture_output=True) - # Exclude progress updates and progress bar from output - stdout = process.stdout.strip().decode() + stdout = _remove_progress_info(process.stdout.strip().decode()) if stdout and process.returncode != 0: print_error(f"The following warnings/errors occurred:\n{stdout}") elif process.stderr.strip(): @@ -127,8 +127,8 @@ def _cmp_dirs(a: pathlib.Path, def _filediff(a: pathlib.Path, b: pathlib.Path): """Print a unified diff of files a and b.""" - a_contents = a.read_text().splitlines() - b_contents = b.read_text().splitlines() + a_contents = a.read_text(encoding="utf-8").splitlines() + b_contents = b.read_text(encoding="utf-8").splitlines() diff = difflib.unified_diff(a_contents, b_contents, fromfile=str(a), tofile=str(b)) for line in diff: @@ -138,12 +138,12 @@ def _filediff(a: pathlib.Path, b: pathlib.Path): def _xml_filediff(a: pathlib.Path, b: pathlib.Path): """Print a unified diff of canonicalize XML files a and b.""" try: - a_contents = etree.canonicalize(a.read_text()).splitlines() + a_contents = etree.canonicalize(a.read_text(encoding="utf-8")).splitlines() except etree.ParseError: print_error(f"File {a} could not be parsed.") return True try: - b_contents = etree.canonicalize(b.read_text()).splitlines() + b_contents = etree.canonicalize(b.read_text(encoding="utf-8")).splitlines() except etree.ParseError: print_error(f"File {a} could not be parsed.") return True @@ -156,3 +156,14 @@ def _xml_filediff(a: pathlib.Path, b: pathlib.Path): print(line.strip()) return True return False + + +def _remove_progress_info(output): + """Exclude progress updates from output.""" + lines = output.split("\n") + out = [] + for line in lines: + matchobj = re.match(r"(?:\d\d:\d\d:\d\d|\s{8}) (PROGRESS)\s+(.+)$", line) + if not matchobj: + out.append(line) + return "\n".join(out) diff --git a/utils/freq_list.py b/utils/freq_list.py deleted file mode 100644 index 92d23bd4..00000000 --- a/utils/freq_list.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 - -"""Build word frequency list from Sparv's XML output.""" - -import argparse -import csv -from collections import defaultdict -import glob -import xml.etree.ElementTree as etree - - -WORD_NODE = "w" - -parser = argparse.ArgumentParser(description="Build word frequency list from Sparv's XML output.") -parser.add_argument("-i", "--inpattern", help="pattern for XML files to process, relative to working dir (default: 'export*/**/*.xml')", - 
dest="inpattern", default="export*/**/*.xml") -parser.add_argument("-o", "--outfile", help="path to the output file (frequency list), e.g. stats_attasidor.tsv", required=True, dest="outfile") - - -def loop_exports(glob_exp, out_file): - """Loop through XML files matching glob_exp and write frequencies to out_file.""" - freq_dict = defaultdict(int) - pathlist = glob.glob(glob_exp, recursive=True) - print("\nCollecting frequencies from %s files:\n" % len(pathlist)) - for path in sorted(pathlist): - parse_export(path, freq_dict) - - with open(out_file, "w") as csvfile: - csv_writer = csv.writer(csvfile, delimiter="\t") - csv_writer.writerow(["token", "POS", "lemma", "SALDO sense", "lemgram", "compound", "count"]) - for (wordform, msd, lemma, sense, lemgram, complemgram), freq in sorted(freq_dict.items(), key=lambda x: -x[1]): - csv_writer.writerow([wordform, msd, lemma, sense, lemgram, complemgram, freq]) - - -def parse_export(in_file, freq_dict): - """Parse in_file and add frequencies in freq_dict.""" - print("Parsing %s" % in_file) - tree = etree.parse(in_file) - root = tree.getroot() - for word in root.findall(".//" + WORD_NODE): - wordform = word.text - msd = word.get("msd") - lemma = word.get("lemma").split("|")[1] - sense = word.get("sense").split("|")[1].split(":")[0] - lemgram = word.get("lex").split("|")[1].split(":")[0] - if not sense: - complemgram = word.get("complemgram").split("|")[1].split(":")[0] - else: - complemgram = "" - freq_dict[(wordform, msd, lemma, sense, lemgram, complemgram)] += 1 - - -if __name__ == "__main__": - args = parser.parse_args() - loop_exports(args.inpattern, args.outfile)