Patch summary
  * .flake8: new flake8 configuration for black compatibility.
  * examples/language_identification/example0-invalid.json: new example file.
  * language_identification.schema.json: the lg_decision value
    'all-but-impresso-ft' becomes 'all-but-impresso_ft' (enum and description).
  * scripts/jsonlschema.py: rewritten as v1.1 (argparse-based main(), json/jsonl
    input, validation errors are logged, counted and turn into exit status 1).

Changes made to scripts/jsonlschema.py while restoring this patch
  * --logfile is passed to logging.basicConfig (it was parsed but never used).
  * The output writer is flushed/closed in run(), also when processing fails.
  * An explicit --file-format is no longer overridden by the ".json" file-name
    check; the check only applies when the option is omitted.
  * An unsupported file_format raises ValueError instead of yielding nothing.
  * The schema is loaded before the output file is opened.
  * A warning is logged when no input files are given.

NOTE(review): this patch was recovered from text whose line breaks and
indentation had been collapsed. Indentation and blank-line positions on the
removed/context side were re-inferred from the hunk line counts and must be
confirmed against the repository (or applied with whitespace tolerance). The
index hashes are the ones recorded in the original patch.

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..2fa9b01
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,5 @@
+# configuration for black compatibility
+
+[flake8]
+max-line-length = 88
+extend-ignore = E203, W503
diff --git a/examples/language_identification/example0-invalid.json b/examples/language_identification/example0-invalid.json
new file mode 100644
index 0000000..42e500f
--- /dev/null
+++ b/examples/language_identification/example0-invalid.json
@@ -0,0 +1,62 @@
+{
+  "id": "luxzeit1858-1859-01-18-a-i0026",
+  "lg_decision": "voting",
+  "tp": "tb",
+  "len": 63,
+  "orig_lg": null,
+  "alphabetical_ratio": 0.254,
+  "impresso_language_identifier_version": {
+    "version": "v1.4.1",
+    "ts": "2020-12-28T10:27:11+00:00"
+  },
+  "language_identifier_version": {
+    "version": "v1.4.1",
+    "ts": "2020-12-28T10:15:45+00:00"
+  },
+  "impresso_ft": [
+    {
+      "lang": "fr",
+      "prob": 0.969
+    },
+    {
+      "lang": "de",
+      "prob": 0.03
+    }
+  ],
+  "langdetect": [
+    {
+      "lang": "ro",
+      "prob": 0.667
+    },
+    {
+      "lang": "ca",
+      "prob": 0.333
+    }
+  ],
+  "langid": [
+    {
+      "lang": "ro",
+      "prob": 0.655
+    }
+  ],
+  "wp_ft": [
+    {
+      "lang": "es",
+      "prob": 0.305
+    },
+    {
+      "lang": "ca",
+      "prob": 0.121
+    },
+    {
+      "lang": "war",
+      "prob": 0.106
+    }
+  ],
+  "votes": [
+    {
+      "lang": "fr",
+      "vote": 0.942
+    }
+  ]
+}
diff --git a/json/language_identification/language_identification.schema.json b/json/language_identification/language_identification.schema.json
index 38b839e..013bf03 100644
--- a/json/language_identification/language_identification.schema.json
+++ b/json/language_identification/language_identification.schema.json
@@ -45,13 +45,13 @@
     "lg_decision": {
       "enum": [
         "all",
-        "all-but-impresso-ft",
+        "all-but-impresso_ft",
         "voting",
         "dominant-by-len",
         "dominant-by-lowvote"
       ],
       "type": "string",
-      "description": "An identifier for the decision strategy applied to the content item: 'all' = all LID systems/info agree; 'all-but-impresso-ft' = all LID except impresso_ft agree on a language other than de/fr; 'dominant-by-len' = the most frequent language of the ensemble decisions is selected because there are too few characters; 'dominant-by-lowvote' = the most frequent language of the ensemble decisions is selected because there are too few votes; 'voting' = the language with the highest vote count is selected "
+      "description": "An identifier for the decision strategy applied to the content item: 'all' = all LID systems/info agree; 'all-but-impresso_ft' = all LID except impresso_ft agree on a language other than de/fr; 'dominant-by-len' = the most frequent language of the ensemble decisions is selected because there are too few characters; 'dominant-by-lowvote' = the most frequent language of the ensemble decisions is selected because there are too few votes; 'voting' = the language with the highest vote count is selected "
     },
     "tp": {
       "type": "string",
diff --git a/scripts/jsonlschema.py b/scripts/jsonlschema.py
index 1c57b1d..0b7a915 100755
--- a/scripts/jsonlschema.py
+++ b/scripts/jsonlschema.py
@@ -1,109 +1,202 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-"""
-Validate a jsonline file (raw or compressed) against a schema
-"""
-
-__appname__ = "jsonlschema"
-__author__ = "simon.clematide@uzh.ch"
-__version__ = "v1.0"
-
-
-import sys
-import logging
-import json
-from jsonschema import validate
-from smart_open import open
-from typing import Iterable
-log = logging.getLogger(__name__)
-
-sys.stdin.reconfigure(encoding='utf-8')
-sys.stdout.reconfigure(encoding='utf-8')
-sys.stderr.reconfigure(encoding='utf-8')
-
-
-class JSONLSchemaValidator(object):
-
-    def __init__(self, schema, input_files, output_file=None):
-        self.input_files = input_files
-        self.output_writer = open(output_file, mode="w", encoding="utf-8") if output_file is not None else sys.stdout
-        self.schema = json.load(open(schema,encoding="utf-8"))
-
-    def run(self) -> None:
-
-        try:
-            self.process()
-        except:
-            self.output_writer.close()
-
-    def process(self) -> None:
-        for jo in self.next_json_object():
-            validate(instance=jo, schema=self.schema)
-            print(json.dumps(jo,ensure_ascii=False,separators=(",", ":")),file=self.output_writer)
-
-
-
-    def next_json_object(self) -> Iterable[dict]:
-        """Yield each json object.
-
-        :return: Iterator over json objects.
-        :rtype: Iterable[dict]
-
-        """
-
-        for infile in self.input_files:
-            with open(infile, encoding="utf-8") as infile:
-                for line in infile:
-                    yield json.loads(line)
-
-
-
-
-if __name__ == '__main__':
-    import argparse
-    description = ""
-    epilog = ""
-    parser = argparse.ArgumentParser(description=description, epilog=epilog)
-    parser.add_argument('-l', '--logfile', dest='logfile',
-                        help='write log to FILE', metavar='FILE')
-    parser.add_argument('-v', '--verbose', dest='verbose',default=2,type=int, metavar="LEVEL",
-                        help='set verbosity level: 0=CRITICAL, 1=ERROR, 2=WARNING, 3=INFO 4=DEBUG (default %(default)s)')
-
-    parser.add_argument(
-        "-o",
-        "--output-file",
-        default="/dev/stdout",
-        help="Output file, writing to stdout if not specified",
-    )
-
-    parser.add_argument(
-        "-i",
-        "--input-files",
-        metavar="JSONL",
-        nargs="*",
-        help="Input files, reading from stdin if not provided",
-    )
-    parser.add_argument(
-        "schema",
-        metavar="SCHEMA",
-        help="path to schema",
-    )
-
-    arguments = parser.parse_args()
-
-    log_levels = [logging.CRITICAL, logging.ERROR, logging.WARNING,
-                  logging.INFO, logging.DEBUG]
-    logging.basicConfig(level=log_levels[arguments.verbose],
-                        format='%(asctime)-15s %(levelname)s: %(message)s')
-
-
-    jsonl_schema_validator_args = {
-        "schema",
-        "output_file",
-        "input_files",
-    }
-
-    # launching application ...
-    JSONLSchemaValidator(
-        **{k: v for k, v in vars(arguments).items() if k in jsonl_schema_validator_args}
-    ).run()
+#!/usr/bin/env python3
+
+"""
+A module to validate JSON Line files (raw or compressed) against a specified JSON
+schema. This script can handle files from various sources (local, cloud storage) through
+smart_open, and logs validation errors while allowing for command-line configured
+verbosity.
+
+In cases of validation errors, the offending JSON object is logged and skipped. The
+script exits with a non-zero status if any validation errors are encountered.
+"""
+
+__author__ = "simon.clematide@uzh.ch"
+__version__ = "v1.1"
+
+import argparse
+import json
+import logging
+import sys
+from typing import Iterable, List, Optional
+
+import jsonschema
+import smart_open
+
+log = logging.getLogger(__name__)
+
+
+class JSONLSchemaValidator:
+    """
+    A class to validate JSON objects against a given JSON schema.
+
+    Attributes:
+        input_files (List[str]): A list of paths to input files containing JSON objects.
+        file_format (str): Either "jsonl" or "json"; selects how input files are read.
+        validation_errors (int): Number of objects that failed validation so far.
+        output_writer (TextIOWrapper): A file writer object for the output file,
+            defaults to sys.stdout if not specified.
+        schema (dict): The loaded JSON schema against which JSON objects will be
+            validated.
+    """
+
+    def __init__(
+        self,
+        schema: str,
+        input_files: List[str],
+        output_file: Optional[str] = None,
+        file_format: str = "jsonl",
+    ):
+        """
+        Initializes the JSONLSchemaValidator with a schema, input files, and an optional
+        output file.
+
+        Args:
+            schema (str): The path to the JSON schema file.
+            input_files (List[str]): A list of file paths to read the JSON objects from.
+            output_file (Optional[str]): The file path to write validated JSON objects
+                to. Writes to stdout if None.
+            file_format (str): "jsonl" (one JSON value per line) or "json" (one JSON
+                document per file; a top-level list is validated item by item).
+
+        Raises:
+            ValueError: If file_format is neither "jsonl" nor "json".
+        """
+        if file_format not in ("jsonl", "json"):
+            raise ValueError(f"Unsupported file format: {file_format!r}")
+        self.input_files = input_files
+        self.file_format = file_format
+        self.validation_errors = 0
+        # Load the schema first so that a bad schema path cannot leave an opened
+        # (and already truncated) output file behind.
+        with smart_open.open(schema, encoding="utf-8") as schema_file:
+            self.schema = json.load(schema_file)
+        self.output_writer = (
+            smart_open.open(output_file, mode="w", encoding="utf-8")
+            if output_file
+            else sys.stdout
+        )
+
+    def run(self) -> None:
+        """
+        Starts the validation process.
+
+        The output writer is always flushed, and closed unless it is sys.stdout, so
+        that buffered or remote output is complete even when processing fails.
+        Exits the process with status 1 if any object failed validation.
+        """
+        try:
+            self.process()
+        finally:
+            if self.output_writer is sys.stdout:
+                self.output_writer.flush()
+            else:
+                self.output_writer.close()
+        if self.validation_errors > 0:
+            log.error("Validation errors encountered: %d", self.validation_errors)
+            sys.exit(1)
+
+    def process(self) -> None:
+        """
+        Processes each JSON object from the input files, validating them against the
+        schema.
+
+        Invalid objects are logged and skipped.
+        """
+        for jo in self.next_json_object():
+            try:
+                jsonschema.validate(instance=jo, schema=self.schema)
+            except jsonschema.exceptions.ValidationError as e:
+                log.error("Validation error: %s", e.message)
+                log.info("Offending JSON object ignored: %s", jo)
+                self.validation_errors += 1
+                continue
+            print(
+                json.dumps(jo, ensure_ascii=False, separators=(",", ":")),
+                file=self.output_writer,
+            )
+
+    def next_json_object(self) -> Iterable[dict]:
+        """
+        Yields JSON objects based on the specified file format.
+
+        For "jsonl", every non-empty line of each input file is parsed as one JSON
+        value. For "json", each input file is parsed as a single document: a
+        top-level list is yielded item by item, anything else is yielded as is.
+        """
+        for infile_path in self.input_files:
+            with smart_open.open(infile_path, encoding="utf-8") as infile:
+                if self.file_format == "jsonl":
+                    for line in infile:
+                        if line.strip():  # Skip empty lines
+                            yield json.loads(line)
+                else:
+                    data = json.load(infile)
+                    if isinstance(data, list):
+                        yield from data
+                    else:
+                        yield data
+
+
+def main():
+    """
+    Parses command-line arguments and initiates JSON line file validation.
+
+    The process exits with status 1 if any validation errors were encountered.
+    """
+    parser = argparse.ArgumentParser(
+        description="Validate JSON Lines files against a schema."
+    )
+    parser.add_argument("--schema", help="Path to JSON schema file", required=True)
+    parser.add_argument("-l", "--logfile", help="Write log to FILE", metavar="FILE")
+    parser.add_argument(
+        "-o", "--output-file", help="Output file (defaults to stdout if not specified)"
+    )
+    parser.add_argument("-i", "--input-files", nargs="+", help="Input JSON Lines files")
+    parser.add_argument(
+        "--file-format",
+        choices=["json", "jsonl"],
+        help=(
+            "File format (jsonl or json). Default: json if the first input file "
+            "ends with .json, jsonl otherwise"
+        ),
+        default=None,
+    )
+    parser.add_argument(
+        "--level",
+        default="INFO",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+        help="Set the logging level. Default: %(default)s",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        # filename=None keeps the default stderr handler; otherwise honour --logfile.
+        filename=args.logfile,
+        level=getattr(logging, args.level),
+        format="%(asctime)-15s %(levelname)-8s: %(message)s",
+        force=True,
+    )
+    input_files = args.input_files or []
+    if not input_files:
+        log.warning("No input files given; nothing to validate")
+
+    file_format = args.file_format
+    if file_format is None:
+        # Guess from the file name only when the format was not chosen explicitly.
+        if input_files and input_files[0].endswith(".json"):
+            file_format = "json"
+            log.info("File format set to JSON")
+        else:
+            file_format = "jsonl"
+
+    validator = JSONLSchemaValidator(
+        schema=args.schema,
+        input_files=input_files,
+        output_file=args.output_file,
+        file_format=file_format,
+    )
+    validator.run()
+
+
+if __name__ == "__main__":
+    main()