From 8b0ce8083a389c601b4f5743275b091a20ba7390 Mon Sep 17 00:00:00 2001 From: Simon Clematide Date: Sat, 6 Apr 2024 17:46:51 +0200 Subject: [PATCH 1/5] improve schema validator script and fix silent exception handling --- .../example0-invalid.json | 62 ++++++ scripts/jsonlschema.py | 199 +++++++++++------- 2 files changed, 183 insertions(+), 78 deletions(-) create mode 100644 examples/language_identification/example0-invalid.json diff --git a/examples/language_identification/example0-invalid.json b/examples/language_identification/example0-invalid.json new file mode 100644 index 0000000..42e500f --- /dev/null +++ b/examples/language_identification/example0-invalid.json @@ -0,0 +1,62 @@ +{ + "id": "luxzeit1858-1859-01-18-a-i0026", + "lg_decision": "voting", + "tp": "tb", + "len": 63, + "orig_lg": null, + "alphabetical_ratio": 0.254, + "impresso_language_identifier_version": { + "version": "v1.4.1", + "ts": "2020-12-28T10:27:11+00:00" + }, + "language_identifier_version": { + "version": "v1.4.1", + "ts": "2020-12-28T10:15:45+00:00" + }, + "impresso_ft": [ + { + "lang": "fr", + "prob": 0.969 + }, + { + "lang": "de", + "prob": 0.03 + } + ], + "langdetect": [ + { + "lang": "ro", + "prob": 0.667 + }, + { + "lang": "ca", + "prob": 0.333 + } + ], + "langid": [ + { + "lang": "ro", + "prob": 0.655 + } + ], + "wp_ft": [ + { + "lang": "es", + "prob": 0.305 + }, + { + "lang": "ca", + "prob": 0.121 + }, + { + "lang": "war", + "prob": 0.106 + } + ], + "votes": [ + { + "lang": "fr", + "vote": 0.942 + } + ] +} diff --git a/scripts/jsonlschema.py b/scripts/jsonlschema.py index 1c57b1d..b7a8ee9 100755 --- a/scripts/jsonlschema.py +++ b/scripts/jsonlschema.py @@ -1,109 +1,152 @@ #!/usr/bin/python3 -# -*- coding: utf-8 -*- + """ -Validate a jsonline file (raw or compressed) against a schema +A module to validate JSON Line files (raw or compressed) against a specified JSON schema. 
+This script can handle files from various sources (local, cloud storage) through smart_open, +and logs validation errors while allowing for command-line configured verbosity. """ __appname__ = "jsonlschema" -__author__ = "simon.clematide@uzh.ch" -__version__ = "v1.0" - +__author__ = "simon.clematide@uzh.ch" +__version__ = "v1.1" import sys import logging import json -from jsonschema import validate +import jsonschema from smart_open import open -from typing import Iterable +from typing import Iterable, List, Optional +import argparse + log = logging.getLogger(__name__) -sys.stdin.reconfigure(encoding='utf-8') -sys.stdout.reconfigure(encoding='utf-8') -sys.stderr.reconfigure(encoding='utf-8') +class JSONLSchemaValidator: + """ + A class to validate JSON objects against a given JSON schema. -class JSONLSchemaValidator(object): + Attributes: + input_files (List[str]): A list of paths to input files containing JSON objects. + output_writer (TextIOWrapper): A file writer object for the output file, defaults to sys.stdout if not specified. + schema (dict): The loaded JSON schema against which JSON objects will be validated. + """ - def __init__(self, schema, input_files, output_file=None): + def __init__( + self, + schema: str, + input_files: List[str], + output_file: Optional[str] = None, + file_format: str = "jsonl", + ): + """ + Initializes the JSONLSchemaValidator with a schema, input files, and an optional output file. + + Args: + schema (str): The path to the JSON schema file. + input_files (List[str]): A list of file paths to read the JSON objects from. + output_file (Optional[str]): The file path to write validated JSON objects to. Writes to stdout if None. 
+ """ self.input_files = input_files - self.output_writer = open(output_file, mode="w", encoding="utf-8") if output_file is not None else sys.stdout - self.schema = json.load(open(schema,encoding="utf-8")) + self.file_format = file_format + self.output_writer = ( + open(output_file, mode="w", encoding="utf-8") if output_file else sys.stdout + ) + with open(schema, encoding="utf-8") as schema_file: + self.schema = json.load(schema_file) def run(self) -> None: - - try: - self.process() - except: - self.output_writer.close() - + """Starts the validation process.""" + self.process() def process(self) -> None: - for jo in self.next_json_object(): - validate(instance=jo, schema=self.schema) - print(json.dumps(jo,ensure_ascii=False,separators=(",", ":")),file=self.output_writer) - - - - def next_json_object(self) -> Iterable[dict]: - """Yield each json object. - - :return: Iterator over json objects. - :rtype: Iterable[dict] - """ + Processes each JSON object from the input files, validating them against the schema. - for infile in self.input_files: - with open(infile, encoding="utf-8") as infile: - for line in infile: - yield json.loads(line) - - - - -if __name__ == '__main__': - import argparse - description = "" - epilog = "" - parser = argparse.ArgumentParser(description=description, epilog=epilog) - parser.add_argument('-l', '--logfile', dest='logfile', - help='write log to FILE', metavar='FILE') - parser.add_argument('-v', '--verbose', dest='verbose',default=2,type=int, metavar="LEVEL", - help='set verbosity level: 0=CRITICAL, 1=ERROR, 2=WARNING, 3=INFO 4=DEBUG (default %(default)s)') + Invalid objects are logged and skipped. 
+ """ + for jo in self.next_json_object(): + try: + jsonschema.validate(instance=jo, schema=self.schema) + except jsonschema.exceptions.ValidationError as e: + log.error("Validation error: %s", e.message) + log.info("Offending JSON object ignored: %s", jo) + continue + print( + json.dumps(jo, ensure_ascii=False, separators=(",", ":")), + file=self.output_writer, + ) + def next_json_object(self) -> Iterable[dict]: + """Yields JSON objects based on the specified file format.""" + if self.file_format == "jsonl": + for infile_path in self.input_files: + with open(infile_path, encoding="utf-8") as infile: + for line in infile: + if line.strip(): # Skip empty lines + yield json.loads(line) + elif self.file_format == "json": + for infile_path in self.input_files: + with open(infile_path, encoding="utf-8") as infile: + data = json.load(infile) + if isinstance(data, list): + for item in data: + yield item + else: + yield data + + +def main(): + """ + Parses command-line arguments and initiates JSON line file validation. + """ + parser = argparse.ArgumentParser( + description="Validate JSON Lines files against a schema." 
+ ) + parser.add_argument("--schema", help="Path to JSON schema file", required=True) + parser.add_argument("-l", "--logfile", help="Write log to FILE", metavar="FILE") parser.add_argument( - "-o", - "--output-file", - default="/dev/stdout", - help="Output file, writing to stdout if not specified", + "-v", + "--verbose", + default=2, + type=int, + metavar="LEVEL", + help="Set verbosity level", ) - parser.add_argument( - "-i", - "--input-files", - metavar="JSONL", - nargs="*", - help="Input files, reading from stdin if not provided", + "-o", "--output-file", help="Output file (defaults to stdout if not specified)" ) + parser.add_argument("-i", "--input-files", nargs="+", help="Input JSON Lines files") parser.add_argument( - "schema", - metavar="SCHEMA", - help="path to schema", + "--file-format", + choices=["json", "jsonl"], + help="File format (jsonl or json)", + default="jsonl", ) - - arguments = parser.parse_args() - - log_levels = [logging.CRITICAL, logging.ERROR, logging.WARNING, - logging.INFO, logging.DEBUG] - logging.basicConfig(level=log_levels[arguments.verbose], - format='%(asctime)-15s %(levelname)s: %(message)s') + args = parser.parse_args() + + log_levels = [ + logging.CRITICAL, + logging.ERROR, + logging.WARNING, + logging.INFO, + logging.DEBUG, + ] + logging.basicConfig( + level=log_levels[min(args.verbose, len(log_levels) - 1)], + format="%(asctime)-15s %(levelname)s: %(message)s", + ) + if args.input_files and args.input_files[0].endswith(".json"): + args.file_format = "json" + log.info("File format set to JSON") + + validator = JSONLSchemaValidator( + schema=args.schema, + input_files=args.input_files or [], + output_file=args.output_file, + file_format=args.file_format, + ) + validator.run() - jsonl_schema_validator_args = { - "schema", - "output_file", - "input_files", - } - # launching application ... 
- JSONLSchemaValidator( - **{k: v for k, v in vars(arguments).items() if k in jsonl_schema_validator_args} - ).run() +if __name__ == "__main__": + main() From ea9d4a2cb256ba99177e0e696ceb9014596a381c Mon Sep 17 00:00:00 2001 From: Simon Clematide Date: Sun, 7 Apr 2024 01:05:31 +0200 Subject: [PATCH 2/5] Fix typo in language_identification.schema.json --- .../language_identification.schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/json/language_identification/language_identification.schema.json b/json/language_identification/language_identification.schema.json index 38b839e..013bf03 100644 --- a/json/language_identification/language_identification.schema.json +++ b/json/language_identification/language_identification.schema.json @@ -45,13 +45,13 @@ "lg_decision": { "enum": [ "all", - "all-but-impresso-ft", + "all-but-impresso_ft", "voting", "dominant-by-len", "dominant-by-lowvote" ], "type": "string", - "description": "An identifier for the decision strategy applied to the content item: 'all' = all LID systems/info agree; 'all-but-impresso-ft' = all LID except impresso_ft agree on a language other than de/fr; 'dominant-by-len' = the most frequent language of the ensemble decisions is selected because there are too few characters; 'dominant-by-lowvote' = the most frequent language of the ensemble decisions is selected because there are too few votes; 'voting' = the language with the highest vote count is selected " + "description": "An identifier for the decision strategy applied to the content item: 'all' = all LID systems/info agree; 'all-but-impresso_ft' = all LID except impresso_ft agree on a language other than de/fr; 'dominant-by-len' = the most frequent language of the ensemble decisions is selected because there are too few characters; 'dominant-by-lowvote' = the most frequent language of the ensemble decisions is selected because there are too few votes; 'voting' = the language with the highest vote count is selected " }, 
"tp": { "type": "string", From e96f28e36bd8e536c9d8c01d1a50e145103ef3c3 Mon Sep 17 00:00:00 2001 From: Simon Clematide Date: Sun, 7 Apr 2024 21:43:52 +0200 Subject: [PATCH 3/5] Add .flake8 configuration file --- .flake8 | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..2fa9b01 --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +# configuration for black compatibility + +[flake8] +max-line-length = 88 +extend-ignore = E203, W503 From fd68779890bd98e0d7f6d9fedc50cf4bf8cdf8d7 Mon Sep 17 00:00:00 2001 From: Simon Clematide Date: Sun, 7 Apr 2024 22:10:48 +0200 Subject: [PATCH 4/5] Refactor JSONLSchemaValidator class in jsonlschema.py --- scripts/jsonlschema.py | 86 ++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/scripts/jsonlschema.py b/scripts/jsonlschema.py index b7a8ee9..d678f28 100755 --- a/scripts/jsonlschema.py +++ b/scripts/jsonlschema.py @@ -1,22 +1,23 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 """ -A module to validate JSON Line files (raw or compressed) against a specified JSON schema. -This script can handle files from various sources (local, cloud storage) through smart_open, -and logs validation errors while allowing for command-line configured verbosity. +A module to validate JSON Line files (raw or compressed) against a specified JSON +schema. This script can handle files from various sources (local, cloud storage) through +smart_open, and logs validation errors while allowing for command-line configured +verbosity. 
""" -__appname__ = "jsonlschema" __author__ = "simon.clematide@uzh.ch" __version__ = "v1.1" -import sys -import logging +import argparse import json -import jsonschema -from smart_open import open +import logging +import sys from typing import Iterable, List, Optional -import argparse + +import jsonschema +import smart_open log = logging.getLogger(__name__) @@ -27,8 +28,12 @@ class JSONLSchemaValidator: Attributes: input_files (List[str]): A list of paths to input files containing JSON objects. - output_writer (TextIOWrapper): A file writer object for the output file, defaults to sys.stdout if not specified. - schema (dict): The loaded JSON schema against which JSON objects will be validated. + + output_writer (TextIOWrapper): A file writer object for the output file, + defaults to sys.stdout if not specified. + + schema (dict): The loaded JSON schema against which JSON objects will be + validated. """ def __init__( @@ -39,31 +44,39 @@ def __init__( file_format: str = "jsonl", ): """ - Initializes the JSONLSchemaValidator with a schema, input files, and an optional output file. + Initializes the JSONLSchemaValidator with a schema, input files, and an optional + output file. Args: schema (str): The path to the JSON schema file. input_files (List[str]): A list of file paths to read the JSON objects from. - output_file (Optional[str]): The file path to write validated JSON objects to. Writes to stdout if None. + output_file (Optional[str]): The file path to write validated JSON objects + to. Writes to stdout if None. 
""" + self.input_files = input_files self.file_format = file_format self.output_writer = ( - open(output_file, mode="w", encoding="utf-8") if output_file else sys.stdout + smart_open.open(output_file, mode="w", encoding="utf-8") + if output_file + else sys.stdout ) - with open(schema, encoding="utf-8") as schema_file: + with smart_open.open(schema, encoding="utf-8") as schema_file: self.schema = json.load(schema_file) def run(self) -> None: """Starts the validation process.""" + self.process() def process(self) -> None: """ - Processes each JSON object from the input files, validating them against the schema. + Processes each JSON object from the input files, validating them against the + schema. Invalid objects are logged and skipped. """ + for jo in self.next_json_object(): try: jsonschema.validate(instance=jo, schema=self.schema) @@ -78,15 +91,16 @@ def process(self) -> None: def next_json_object(self) -> Iterable[dict]: """Yields JSON objects based on the specified file format.""" + if self.file_format == "jsonl": for infile_path in self.input_files: - with open(infile_path, encoding="utf-8") as infile: + with smart_open.open(infile_path, encoding="utf-8") as infile: for line in infile: if line.strip(): # Skip empty lines yield json.loads(line) elif self.file_format == "json": for infile_path in self.input_files: - with open(infile_path, encoding="utf-8") as infile: + with smart_open.open(infile_path, encoding="utf-8") as infile: data = json.load(infile) if isinstance(data, list): for item in data: @@ -104,14 +118,7 @@ def main(): ) parser.add_argument("--schema", help="Path to JSON schema file", required=True) parser.add_argument("-l", "--logfile", help="Write log to FILE", metavar="FILE") - parser.add_argument( - "-v", - "--verbose", - default=2, - type=int, - metavar="LEVEL", - help="Set verbosity level", - ) + parser.add_argument( "-o", "--output-file", help="Output file (defaults to stdout if not specified)" ) @@ -122,18 +129,25 @@ def main(): help="File 
format (jsonl or json)", default="jsonl", ) + parser.add_argument( + "--level", + default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Set the logging level. Default: %(default)s", + ) args = parser.parse_args() - log_levels = [ - logging.CRITICAL, - logging.ERROR, - logging.WARNING, - logging.INFO, - logging.DEBUG, - ] + to_logging_level = { + "CRITICAL": logging.CRITICAL, + "ERROR": logging.ERROR, + "WARNING": logging.WARNING, + "INFO": logging.INFO, + "DEBUG": logging.DEBUG, + } logging.basicConfig( - level=log_levels[min(args.verbose, len(log_levels) - 1)], - format="%(asctime)-15s %(levelname)s: %(message)s", + level=to_logging_level[args.level], + format="%(asctime)-15s %(levelname)-8s: %(message)s", + force=True, ) if args.input_files and args.input_files[0].endswith(".json"): args.file_format = "json" From 080b39c465ec0ad67f641e1f2dc93d824b479f1f Mon Sep 17 00:00:00 2001 From: Simon Clematide Date: Sun, 7 Apr 2024 22:16:28 +0200 Subject: [PATCH 5/5] Exit non-zero if validation failed --- scripts/jsonlschema.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/jsonlschema.py b/scripts/jsonlschema.py index d678f28..0b7a915 100755 --- a/scripts/jsonlschema.py +++ b/scripts/jsonlschema.py @@ -5,6 +5,9 @@ schema. This script can handle files from various sources (local, cloud storage) through smart_open, and logs validation errors while allowing for command-line configured verbosity. + +In cases of validation errors, the offending JSON object is logged and skipped. The +script exits with a non-zero status if any validation errors are encountered. 
""" __author__ = "simon.clematide@uzh.ch" @@ -56,6 +59,7 @@ def __init__( self.input_files = input_files self.file_format = file_format + self.validation_errors = 0 self.output_writer = ( smart_open.open(output_file, mode="w", encoding="utf-8") if output_file @@ -68,6 +72,9 @@ def run(self) -> None: """Starts the validation process.""" self.process() + if self.validation_errors > 0: + log.error("Validation errors encountered: %d", self.validation_errors) + sys.exit(1) def process(self) -> None: """ @@ -83,6 +90,7 @@ def process(self) -> None: except jsonschema.exceptions.ValidationError as e: log.error("Validation error: %s", e.message) log.info("Offending JSON object ignored: %s", jo) + self.validation_errors += 1 continue print( json.dumps(jo, ensure_ascii=False, separators=(",", ":")),