Skip to content

Commit

Permalink
Merge pull request #30 from impresso/feature-fix-schemavalidator
Browse files Browse the repository at this point in the history
Feature fix jsonlschema.py CLI script
  • Loading branch information
simon-clematide authored Apr 7, 2024
2 parents 92c81e6 + 080b39c commit ebc0feb
Show file tree
Hide file tree
Showing 4 changed files with 213 additions and 81 deletions.
5 changes: 5 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# configuration for black compatibility

[flake8]
max-line-length = 88
extend-ignore = E203, W503
62 changes: 62 additions & 0 deletions examples/language_identification/example0-invalid.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{
"id": "luxzeit1858-1859-01-18-a-i0026",
"lg_decision": "voting",
"tp": "tb",
"len": 63,
"orig_lg": null,
"alphabetical_ratio": 0.254,
"impresso_language_identifier_version": {
"version": "v1.4.1",
"ts": "2020-12-28T10:27:11+00:00"
},
"language_identifier_version": {
"version": "v1.4.1",
"ts": "2020-12-28T10:15:45+00:00"
},
"impresso_ft": [
{
"lang": "fr",
"prob": 0.969
},
{
"lang": "de",
"prob": 0.03
}
],
"langdetect": [
{
"lang": "ro",
"prob": 0.667
},
{
"lang": "ca",
"prob": 0.333
}
],
"langid": [
{
"lang": "ro",
"prob": 0.655
}
],
"wp_ft": [
{
"lang": "es",
"prob": 0.305
},
{
"lang": "ca",
"prob": 0.121
},
{
"lang": "war",
"prob": 0.106
}
],
"votes": [
{
"lang": "fr",
"vote": 0.942
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@
"lg_decision": {
"enum": [
"all",
"all-but-impresso-ft",
"all-but-impresso_ft",
"voting",
"dominant-by-len",
"dominant-by-lowvote"
],
"type": "string",
"description": "An identifier for the decision strategy applied to the content item: 'all' = all LID systems/info agree; 'all-but-impresso-ft' = all LID except impresso_ft agree on a language other than de/fr; 'dominant-by-len' = the most frequent language of the ensemble decisions is selected because there are too few characters; 'dominant-by-lowvote' = the most frequent language of the ensemble decisions is selected because there are too few votes; 'voting' = the language with the highest vote count is selected "
"description": "An identifier for the decision strategy applied to the content item: 'all' = all LID systems/info agree; 'all-but-impresso_ft' = all LID except impresso_ft agree on a language other than de/fr; 'dominant-by-len' = the most frequent language of the ensemble decisions is selected because there are too few characters; 'dominant-by-lowvote' = the most frequent language of the ensemble decisions is selected because there are too few votes; 'voting' = the language with the highest vote count is selected "
},
"tp": {
"type": "string",
Expand Down
223 changes: 144 additions & 79 deletions scripts/jsonlschema.py
Original file line number Diff line number Diff line change
@@ -1,109 +1,174 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Validate a jsonline file (raw or compressed) against a schema
#!/usr/bin/env python3

"""
A module to validate JSON Line files (raw or compressed) against a specified JSON
schema. This script can handle files from various sources (local, cloud storage) through
smart_open, and logs validation errors while allowing for command-line configured
verbosity.
__appname__ = "jsonlschema"
__author__ = "[email protected]"
__version__ = "v1.0"
In cases of validation errors, the offending JSON object is logged and skipped. The
script exits with a non-zero status if any validation errors are encountered.
"""

__author__ = "[email protected]"
__version__ = "v1.1"

import sys
import logging
import argparse
import json
from jsonschema import validate
from smart_open import open
from typing import Iterable
log = logging.getLogger(__name__)

sys.stdin.reconfigure(encoding='utf-8')
sys.stdout.reconfigure(encoding='utf-8')
sys.stderr.reconfigure(encoding='utf-8')
import logging
import sys
from typing import Iterable, List, Optional

import jsonschema
import smart_open

class JSONLSchemaValidator(object):
log = logging.getLogger(__name__)

def __init__(self, schema, input_files, output_file=None):
self.input_files = input_files
self.output_writer = open(output_file, mode="w", encoding="utf-8") if output_file is not None else sys.stdout
self.schema = json.load(open(schema,encoding="utf-8"))

def run(self) -> None:
class JSONLSchemaValidator:
"""
A class to validate JSON objects against a given JSON schema.
try:
self.process()
except:
self.output_writer.close()
Attributes:
input_files (List[str]): A list of paths to input files containing JSON objects.
output_writer (TextIOWrapper): A file writer object for the output file,
defaults to sys.stdout if not specified.
def process(self) -> None:
for jo in self.next_json_object():
validate(instance=jo, schema=self.schema)
print(json.dumps(jo,ensure_ascii=False,separators=(",", ":")),file=self.output_writer)
schema (dict): The loaded JSON schema against which JSON objects will be
validated.
"""

def __init__(
    self,
    schema: str,
    input_files: List[str],
    output_file: Optional[str] = None,
    file_format: str = "jsonl",
) -> None:
    """
    Initialize the validator with a schema, input files, and an optional output
    file.

    Args:
        schema (str): The path to the JSON schema file (opened via smart_open).
        input_files (List[str]): A list of file paths to read the JSON objects from.
        output_file (Optional[str]): The file path to write validated JSON objects
            to. Writes to stdout if None or empty.
        file_format (str): Format of the input files, "jsonl" (one JSON value per
            line) or "json" (one JSON document per file). Defaults to "jsonl".
    """

    self.input_files = input_files
    self.file_format = file_format
    self.validation_errors = 0

    # Load the schema before touching the output: opening the output in "w" mode
    # truncates an existing file, which must not happen when the schema path is
    # wrong or the schema is not valid JSON.
    with smart_open.open(schema, encoding="utf-8") as schema_file:
        self.schema = json.load(schema_file)

    # The writer is kept on the instance for the validated objects; stdout is
    # used when no output file is given.
    self.output_writer = (
        smart_open.open(output_file, mode="w", encoding="utf-8")
        if output_file
        else sys.stdout
    )

def next_json_object(self) -> Iterable[dict]:
"""Yield each json object.
def run(self) -> None:
"""Starts the validation process."""

:return: Iterator over json objects.
:rtype: Iterable[dict]
self.process()
if self.validation_errors > 0:
log.error("Validation errors encountered: %d", self.validation_errors)
sys.exit(1)

def process(self) -> None:
"""
Processes each JSON object from the input files, validating them against the
schema.
for infile in self.input_files:
with open(infile, encoding="utf-8") as infile:
for line in infile:
yield json.loads(line)


Invalid objects are logged and skipped.
"""

for jo in self.next_json_object():
try:
jsonschema.validate(instance=jo, schema=self.schema)
except jsonschema.exceptions.ValidationError as e:
log.error("Validation error: %s", e.message)
log.info("Offending JSON object ignored: %s", jo)
self.validation_errors += 1
continue
print(
json.dumps(jo, ensure_ascii=False, separators=(",", ":")),
file=self.output_writer,
)

if __name__ == '__main__':
import argparse
description = ""
epilog = ""
parser = argparse.ArgumentParser(description=description, epilog=epilog)
parser.add_argument('-l', '--logfile', dest='logfile',
help='write log to FILE', metavar='FILE')
parser.add_argument('-v', '--verbose', dest='verbose',default=2,type=int, metavar="LEVEL",
help='set verbosity level: 0=CRITICAL, 1=ERROR, 2=WARNING, 3=INFO 4=DEBUG (default %(default)s)')
def next_json_object(self) -> Iterable[dict]:
    """Yield the JSON values read from the input files, in file order.

    For "jsonl" every non-blank line of each file is parsed as one JSON value.
    For "json" each file is parsed as a single document; a top-level list is
    flattened into its items, any other value is yielded as is.

    Raises:
        ValueError: If ``self.file_format`` is neither "jsonl" nor "json".
            ``json.JSONDecodeError`` (a ValueError subclass) propagates from
            malformed input.
    """

    # Fail loudly on an unknown format; silently yielding nothing would make the
    # validation look successful although no input was read.
    if self.file_format not in ("jsonl", "json"):
        raise ValueError(f"Unsupported file format: {self.file_format!r}")

    for infile_path in self.input_files:
        with smart_open.open(infile_path, encoding="utf-8") as infile:
            if self.file_format == "jsonl":
                for line in infile:
                    if line.strip():  # Skip empty lines
                        yield json.loads(line)
            else:
                data = json.load(infile)
                if isinstance(data, list):
                    yield from data
                else:
                    yield data


def main():
"""
Parses command-line arguments and initiates JSON line file validation.
"""
parser = argparse.ArgumentParser(
description="Validate JSON Lines files against a schema."
)
parser.add_argument("--schema", help="Path to JSON schema file", required=True)
parser.add_argument("-l", "--logfile", help="Write log to FILE", metavar="FILE")

parser.add_argument(
"-o",
"--output-file",
default="/dev/stdout",
help="Output file, writing to stdout if not specified",
"-o", "--output-file", help="Output file (defaults to stdout if not specified)"
)

parser.add_argument("-i", "--input-files", nargs="+", help="Input JSON Lines files")
parser.add_argument(
"-i",
"--input-files",
metavar="JSONL",
nargs="*",
help="Input files, reading from stdin if not provided",
"--file-format",
choices=["json", "jsonl"],
help="File format (jsonl or json)",
default="jsonl",
)
parser.add_argument(
"schema",
metavar="SCHEMA",
help="path to schema",
"--level",
default="INFO",
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
help="Set the logging level. Default: %(default)s",
)

arguments = parser.parse_args()

log_levels = [logging.CRITICAL, logging.ERROR, logging.WARNING,
logging.INFO, logging.DEBUG]
logging.basicConfig(level=log_levels[arguments.verbose],
format='%(asctime)-15s %(levelname)s: %(message)s')
args = parser.parse_args()

to_logging_level = {
"CRITICAL": logging.CRITICAL,
"ERROR": logging.ERROR,
"WARNING": logging.WARNING,
"INFO": logging.INFO,
"DEBUG": logging.DEBUG,
}
logging.basicConfig(
level=to_logging_level[args.level],
format="%(asctime)-15s %(levelname)-8s: %(message)s",
force=True,
)
if args.input_files and args.input_files[0].endswith(".json"):
args.file_format = "json"
log.info("File format set to JSON")

validator = JSONLSchemaValidator(
schema=args.schema,
input_files=args.input_files or [],
output_file=args.output_file,
file_format=args.file_format,
)
validator.run()


jsonl_schema_validator_args = {
"schema",
"output_file",
"input_files",
}
# launching application ...
JSONLSchemaValidator(
**{k: v for k, v in vars(arguments).items() if k in jsonl_schema_validator_args}
).run()
if __name__ == "__main__":
main()

0 comments on commit ebc0feb

Please sign in to comment.