-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #30 from impresso/feature-fix-schemavalidator
Feature fix jsonlschema.py CLI script
- Loading branch information
Showing
4 changed files
with
213 additions
and
81 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# configuration for black compatibility | ||
|
||
[flake8] | ||
max-line-length = 88 | ||
extend-ignore = E203, W503 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
{ | ||
"id": "luxzeit1858-1859-01-18-a-i0026", | ||
"lg_decision": "voting", | ||
"tp": "tb", | ||
"len": 63, | ||
"orig_lg": null, | ||
"alphabetical_ratio": 0.254, | ||
"impresso_language_identifier_version": { | ||
"version": "v1.4.1", | ||
"ts": "2020-12-28T10:27:11+00:00" | ||
}, | ||
"language_identifier_version": { | ||
"version": "v1.4.1", | ||
"ts": "2020-12-28T10:15:45+00:00" | ||
}, | ||
"impresso_ft": [ | ||
{ | ||
"lang": "fr", | ||
"prob": 0.969 | ||
}, | ||
{ | ||
"lang": "de", | ||
"prob": 0.03 | ||
} | ||
], | ||
"langdetect": [ | ||
{ | ||
"lang": "ro", | ||
"prob": 0.667 | ||
}, | ||
{ | ||
"lang": "ca", | ||
"prob": 0.333 | ||
} | ||
], | ||
"langid": [ | ||
{ | ||
"lang": "ro", | ||
"prob": 0.655 | ||
} | ||
], | ||
"wp_ft": [ | ||
{ | ||
"lang": "es", | ||
"prob": 0.305 | ||
}, | ||
{ | ||
"lang": "ca", | ||
"prob": 0.121 | ||
}, | ||
{ | ||
"lang": "war", | ||
"prob": 0.106 | ||
} | ||
], | ||
"votes": [ | ||
{ | ||
"lang": "fr", | ||
"vote": 0.942 | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,109 +1,174 @@ | ||
#!/usr/bin/python3 | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Validate a jsonline file (raw or compressed) against a schema | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
A module to validate JSON Lines files (raw or compressed) against a specified JSON | ||
schema. The script reads files from various sources (local, cloud storage) through | ||
smart_open, logs validation errors, and lets the logging verbosity be configured on | ||
the command line. | ||
__appname__ = "jsonlschema" | ||
__author__ = "[email protected]" | ||
__version__ = "v1.0" | ||
In cases of validation errors, the offending JSON object is logged and skipped. The | ||
script exits with a non-zero status if any validation errors are encountered. | ||
""" | ||
|
||
__author__ = "[email protected]" | ||
__version__ = "v1.1" | ||
|
||
import sys | ||
import logging | ||
import argparse | ||
import json | ||
from jsonschema import validate | ||
from smart_open import open | ||
from typing import Iterable | ||
log = logging.getLogger(__name__) | ||
|
||
sys.stdin.reconfigure(encoding='utf-8') | ||
sys.stdout.reconfigure(encoding='utf-8') | ||
sys.stderr.reconfigure(encoding='utf-8') | ||
import logging | ||
import sys | ||
from typing import Iterable, List, Optional | ||
|
||
import jsonschema | ||
import smart_open | ||
|
||
class JSONLSchemaValidator(object): | ||
log = logging.getLogger(__name__) | ||
|
||
def __init__(self, schema, input_files, output_file=None): | ||
self.input_files = input_files | ||
self.output_writer = open(output_file, mode="w", encoding="utf-8") if output_file is not None else sys.stdout | ||
self.schema = json.load(open(schema,encoding="utf-8")) | ||
|
||
def run(self) -> None: | ||
class JSONLSchemaValidator: | ||
""" | ||
A class to validate JSON objects against a given JSON schema. | ||
try: | ||
self.process() | ||
except: | ||
self.output_writer.close() | ||
Attributes: | ||
input_files (List[str]): A list of paths to input files containing JSON objects. | ||
output_writer (TextIOWrapper): A file writer object for the output file, | ||
defaults to sys.stdout if not specified. | ||
def process(self) -> None: | ||
for jo in self.next_json_object(): | ||
validate(instance=jo, schema=self.schema) | ||
print(json.dumps(jo,ensure_ascii=False,separators=(",", ":")),file=self.output_writer) | ||
schema (dict): The loaded JSON schema against which JSON objects will be | ||
validated. | ||
""" | ||
|
||
def __init__(
    self,
    schema: str,
    input_files: List[str],
    output_file: Optional[str] = None,
    file_format: str = "jsonl",
):
    """
    Initialize the validator with a schema, input files and an optional output.

    Args:
        schema (str): Path to the JSON schema file; it is opened with
            smart_open.open and parsed with json.load.
        input_files (List[str]): Paths of the files to read JSON objects from.
        output_file (Optional[str]): Path to write validated JSON objects to.
            Any falsy value (None or "") selects sys.stdout.
        file_format (str): Format of the input files, "jsonl" by default.
    """

    # Load the schema before touching the output: opening the output in "w"
    # mode truncates it, so a missing or malformed schema must fail first
    # instead of leaving an empty output file and an unclosed handle behind.
    with smart_open.open(schema, encoding="utf-8") as schema_file:
        self.schema = json.load(schema_file)

    self.input_files = input_files
    self.file_format = file_format
    # Counter of objects that failed validation; starts at zero.
    self.validation_errors = 0
    # NOTE(review): this handle is not closed here — confirm who owns its
    # lifetime when output_file is given.
    self.output_writer = (
        smart_open.open(output_file, mode="w", encoding="utf-8")
        if output_file
        else sys.stdout
    )
|
||
def next_json_object(self) -> Iterable[dict]: | ||
"""Yield each json object. | ||
def run(self) -> None: | ||
"""Starts the validation process.""" | ||
|
||
:return: Iterator over json objects. | ||
:rtype: Iterable[dict] | ||
self.process() | ||
if self.validation_errors > 0: | ||
log.error("Validation errors encountered: %d", self.validation_errors) | ||
sys.exit(1) | ||
|
||
def process(self) -> None: | ||
""" | ||
Processes each JSON object from the input files, validating them against the | ||
schema. | ||
for infile in self.input_files: | ||
with open(infile, encoding="utf-8") as infile: | ||
for line in infile: | ||
yield json.loads(line) | ||
|
||
|
||
Invalid objects are logged and skipped. | ||
""" | ||
|
||
for jo in self.next_json_object(): | ||
try: | ||
jsonschema.validate(instance=jo, schema=self.schema) | ||
except jsonschema.exceptions.ValidationError as e: | ||
log.error("Validation error: %s", e.message) | ||
log.info("Offending JSON object ignored: %s", jo) | ||
self.validation_errors += 1 | ||
continue | ||
print( | ||
json.dumps(jo, ensure_ascii=False, separators=(",", ":")), | ||
file=self.output_writer, | ||
) | ||
|
||
if __name__ == '__main__': | ||
import argparse | ||
description = "" | ||
epilog = "" | ||
parser = argparse.ArgumentParser(description=description, epilog=epilog) | ||
parser.add_argument('-l', '--logfile', dest='logfile', | ||
help='write log to FILE', metavar='FILE') | ||
parser.add_argument('-v', '--verbose', dest='verbose',default=2,type=int, metavar="LEVEL", | ||
help='set verbosity level: 0=CRITICAL, 1=ERROR, 2=WARNING, 3=INFO 4=DEBUG (default %(default)s)') | ||
def next_json_object(self) -> Iterable[dict]:
    """
    Yield the JSON objects found in the input files.

    With file_format "jsonl" every non-blank line of each file is parsed as
    one JSON value. With file_format "json" each file is parsed as a single
    JSON document: a top-level list yields its items one by one, any other
    value is yielded as is.

    Raises:
        ValueError: If file_format is neither "jsonl" nor "json". Without
            this check an unsupported format would yield nothing and the
            run would look like a successful validation.
    """

    file_format = self.file_format
    if file_format not in ("jsonl", "json"):
        raise ValueError(
            f"Unsupported file format {file_format!r}; expected 'jsonl' or 'json'"
        )

    for infile_path in self.input_files:
        with smart_open.open(infile_path, encoding="utf-8") as infile:
            if file_format == "jsonl":
                for line in infile:
                    if line.strip():  # Skip empty lines
                        yield json.loads(line)
                continue

            data = json.load(infile)
            if isinstance(data, list):
                yield from data
            else:
                yield data
|
||
|
||
def main(): | ||
""" | ||
Parses command-line arguments and initiates JSON line file validation. | ||
""" | ||
parser = argparse.ArgumentParser( | ||
description="Validate JSON Lines files against a schema." | ||
) | ||
parser.add_argument("--schema", help="Path to JSON schema file", required=True) | ||
parser.add_argument("-l", "--logfile", help="Write log to FILE", metavar="FILE") | ||
|
||
parser.add_argument( | ||
"-o", | ||
"--output-file", | ||
default="/dev/stdout", | ||
help="Output file, writing to stdout if not specified", | ||
"-o", "--output-file", help="Output file (defaults to stdout if not specified)" | ||
) | ||
|
||
parser.add_argument("-i", "--input-files", nargs="+", help="Input JSON Lines files") | ||
parser.add_argument( | ||
"-i", | ||
"--input-files", | ||
metavar="JSONL", | ||
nargs="*", | ||
help="Input files, reading from stdin if not provided", | ||
"--file-format", | ||
choices=["json", "jsonl"], | ||
help="File format (jsonl or json)", | ||
default="jsonl", | ||
) | ||
parser.add_argument( | ||
"schema", | ||
metavar="SCHEMA", | ||
help="path to schema", | ||
"--level", | ||
default="INFO", | ||
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], | ||
help="Set the logging level. Default: %(default)s", | ||
) | ||
|
||
arguments = parser.parse_args() | ||
|
||
log_levels = [logging.CRITICAL, logging.ERROR, logging.WARNING, | ||
logging.INFO, logging.DEBUG] | ||
logging.basicConfig(level=log_levels[arguments.verbose], | ||
format='%(asctime)-15s %(levelname)s: %(message)s') | ||
args = parser.parse_args() | ||
|
||
to_logging_level = { | ||
"CRITICAL": logging.CRITICAL, | ||
"ERROR": logging.ERROR, | ||
"WARNING": logging.WARNING, | ||
"INFO": logging.INFO, | ||
"DEBUG": logging.DEBUG, | ||
} | ||
logging.basicConfig( | ||
level=to_logging_level[args.level], | ||
format="%(asctime)-15s %(levelname)-8s: %(message)s", | ||
force=True, | ||
) | ||
if args.input_files and args.input_files[0].endswith(".json"): | ||
args.file_format = "json" | ||
log.info("File format set to JSON") | ||
|
||
validator = JSONLSchemaValidator( | ||
schema=args.schema, | ||
input_files=args.input_files or [], | ||
output_file=args.output_file, | ||
file_format=args.file_format, | ||
) | ||
validator.run() | ||
|
||
|
||
jsonl_schema_validator_args = { | ||
"schema", | ||
"output_file", | ||
"input_files", | ||
} | ||
# launching application ... | ||
JSONLSchemaValidator( | ||
**{k: v for k, v in vars(arguments).items() if k in jsonl_schema_validator_args} | ||
).run() | ||
if __name__ == "__main__": | ||
main() |