Add functionality to specify schema #41

Merged: 39 commits, Sep 1, 2022
Changes from 27 commits

Commits
b626cd7
Only adding auto-detected schema if no schema was provided in the rep…
joosthooz Aug 11, 2022
a83d800
Do not store schemas in metadata for parquet datasets
joosthooz Aug 11, 2022
0db0571
Added basic schema handling
joosthooz Aug 12, 2022
1a5011d
Removed file extension from table names in tables metadata
joosthooz Aug 12, 2022
a742722
Fixed: removing existing file with unlink instead of rmdir
joosthooz Aug 15, 2022
da20871
Now using getattr instead of a name -> function mapping table
joosthooz Aug 15, 2022
406cec0
Created separate function to create arrow datatypes from json
joosthooz Aug 15, 2022
3c87dcd
Added support for types with parameters
joosthooz Aug 15, 2022
0a67b29
Added pytest for parsing schema
joosthooz Aug 15, 2022
50186e5
Fixed bug spotted by review
joosthooz Aug 19, 2022
4a9e3ac
Removed outdated comment
joosthooz Aug 19, 2022
b139a4f
Gathered a set of TODOs in 1 place
joosthooz Aug 19, 2022
1e00a48
Removed TODO, ordering should be preserved since Python 3.7
joosthooz Aug 25, 2022
e6dfd39
Added test for arrow_type_function_lookup
joosthooz Aug 25, 2022
1129d10
Added some tests for parsing arrow datatypes and schemas
joosthooz Aug 25, 2022
477a036
Changed "name" and "args" to "type_name" and "arguments", added more …
joosthooz Aug 26, 2022
860a4d3
Renamed tmpfile to testfile
joosthooz Aug 26, 2022
b1e862c
Added code to generate random data for a dataset with a column of eac…
joosthooz Aug 26, 2022
3ac13ea
Fixed issue with timezone in tests
joosthooz Aug 29, 2022
b775bed
Fix None vs empty string comparison
joosthooz Aug 30, 2022
8885e3f
If a schema is known, don't write out a CSV header line and don't ove…
joosthooz Aug 30, 2022
df615e5
Added csv partitioning conversion test (failing!)
joosthooz Aug 30, 2022
6d395db
Added parquet partitioning conversion test
joosthooz Aug 30, 2022
a2f938d
Removed commented code
joosthooz Aug 30, 2022
84ed050
Sorting table after multi->single partition conversion because datase…
joosthooz Aug 30, 2022
389a548
Added some flags for handling timestamps in parquet
joosthooz Aug 30, 2022
7fcfa3e
Added parquet partitioning conversion test
joosthooz Aug 30, 2022
274a490
Separated out inferred vs user-specified schema, added header-line pro…
joosthooz Aug 31, 2022
cf6d998
Updated Arrow dependency to 9.0.0 because of https://issues.apache.or…
joosthooz Aug 31, 2022
0d5c06b
Fixed cache entry pruning
joosthooz Aug 31, 2022
99f8391
Writing header-line when converting parquet to csv
joosthooz Aug 31, 2022
b09f261
Removed TODO
joosthooz Aug 31, 2022
14941f2
Processed review comments
joosthooz Aug 31, 2022
fae88cb
Formatting
joosthooz Aug 31, 2022
ba487c6
Added info about schema specification to README
joosthooz Aug 31, 2022
a7cd552
Reformatted schema example
joosthooz Aug 31, 2022
c7c6008
Reformatted example schema attempt 3
joosthooz Aug 31, 2022
57157b2
Reformatted example schema attempt 4
joosthooz Aug 31, 2022
b645261
Reformatted example schema attempt 5
joosthooz Aug 31, 2022
204 changes: 168 additions & 36 deletions datalogistik/util.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.


import datetime
import gzip
import hashlib
@@ -20,6 +21,8 @@
import pathlib
import shutil
import time
from collections import OrderedDict
from collections.abc import Mapping

import pyarrow as pa
import urllib3
@@ -54,7 +57,7 @@ def file_visitor(written_file):
# Construct a path to a dataset entry in the cache (possibly not existing yet)
def create_cached_dataset_path(name, scale_factor, format, partitioning_nrows):
local_cache_location = config.get_cache_location()
scale_factor = f"scalefactor_{scale_factor}" if scale_factor != "" else ""
scale_factor = f"scalefactor_{scale_factor}" if scale_factor else ""
partitioning_nrows = f"partitioning_{partitioning_nrows}"
return pathlib.Path(
local_cache_location, name, scale_factor, format, partitioning_nrows
@@ -284,16 +287,108 @@ def schema_to_dict(schema):
return field_dict


def convert_arrow_alias(type_name):
aliases = {
"bool": "bool_",
"halffloat": "float16",
"float": "float32",
"double": "float64",
"decimal": "decimal128",
}
for (alias, func_name) in aliases.items():
if type_name == alias:
return func_name
# no alias was found
return type_name
austin3dickey marked this conversation as resolved.


# Create an instance of the pyarrow datatype with the given name
def arrow_type_function_lookup(function_name):
if isinstance(function_name, str):
function_name = convert_arrow_alias(function_name)
pa_type_func = getattr(pa, function_name)
return pa_type_func

# The argument was not a pyarrow type (maybe a nested structure?)
return None


# Convert a given item (string or dict) to the corresponding Arrow datatype
def arrow_type_from_json(input_type):
arrow_nested_types = {
"list_",
"large_list",
"map_",
"struct",
"dictionary",
# Could be useful for the user to have control over nullability
"field",
}

# In case the type is a simple string
if isinstance(input_type, str):
if input_type in arrow_nested_types:
msg = "Nested types in schema not supported yet"
log.error(msg)
raise ValueError(msg)
return arrow_type_function_lookup(input_type)()

# Alternatively, a type can be encoded as a name:value pair
if not input_type.get("type_name"):
msg = "Schema field type 'type_name' missing"
log.error(msg)
raise ValueError(msg)

type_name = input_type.get("type_name")
args = input_type.get("arguments")
if type_name in arrow_nested_types:
msg = "Nested types in schema not supported yet"
log.error(msg)
raise ValueError(msg)

if args is None:
return arrow_type_function_lookup(type_name)()
if isinstance(args, Mapping):
return arrow_type_function_lookup(type_name)(**args)
elif isinstance(args, list):
log.debug(f"args {args}")
return arrow_type_function_lookup(type_name)(*args)
else: # args is probably a single value
return arrow_type_function_lookup(type_name)(args)
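
A minimal usage sketch (not part of this diff) of how the two accepted encodings map to Arrow types; it assumes the helpers are importable as datalogistik.util:

import pyarrow as pa
from datalogistik.util import arrow_type_from_json

# Plain strings, including aliases such as "float" -> pa.float32()
assert arrow_type_from_json("int64") == pa.int64()
assert arrow_type_from_json("float") == pa.float32()
# Parameterized types pass their arguments as a mapping or a list
assert arrow_type_from_json(
    {"type_name": "timestamp", "arguments": {"unit": "ms"}}
) == pa.timestamp("ms")
assert arrow_type_from_json(
    {"type_name": "decimal", "arguments": [38, 9]}
) == pa.decimal128(38, 9)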


# Convert the given dict to a pyarrow.schema

Collaborator:
If we make all these comments into docstrings they'll pop up nicely for us in editors.

Collaborator (author):
Yeah, I should've done that properly from the start. Maybe we should start a PR that does that and adds type hints.

Collaborator (author):
#59
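
For illustration, a hedged sketch of the docstring-and-type-hints style being discussed (not part of this PR), applied to the convert_arrow_alias helper above:

def convert_arrow_alias(type_name: str) -> str:
    """Map a convenience alias (e.g. "bool", "double") to the name of the
    corresponding pyarrow factory function (e.g. "bool_", "float64")."""
    aliases = {
        "bool": "bool_",
        "halffloat": "float16",
        "float": "float32",
        "double": "float64",
        "decimal": "decimal128",
    }
    # Fall back to the original name when no alias is defined
    return aliases.get(type_name, type_name)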

def get_arrow_schema(input_schema):
log.debug("Converting schema to pyarrow.schema...")
if input_schema is None:
return None
austin3dickey marked this conversation as resolved.
field_list = []
# TODO: a `field()` entry is not a (name, type) tuple
for (field_name, type) in input_schema.items():
log.debug(f"Schema: adding field {field_name}")
arrow_type = arrow_type_from_json(type)
field_list.append(pa.field(field_name, arrow_type))

output_schema = pa.schema(field_list)
return output_schema

Collaborator:
This is super useful. Do we have the reverse (serialize a schema to json) here somewhere as well? (Not sure if it's necessary here at the moment, but I've run into cases where it would've been very handy.) It might be nice to put all this schema stuff in a separate file so when I, um, borrow this code in the future it's in a nice self-contained package.

Collaborator (author):
I know, that would be really handy; the inferred schema was actually a workaround for not having it at first (although after thinking about it, I think it is good to know the difference between a user-specified schema and an inferred one).
The other way around is more difficult, though. It might come down to a large case statement covering all the types with parameters. I remember seeing some Rust code for this at some point, maybe in datafusion, but I couldn't find it anymore.
Let's implement this in the near future.

Side comment: I found this old spec for an Arrow schema JSON representation: https://github.com/apache/arrow/pull/158/files
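
A hypothetical sketch of the reverse direction discussed here (not implemented in this PR): serializing a pyarrow type back into the {"type_name": ..., "arguments": ...} form, with two parameterized types spelled out and a string fallback for the rest:

import pyarrow as pa

def arrow_type_to_json(dtype):
    # Parameterized types need their arguments written out explicitly
    if pa.types.is_timestamp(dtype):
        return {"type_name": "timestamp",
                "arguments": {"unit": dtype.unit, "tz": dtype.tz}}
    if pa.types.is_decimal(dtype):
        return {"type_name": "decimal128",
                "arguments": {"precision": dtype.precision, "scale": dtype.scale}}
    # Non-parameterized types round-trip through their string name, e.g. "int64"
    return str(dtype)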



# Create Arrow Dataset for a given input file
def get_dataset(input_file, dataset_info, table_name=None):
column_list = None # Default
if dataset_info["format"] == "parquet":
# Defaults
column_list = None
schema = None
format = dataset_info["format"]
if format == "parquet":
dataset_read_format = ds.ParquetFileFormat()
if dataset_info["format"] == "csv":
if format == "csv":
# defaults
po = csv.ParseOptions()
ro = csv.ReadOptions() # autogenerate_column_names=True)
ro = csv.ReadOptions()
co = csv.ConvertOptions()
# TODO: Should we autogenerate column names by default?
# Or add a property in the metadata about it?
# or allow a fall-back to read_csv in case schema detection fails?

Collaborator:
We've got both csv datasets with no headers (fanniemae) and with headers (nyctaxi). If we're specifying a schema, we should use the names specified there. If we're not... well, we probably should be.

Collaborator:
But we should be careful not to accidentally skip the first line if there are no headers and we overwrite the names inferred from the first line of data; we need to store metadata about this for each dataset.

Collaborator (author):
I added a property for this! We can now handle both. The default is no header line, so field names will be auto-generated. I had to upgrade our Arrow version for this, though, because I reported a problem with field name auto-generation that was resolved in v9.0.0: https://issues.apache.org/jira/browse/ARROW-16436
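
An illustrative sketch of that header handling (the exact metadata property name used by this PR may differ; "header-line" is assumed here):

from pyarrow import csv

def make_read_options(dataset_info, column_names=None):
    if dataset_info.get("header-line"):
        # First line holds the column names; let the CSV reader consume it
        return csv.ReadOptions()
    if column_names:
        # No header line but a known schema: pass the names explicitly so the
        # first data row is not mistaken for a header
        return csv.ReadOptions(column_names=column_names)
    # No header and no schema: auto-generate f0, f1, ... (needs Arrow >= 9.0.0
    # because of ARROW-16436)
    return csv.ReadOptions(autogenerate_column_names=True)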


if "delim" in dataset_info:
po = csv.ParseOptions(delimiter=dataset_info["delim"])
@@ -312,15 +407,25 @@ def get_dataset(input_file, dataset_info, table_name=None):
column_types_trailed = column_types.copy()
column_types_trailed["trailing_columns"] = pa.string()
ro = csv.ReadOptions(
column_names=column_types_trailed.keys(), encoding="ISO-8859"
column_names=column_types_trailed.keys(),
encoding="iso8859" if dataset_info["name"] == "tpc-ds" else "utf8",
)
co = csv.ConvertOptions(column_types=column_types_trailed)
else: # not a TPC dataset
if dataset_info.get("tables"):
log.debug("Found schema information in metadata")
for table_entry in dataset_info.get("tables"):
if table_name is None or table_entry["table"] == table_name:
schema = get_arrow_schema(table_entry["schema"])
column_names = list(table_entry["schema"].keys())
break
ro = csv.ReadOptions(column_names=column_names)

dataset_read_format = ds.CsvFileFormat(
read_options=ro, parse_options=po, convert_options=co
)

dataset = ds.dataset(input_file, format=dataset_read_format)
dataset = ds.dataset(input_file, schema=schema, format=dataset_read_format)
scanner = dataset.scanner(columns=column_list)
return dataset, scanner

@@ -358,7 +463,7 @@ def convert_dataset(
raise ValueError(msg)

with open(cached_dataset_metadata_file) as f:
dataset_metadata = json.load(f)
dataset_metadata = json.load(f, object_pairs_hook=OrderedDict)

if (dataset_metadata["format"] == new_format) and (old_nrows == new_nrows):
log.info("Conversion not needed.")
@@ -383,10 +488,18 @@
if parquet_compression is None:
parquet_compression = "snappy" # Use snappy by default
write_options = dataset_write_format.make_write_options(
compression=parquet_compression
compression=parquet_compression,
use_deprecated_int96_timestamps=False,
coerce_timestamps="us",

Collaborator:
If you've upgraded to Arrow 9.0.0, this may be unnecessary now; some of the weird behavior around this was part of the old parquet versions, and the default version got bumped with 9.0.0. Fine to leave here for safety, though.

allow_truncated_timestamps=True,
)
if new_format == "csv":
dataset_write_format = ds.CsvFileFormat()
# Don't include header if there's a known schema
if dataset_info.get("tables"):
write_options = dataset_write_format.make_write_options(
include_header=False
)

ds.write_dataset(
scanner,
Expand All @@ -411,22 +524,19 @@ def convert_dataset(

metadata_table_list.append(
{
"table": f"{file_name}.{new_format}",
"table": file_name,
"schema": schema_to_dict(dataset.schema),
}
)

# TODO: The dataset API does a poor job at detecting the schema.
# Would be nice to be able to fall back to read/write_csv etc.
# Another option is to store the schema as metadata in the repo and pass it
# to dataset.
# It would also be nice to detect/provide option whether the first line
# contains column names.

conv_time = time.perf_counter() - conv_start
log.info("Finished conversion.")
log.debug(f"conversion took {conv_time:0.2f} s")
dataset_info["tables"] = metadata_table_list
# Parquet already stores the schema internally
if new_format == "csv":
# Don't overwrite schema if it is already known
if dataset_info.get("tables") is None:
dataset_info["tables"] = metadata_table_list
dataset_info["format"] = new_format
dataset_info["partitioning-nrows"] = new_nrows
if parquet_compression is not None:
@@ -467,18 +577,31 @@ def generate_dataset(dataset_info, argument_info):
out_dir=cached_dataset_path, scale_factor=argument_info.scale_factor
)

metadata_table_list = []
for table in tpc_info.tpc_table_names[dataset_name]:
input_file = pathlib.Path(cached_dataset_path, table + ".csv")
dataset, scanner = get_dataset(input_file, dataset_info, table)
metadata_table_list.append(
{"table": table + ".csv", "schema": schema_to_dict(dataset.schema)}
)
# If the entry in the repo file does not specify the schema, try to detect it
if not dataset_info.get("tables"):
metadata_table_list = []
for table in tpc_info.tpc_table_names[dataset_name]:
input_file = pathlib.Path(cached_dataset_path, table + ".csv")
try:
dataset, scanner = get_dataset(input_file, dataset_info, table)
metadata_table_list.append(
{
"table": table,
"schema": schema_to_dict(dataset.schema),
}
)
except Exception:
log.error(
f"pyarrow.dataset is unable to read schema from generated file {input_file}"
)
clean_cache_dir(cached_dataset_path)
raise

dataset_info["tables"] = metadata_table_list

gen_time = time.perf_counter() - gen_start
log.info("Finished generating.")
log.debug(f"generation took {gen_time:0.2f} s")
dataset_info["tables"] = metadata_table_list
write_metadata(dataset_info, cached_dataset_path)

except Exception:
@@ -534,7 +657,7 @@ def download_dataset(dataset_info, argument_info):
# so something could have gone wrong while downloading/converting previously
if dataset_file_path.exists():
log.debug(f"Removing existing file '{dataset_file_path}'")
dataset_file_path.rmdir()
dataset_file_path.unlink()
url = dataset_info["url"]
try:
http = urllib3.PoolManager()
@@ -557,15 +680,24 @@
dataset_file_name = removesuffix(dataset_file_name, "." + compression)
dataset_file_path = removesuffix(dataset_file_path, "." + compression)

try:
dataset, scanner = get_dataset(dataset_file_path, dataset_info)
dataset_info["tables"] = [
{"table": str(dataset_file_name), "schema": schema_to_dict(dataset.schema)}
]
except Exception:
log.error("pyarrow.dataset is unable to read downloaded file")
clean_cache_dir(cached_dataset_path)
raise
# Parquet already stores the schema internally
if dataset_info["format"] == "csv":
# If the entry in the repo file does not specify the schema, try to detect it
if not dataset_info.get("tables"):
try:
dataset, scanner = get_dataset(dataset_file_path, dataset_info)
dataset_info["tables"] = [
{
"table": str(pathlib.Path(dataset_file_name).stem),
"schema": schema_to_dict(dataset.schema),
}
]
except Exception:
log.error(
"pyarrow.dataset is unable to read schema from downloaded file"
)
clean_cache_dir(cached_dataset_path)
raise

if dataset_info.get("files"):
# In this case, the dataset info contained checksums. Check them
2 changes: 1 addition & 1 deletion repo.json
@@ -7,7 +7,7 @@
"dim" : [22180168, 31],
"format" : "csv",
"file-compression" : "gz",
"tables": [{"table" : "fanniemae_2016Q4.csv", "schema" : {
"tables": [{"table" : "2016Q4", "schema" : {
"LOAN_ID" : "string",
"ACT_PERIOD" : "string",
"SERVICER" : "string",
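
For illustration only (not part of this diff), a schema entry in the repo file can also declare parameterized column types using the type_name/arguments form accepted by arrow_type_from_json; the table and column names below are hypothetical:

{"table": "example_table", "schema": {
    "id": "int64",
    "price": {"type_name": "decimal", "arguments": {"precision": 10, "scale": 2}},
    "created_at": {"type_name": "timestamp", "arguments": {"unit": "ms"}}
}}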