From d56cd8b22a5712ef067d254ef7f1616c1ce71a44 Mon Sep 17 00:00:00 2001
From: "teocalvo2@gmail.com"
Date: Sat, 5 Aug 2023 02:09:10 +0000
Subject: [PATCH] Optimize file readers

---
 src/01.raw/dota/pos_collect.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/01.raw/dota/pos_collect.py b/src/01.raw/dota/pos_collect.py
index 1f83cd4..7558118 100644
--- a/src/01.raw/dota/pos_collect.py
+++ b/src/01.raw/dota/pos_collect.py
@@ -1,18 +1,13 @@
 # Databricks notebook source
 from pyspark.sql import types
+import pandas as pd
 
 # COMMAND ----------
 
 # DBTITLE 1,Consolida collect
-schema = types.StructType(fields=[
-    types.StructField(name="match_id",
-                      dataType=types.StringType())
-])
-
-df = (spark.read
-      .format("json")
-      .schema(schema)
-      .load("/mnt/datalake/dota/matches_details"))
+files = [i.name.split(".")[0] for i in dbutils.fs.ls("/mnt/datalake/dota/matches_details")]
+
+df = spark.createDataFrame(pd.DataFrame({"match_id":files}))
 
 (df.write
  .format("delta")
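
The patch swaps a schema-based JSON scan for a directory listing: the match_id is taken from each file name under /mnt/datalake/dota/matches_details instead of being parsed out of the JSON payloads, so Spark no longer has to open every file. Below is a minimal sketch of that read path as it would run in a Databricks notebook, where spark and dbutils are globals provided by the runtime; it assumes the landing files are named "<match_id>.json", which is implied by the split(".")[0] in the diff but not stated there.

    import pandas as pd

    # Each FileInfo returned by dbutils.fs.ls carries the file name,
    # e.g. "1234567890.json"; the stem before the dot is the match id.
    files = [f.name.split(".")[0]
             for f in dbutils.fs.ls("/mnt/datalake/dota/matches_details")]

    # A one-column pandas frame is enough to build the Spark DataFrame
    # of collected match ids without reading any JSON contents.
    df = spark.createDataFrame(pd.DataFrame({"match_id": files}))

Compared with the removed spark.read.json call, this touches only file metadata rather than file contents, which is the "optimize file readers" gain the subject line refers to.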