diff --git a/queries/modin/q1.py b/queries/modin/q1.py index b6460d3..22b1b43 100644 --- a/queries/modin/q1.py +++ b/queries/modin/q1.py @@ -10,33 +10,28 @@ def q() -> None: - VAR1 = date(1998, 9, 2) - - lineitem = utils.get_line_item_ds + line_item_ds = utils.get_line_item_ds # first call one time to cache in case we don't include the IO times - lineitem() + line_item_ds() def query() -> pd.DataFrame: - nonlocal lineitem - lineitem = lineitem() + nonlocal line_item_ds + line_item_ds = line_item_ds() + + var1 = date(1998, 9, 2) - sel = lineitem.l_shipdate <= VAR1 - lineitem_filtered = lineitem[sel] + filt = line_item_ds[line_item_ds["l_shipdate"] <= var1] # This is lenient towards pandas as normally an optimizer should decide # that this could be computed before the groupby aggregation. # Other implementations don't enjoy this benefit. - lineitem_filtered["disc_price"] = lineitem_filtered.l_extendedprice * ( - 1 - lineitem_filtered.l_discount - ) - lineitem_filtered["charge"] = ( - lineitem_filtered.l_extendedprice - * (1 - lineitem_filtered.l_discount) - * (1 + lineitem_filtered.l_tax) + filt["disc_price"] = filt.l_extendedprice * (1.0 - filt.l_discount) + filt["charge"] = ( + filt.l_extendedprice * (1.0 - filt.l_discount) * (1.0 + filt.l_tax) ) - gb = lineitem_filtered.groupby(["l_returnflag", "l_linestatus"], as_index=False) - total = gb.agg( + gb = filt.groupby(["l_returnflag", "l_linestatus"], as_index=False) + agg = gb.agg( sum_qty=pd.NamedAgg(column="l_quantity", aggfunc="sum"), sum_base_price=pd.NamedAgg(column="l_extendedprice", aggfunc="sum"), sum_disc_price=pd.NamedAgg(column="disc_price", aggfunc="sum"), @@ -47,7 +42,7 @@ def query() -> pd.DataFrame: count_order=pd.NamedAgg(column="l_orderkey", aggfunc="size"), ) - result_df = total.sort_values(["l_returnflag", "l_linestatus"]) + result_df = agg.sort_values(["l_returnflag", "l_linestatus"]) return result_df diff --git a/queries/modin/q2.py b/queries/modin/q2.py index 0a4a2d0..0e2c77f 100644 --- 
a/queries/modin/q2.py +++ b/queries/modin/q2.py @@ -11,10 +11,6 @@ def q() -> None: - var1 = 15 - var2 = "BRASS" - var3 = "EUROPE" - region_ds = utils.get_region_ds nation_ds = utils.get_nation_ds supplier_ds = utils.get_supplier_ds @@ -40,96 +36,26 @@ def query() -> pd.DataFrame: part_ds = part_ds() part_supp_ds = part_supp_ds() - nation_filtered = nation_ds.loc[:, ["n_nationkey", "n_name", "n_regionkey"]] - region_filtered = region_ds[(region_ds["r_name"] == var3)] - region_filtered = region_filtered.loc[:, ["r_regionkey"]] - r_n_merged = nation_filtered.merge( - region_filtered, left_on="n_regionkey", right_on="r_regionkey", how="inner" - ) - r_n_merged = r_n_merged.loc[:, ["n_nationkey", "n_name"]] - supplier_filtered = supplier_ds.loc[ - :, - [ - "s_suppkey", - "s_name", - "s_address", - "s_nationkey", - "s_phone", - "s_acctbal", - "s_comment", - ], - ] - s_r_n_merged = r_n_merged.merge( - supplier_filtered, - left_on="n_nationkey", - right_on="s_nationkey", - how="inner", - ) - s_r_n_merged = s_r_n_merged.loc[ - :, - [ - "n_name", - "s_suppkey", - "s_name", - "s_address", - "s_phone", - "s_acctbal", - "s_comment", - ], - ] - partsupp_filtered = part_supp_ds.loc[ - :, ["ps_partkey", "ps_suppkey", "ps_supplycost"] - ] - ps_s_r_n_merged = s_r_n_merged.merge( - partsupp_filtered, left_on="s_suppkey", right_on="ps_suppkey", how="inner" - ) - ps_s_r_n_merged = ps_s_r_n_merged.loc[ - :, - [ - "n_name", - "s_name", - "s_address", - "s_phone", - "s_acctbal", - "s_comment", - "ps_partkey", - "ps_supplycost", - ], - ] - part_filtered = part_ds.loc[:, ["p_partkey", "p_mfgr", "p_size", "p_type"]] - part_filtered = part_filtered[ - (part_filtered["p_size"] == var1) - & (part_filtered["p_type"].str.endswith(var2)) - ] - part_filtered = part_filtered.loc[:, ["p_partkey", "p_mfgr"]] - merged_df = part_filtered.merge( - ps_s_r_n_merged, left_on="p_partkey", right_on="ps_partkey", how="inner" - ) - merged_df = merged_df.loc[ - :, - [ - "n_name", - "s_name", - "s_address", - 
"s_phone", - "s_acctbal", - "s_comment", - "ps_supplycost", - "p_partkey", - "p_mfgr", - ], - ] - min_values = merged_df.groupby("p_partkey", as_index=False)[ - "ps_supplycost" - ].min() - min_values.columns = ["P_PARTKEY_CPY", "MIN_SUPPLYCOST"] - merged_df = merged_df.merge( - min_values, - left_on=["p_partkey", "ps_supplycost"], - right_on=["P_PARTKEY_CPY", "MIN_SUPPLYCOST"], - how="inner", + var1 = 15 + var2 = "BRASS" + var3 = "EUROPE" + + jn = ( + part_ds.merge(part_supp_ds, left_on="p_partkey", right_on="ps_partkey") + .merge(supplier_ds, left_on="ps_suppkey", right_on="s_suppkey") + .merge(nation_ds, left_on="s_nationkey", right_on="n_nationkey") + .merge(region_ds, left_on="n_regionkey", right_on="r_regionkey") ) - result_df = merged_df.loc[ + + jn = jn[jn["p_size"] == var1] + jn = jn[jn["p_type"].str.endswith(var2)] + jn = jn[jn["r_name"] == var3] + + gb = jn.groupby("p_partkey", as_index=False) + agg = gb["ps_supplycost"].min() + jn2 = agg.merge(jn, on=["p_partkey", "ps_supplycost"]) + + sel = jn2.loc[ :, [ "s_acctbal", @@ -142,20 +68,12 @@ def query() -> pd.DataFrame: "s_comment", ], ] - result_df = result_df.sort_values( - by=[ - "s_acctbal", - "n_name", - "s_name", - "p_partkey", - ], - ascending=[ - False, - True, - True, - True, - ], - ).head(100) + + sort = sel.sort_values( + by=["s_acctbal", "n_name", "s_name", "p_partkey"], + ascending=[False, True, True, True], + ) + result_df = sort.head(100) return result_df diff --git a/queries/modin/q3.py b/queries/modin/q3.py index 7f86646..37c42a5 100644 --- a/queries/modin/q3.py +++ b/queries/modin/q3.py @@ -12,9 +12,6 @@ def q() -> None: - var1 = var2 = date(1995, 3, 15) - var3 = "BUILDING" - customer_ds = utils.get_customer_ds line_item_ds = utils.get_line_item_ds orders_ds = utils.get_orders_ds @@ -32,33 +29,29 @@ def query() -> pd.DataFrame: line_item_ds = line_item_ds() orders_ds = orders_ds() - lineitem_filtered = line_item_ds.loc[ - :, ["l_orderkey", "l_extendedprice", "l_discount", "l_shipdate"] - ] 
- orders_filtered = orders_ds.loc[ - :, ["o_orderkey", "o_custkey", "o_orderdate", "o_shippriority"] - ] - customer_filtered = customer_ds.loc[:, ["c_mktsegment", "c_custkey"]] - lsel = lineitem_filtered.l_shipdate > var1 - osel = orders_filtered.o_orderdate < var2 - csel = customer_filtered.c_mktsegment == var3 - flineitem = lineitem_filtered[lsel] - forders = orders_filtered[osel] - fcustomer = customer_filtered[csel] - jn1 = fcustomer.merge(forders, left_on="c_custkey", right_on="o_custkey") - jn2 = jn1.merge(flineitem, left_on="o_orderkey", right_on="l_orderkey") + var1 = "BUILDING" + var2 = date(1995, 3, 15) + + fcustomer = customer_ds[customer_ds["c_mktsegment"] == var1] + + jn1 = fcustomer.merge(orders_ds, left_on="c_custkey", right_on="o_custkey") + jn2 = jn1.merge(line_item_ds, left_on="o_orderkey", right_on="l_orderkey") + + jn2 = jn2[jn2["o_orderdate"] < var2] + jn2 = jn2[jn2["l_shipdate"] > var2] jn2["revenue"] = jn2.l_extendedprice * (1 - jn2.l_discount) - total = ( - jn2.groupby( - ["l_orderkey", "o_orderdate", "o_shippriority"], as_index=False - )["revenue"] - .sum() - .sort_values(["revenue"], ascending=False) + gb = jn2.groupby( + ["o_orderkey", "o_orderdate", "o_shippriority"], as_index=False ) - result_df = total.head(10).loc[ - :, ["l_orderkey", "revenue", "o_orderdate", "o_shippriority"] - ] + agg = gb["revenue"].sum() + + sel = agg.loc[:, ["o_orderkey", "revenue", "o_orderdate", "o_shippriority"]] + sel = sel.rename({"o_orderkey": "l_orderkey"}, axis="columns") + + sort = sel.sort_values(by=["revenue", "o_orderdate"], ascending=[False, True]) + result_df = sort.head(10) + return result_df utils.run_query(Q_NUM, query) diff --git a/queries/modin/q4.py b/queries/modin/q4.py index 69bc46f..d14ea46 100644 --- a/queries/modin/q4.py +++ b/queries/modin/q4.py @@ -1,20 +1,15 @@ from __future__ import annotations from datetime import date -from typing import TYPE_CHECKING -from queries.modin import utils +import modin.pandas as pd -if 
TYPE_CHECKING: - import modin.pandas as pd +from queries.modin import utils Q_NUM = 4 def q() -> None: - date1 = date(1993, 10, 1) - date2 = date(1993, 7, 1) - line_item_ds = utils.get_line_item_ds orders_ds = utils.get_orders_ds @@ -28,17 +23,21 @@ def query() -> pd.DataFrame: line_item_ds = line_item_ds() orders_ds = orders_ds() - lsel = line_item_ds.l_commitdate < line_item_ds.l_receiptdate - osel = (orders_ds.o_orderdate < date1) & (orders_ds.o_orderdate >= date2) - flineitem = line_item_ds[lsel] - forders = orders_ds[osel] - jn = forders[forders["o_orderkey"].isin(flineitem["l_orderkey"])] - result_df = ( - jn.groupby("o_orderpriority", as_index=False)["o_orderkey"] - .count() - .sort_values(["o_orderpriority"]) - .rename(columns={"o_orderkey": "order_count"}) - ) + var1 = date(1993, 7, 1) + var2 = date(1993, 10, 1) + + jn = line_item_ds.merge(orders_ds, left_on="l_orderkey", right_on="o_orderkey") + + jn = jn[(jn["o_orderdate"] >= var1) & (jn["o_orderdate"] < var2)] + jn = jn[jn["l_commitdate"] < jn["l_receiptdate"]] + + jn = jn.drop_duplicates(subset=["o_orderpriority", "l_orderkey"]) + + gb = jn.groupby("o_orderpriority", as_index=False) + agg = gb.agg(order_count=pd.NamedAgg(column="o_orderkey", aggfunc="count")) + + result_df = agg.sort_values(["o_orderpriority"]) + return result_df utils.run_query(Q_NUM, query) diff --git a/queries/modin/q5.py b/queries/modin/q5.py index 5d09a71..3684aa8 100644 --- a/queries/modin/q5.py +++ b/queries/modin/q5.py @@ -12,9 +12,6 @@ def q() -> None: - date1 = date(1994, 1, 1) - date2 = date(1995, 1, 1) - region_ds = utils.get_region_ds nation_ds = utils.get_nation_ds customer_ds = utils.get_customer_ds @@ -37,7 +34,6 @@ def query() -> pd.DataFrame: nonlocal line_item_ds nonlocal orders_ds nonlocal supplier_ds - region_ds = region_ds() nation_ds = nation_ds() customer_ds = customer_ds() @@ -45,22 +41,24 @@ def query() -> pd.DataFrame: orders_ds = orders_ds() supplier_ds = supplier_ds() + var1 = "ASIA" + var2 = date(1994, 1, 
1) + var3 = date(1995, 1, 1) + jn1 = region_ds.merge(nation_ds, left_on="r_regionkey", right_on="n_regionkey") jn2 = jn1.merge(customer_ds, left_on="n_nationkey", right_on="c_nationkey") jn3 = jn2.merge(orders_ds, left_on="c_custkey", right_on="o_custkey") jn4 = jn3.merge(line_item_ds, left_on="o_orderkey", right_on="l_orderkey") - jn5 = supplier_ds.merge( - jn4, - left_on=["s_suppkey", "s_nationkey"], - right_on=["l_suppkey", "n_nationkey"], + jn5 = jn4.merge( + supplier_ds, + left_on=["l_suppkey", "n_nationkey"], + right_on=["s_suppkey", "s_nationkey"], ) + + jn5 = jn5[jn5["r_name"] == var1] + jn5 = jn5[(jn5["o_orderdate"] >= var2) & (jn5["o_orderdate"] < var3)] jn5["revenue"] = jn5.l_extendedprice * (1.0 - jn5.l_discount) - jn5 = jn5[ - (jn5.o_orderdate >= date1) - & (jn5.o_orderdate < date2) - & (jn5.r_name == "ASIA") - ] gb = jn5.groupby("n_name", as_index=False)["revenue"].sum() result_df = gb.sort_values("revenue", ascending=False) diff --git a/queries/modin/q6.py b/queries/modin/q6.py index 712d570..0269715 100644 --- a/queries/modin/q6.py +++ b/queries/modin/q6.py @@ -10,10 +10,6 @@ def q() -> None: - date1 = date(1994, 1, 1) - date2 = date(1995, 1, 1) - var3 = 24 - line_item_ds = utils.get_line_item_ds # first call one time to cache in case we don't include the IO times @@ -23,20 +19,22 @@ def query() -> pd.DataFrame: nonlocal line_item_ds line_item_ds = line_item_ds() - lineitem_filtered = line_item_ds.loc[ - :, ["l_quantity", "l_extendedprice", "l_discount", "l_shipdate"] + var1 = date(1994, 1, 1) + var2 = date(1995, 1, 1) + var3 = 0.05 + var4 = 0.07 + var5 = 24 + + flineitem = line_item_ds[ + (line_item_ds["l_shipdate"] >= var1) & (line_item_ds["l_shipdate"] < var2) ] - sel = ( - (lineitem_filtered.l_shipdate >= date1) - & (lineitem_filtered.l_shipdate < date2) - & (lineitem_filtered.l_discount >= 0.05) - & (lineitem_filtered.l_discount <= 0.07) - & (lineitem_filtered.l_quantity < var3) - ) - - flineitem = lineitem_filtered[sel] - result_value = 
(flineitem.l_extendedprice * flineitem.l_discount).sum() + flineitem = flineitem[ + (flineitem["l_discount"] >= var3) & (flineitem["l_discount"] <= var4) + ] + flineitem = flineitem[flineitem["l_quantity"] < var5] + result_value = (flineitem["l_extendedprice"] * flineitem["l_discount"]).sum() result_df = pd.DataFrame({"revenue": [result_value]}) + return result_df utils.run_query(Q_NUM, query) diff --git a/queries/modin/q7.py b/queries/modin/q7.py index 5d1734d..6d1d9ab 100644 --- a/queries/modin/q7.py +++ b/queries/modin/q7.py @@ -10,9 +10,6 @@ def q() -> None: - var1 = date(1995, 1, 1) - var2 = date(1997, 1, 1) - nation_ds = utils.get_nation_ds customer_ds = utils.get_customer_ds line_item_ds = utils.get_line_item_ds @@ -32,102 +29,50 @@ def query() -> pd.DataFrame: nonlocal line_item_ds nonlocal orders_ds nonlocal supplier_ds - nation_ds = nation_ds() customer_ds = customer_ds() line_item_ds = line_item_ds() orders_ds = orders_ds() supplier_ds = supplier_ds() - lineitem_filtered = line_item_ds[ - (line_item_ds["l_shipdate"] >= var1) & (line_item_ds["l_shipdate"] < var2) - ] - lineitem_filtered["l_year"] = lineitem_filtered["l_shipdate"].dt.year - lineitem_filtered["revenue"] = lineitem_filtered["l_extendedprice"] * ( - 1.0 - lineitem_filtered["l_discount"] - ) - lineitem_filtered = lineitem_filtered.loc[ - :, ["l_orderkey", "l_suppkey", "l_year", "revenue"] - ] - supplier_filtered = supplier_ds.loc[:, ["s_suppkey", "s_nationkey"]] - orders_filtered = orders_ds.loc[:, ["o_orderkey", "o_custkey"]] - customer_filtered = customer_ds.loc[:, ["c_custkey", "c_nationkey"]] - n1 = nation_ds[(nation_ds["n_name"] == "FRANCE")].loc[ - :, ["n_nationkey", "n_name"] - ] - n2 = nation_ds[(nation_ds["n_name"] == "GERMANY")].loc[ - :, ["n_nationkey", "n_name"] - ] - - # ----- do nation 1 ----- - N1_C = customer_filtered.merge( - n1, left_on="c_nationkey", right_on="n_nationkey", how="inner" - ) - N1_C = N1_C.drop(columns=["c_nationkey", "n_nationkey"]).rename( - 
columns={"n_name": "cust_nation"} - ) - N1_C_O = N1_C.merge( - orders_filtered, left_on="c_custkey", right_on="o_custkey", how="inner" - ) - N1_C_O = N1_C_O.drop(columns=["c_custkey", "o_custkey"]) - - N2_S = supplier_filtered.merge( - n2, left_on="s_nationkey", right_on="n_nationkey", how="inner" - ) - N2_S = N2_S.drop(columns=["s_nationkey", "n_nationkey"]).rename( - columns={"n_name": "supp_nation"} - ) - N2_S_L = N2_S.merge( - lineitem_filtered, left_on="s_suppkey", right_on="l_suppkey", how="inner" - ) - N2_S_L = N2_S_L.drop(columns=["s_suppkey", "l_suppkey"]) - - total1 = N1_C_O.merge( - N2_S_L, left_on="o_orderkey", right_on="l_orderkey", how="inner" - ) - total1 = total1.drop(columns=["o_orderkey", "l_orderkey"]) - - # ----- do nation 2 ----- (same as nation 1 section but with nation 2) - N2_C = customer_filtered.merge( - n2, left_on="c_nationkey", right_on="n_nationkey", how="inner" - ) - N2_C = N2_C.drop(columns=["c_nationkey", "n_nationkey"]).rename( - columns={"n_name": "cust_nation"} - ) - N2_C_O = N2_C.merge( - orders_filtered, left_on="c_custkey", right_on="o_custkey", how="inner" - ) - N2_C_O = N2_C_O.drop(columns=["c_custkey", "o_custkey"]) - - N1_S = supplier_filtered.merge( - n1, left_on="s_nationkey", right_on="n_nationkey", how="inner" - ) - N1_S = N1_S.drop(columns=["s_nationkey", "n_nationkey"]).rename( - columns={"n_name": "supp_nation"} - ) - N1_S_L = N1_S.merge( - lineitem_filtered, left_on="s_suppkey", right_on="l_suppkey", how="inner" - ) - N1_S_L = N1_S_L.drop(columns=["s_suppkey", "l_suppkey"]) - - total2 = N2_C_O.merge( - N1_S_L, left_on="o_orderkey", right_on="l_orderkey", how="inner" - ) - total2 = total2.drop(columns=["o_orderkey", "l_orderkey"]) - - # concat results - total = pd.concat([total1, total2]) - result_df = ( - total.groupby(["supp_nation", "cust_nation", "l_year"]) - .revenue.agg("sum") - .reset_index() - ) - result_df.columns = ["supp_nation", "cust_nation", "l_year", "revenue"] - - result_df = result_df.sort_values( - 
by=["supp_nation", "cust_nation", "l_year"], - ascending=[True, True, True], - ) + var1 = "FRANCE" + var2 = "GERMANY" + var3 = date(1995, 1, 1) + var4 = date(1996, 12, 31) + + n1 = nation_ds[(nation_ds["n_name"] == var1)] + n2 = nation_ds[(nation_ds["n_name"] == var2)] + + # Part 1 + jn1 = customer_ds.merge(n1, left_on="c_nationkey", right_on="n_nationkey") + jn2 = jn1.merge(orders_ds, left_on="c_custkey", right_on="o_custkey") + jn2 = jn2.rename({"n_name": "cust_nation"}, axis="columns") + jn3 = jn2.merge(line_item_ds, left_on="o_orderkey", right_on="l_orderkey") + jn4 = jn3.merge(supplier_ds, left_on="l_suppkey", right_on="s_suppkey") + jn5 = jn4.merge(n2, left_on="s_nationkey", right_on="n_nationkey") + df1 = jn5.rename({"n_name": "supp_nation"}, axis="columns") + + # Part 2 + jn1 = customer_ds.merge(n2, left_on="c_nationkey", right_on="n_nationkey") + jn2 = jn1.merge(orders_ds, left_on="c_custkey", right_on="o_custkey") + jn2 = jn2.rename({"n_name": "cust_nation"}, axis="columns") + jn3 = jn2.merge(line_item_ds, left_on="o_orderkey", right_on="l_orderkey") + jn4 = jn3.merge(supplier_ds, left_on="l_suppkey", right_on="s_suppkey") + jn5 = jn4.merge(n1, left_on="s_nationkey", right_on="n_nationkey") + df2 = jn5.rename({"n_name": "supp_nation"}, axis="columns") + + # Combine + total = pd.concat([df1, df2]) + + total = total[(total["l_shipdate"] >= var3) & (total["l_shipdate"] <= var4)] + total["volume"] = total["l_extendedprice"] * (1.0 - total["l_discount"]) + total["l_year"] = total["l_shipdate"].dt.year + + gb = total.groupby(["supp_nation", "cust_nation", "l_year"], as_index=False) + agg = gb.agg(revenue=pd.NamedAgg(column="volume", aggfunc="sum")) + + result_df = agg.sort_values(by=["supp_nation", "cust_nation", "l_year"]) + return result_df utils.run_query(Q_NUM, query) diff --git a/queries/modin/q8.py b/queries/modin/q8.py index 18aa57a..6494afe 100644 --- a/queries/modin/q8.py +++ b/queries/modin/q8.py @@ -1,9 +1,13 @@ from __future__ import annotations 
-import modin.pandas as pd +from datetime import date +from typing import TYPE_CHECKING from queries.modin import utils +if TYPE_CHECKING: + import modin.pandas as pd + Q_NUM = 8 @@ -33,75 +37,53 @@ def query() -> pd.DataFrame: nonlocal part_ds nonlocal region_ds nonlocal supplier_ds - customer_ds = customer_ds() - lineitem_ds = line_item_ds() + line_item_ds = line_item_ds() nation_ds = nation_ds() orders_ds = orders_ds() part_ds = part_ds() region_ds = region_ds() supplier_ds = supplier_ds() - part_filtered = part_ds[(part_ds["p_type"] == "ECONOMY ANODIZED STEEL")] - part_filtered = part_filtered.loc[:, ["p_partkey"]] - lineitem_filtered = lineitem_ds.loc[:, ["l_partkey", "l_suppkey", "l_orderkey"]] - lineitem_filtered["volume"] = lineitem_ds["l_extendedprice"] * ( - 1.0 - lineitem_ds["l_discount"] - ) - total = part_filtered.merge( - lineitem_filtered, left_on="p_partkey", right_on="l_partkey", how="inner" - ) - total = total.loc[:, ["l_suppkey", "l_orderkey", "volume"]] - supplier_filtered = supplier_ds.loc[:, ["s_suppkey", "s_nationkey"]] - total = total.merge( - supplier_filtered, left_on="l_suppkey", right_on="s_suppkey", how="inner" - ) - total = total.loc[:, ["l_orderkey", "volume", "s_nationkey"]] - orders_filtered = orders_ds[ - (orders_ds["o_orderdate"] >= pd.Timestamp("1995-01-01")) - & (orders_ds["o_orderdate"] < pd.Timestamp("1997-01-01")) - ] - orders_filtered["o_year"] = orders_filtered["o_orderdate"].dt.year - orders_filtered = orders_filtered.loc[:, ["o_orderkey", "o_custkey", "o_year"]] - total = total.merge( - orders_filtered, left_on="l_orderkey", right_on="o_orderkey", how="inner" - ) - total = total.loc[:, ["volume", "s_nationkey", "o_custkey", "o_year"]] - customer_filtered = customer_ds.loc[:, ["c_custkey", "c_nationkey"]] - total = total.merge( - customer_filtered, left_on="o_custkey", right_on="c_custkey", how="inner" - ) - total = total.loc[:, ["volume", "s_nationkey", "o_year", "c_nationkey"]] - n1_filtered = nation_ds.loc[:, 
["n_nationkey", "n_regionkey"]] - n2_filtered = nation_ds.loc[:, ["n_nationkey", "n_name"]].rename( - columns={"n_name": "nation"} - ) - total = total.merge( - n1_filtered, left_on="c_nationkey", right_on="n_nationkey", how="inner" - ) - total = total.loc[:, ["volume", "s_nationkey", "o_year", "n_regionkey"]] - total = total.merge( - n2_filtered, left_on="s_nationkey", right_on="n_nationkey", how="inner" - ) - total = total.loc[:, ["volume", "o_year", "n_regionkey", "nation"]] - region_filtered = region_ds[(region_ds["r_name"] == "AMERICA")] - region_filtered = region_filtered.loc[:, ["r_regionkey"]] - total = total.merge( - region_filtered, left_on="n_regionkey", right_on="r_regionkey", how="inner" - ) - total = total.loc[:, ["volume", "o_year", "nation"]] + var1 = "BRAZIL" + var2 = "AMERICA" + var3 = "ECONOMY ANODIZED STEEL" + var4 = date(1995, 1, 1) + var5 = date(1996, 12, 31) + + n1 = nation_ds.loc[:, ["n_nationkey", "n_regionkey"]] + n2 = nation_ds.loc[:, ["n_nationkey", "n_name"]] + + jn1 = part_ds.merge(line_item_ds, left_on="p_partkey", right_on="l_partkey") + jn2 = jn1.merge(supplier_ds, left_on="l_suppkey", right_on="s_suppkey") + jn3 = jn2.merge(orders_ds, left_on="l_orderkey", right_on="o_orderkey") + jn4 = jn3.merge(customer_ds, left_on="o_custkey", right_on="c_custkey") + jn5 = jn4.merge(n1, left_on="c_nationkey", right_on="n_nationkey") + jn6 = jn5.merge(region_ds, left_on="n_regionkey", right_on="r_regionkey") + + jn6 = jn6[(jn6["r_name"] == var2)] + + jn7 = jn6.merge(n2, left_on="s_nationkey", right_on="n_nationkey") + + jn7 = jn7[(jn7["o_orderdate"] >= var4) & (jn7["o_orderdate"] <= var5)] + jn7 = jn7[jn7["p_type"] == var3] + + jn7["o_year"] = jn7["o_orderdate"].dt.year + jn7["volume"] = jn7["l_extendedprice"] * (1.0 - jn7["l_discount"]) + jn7 = jn7.rename({"n_name": "nation"}, axis="columns") def udf(df: pd.DataFrame) -> float: demonimator: float = df["volume"].sum() - df = df[df["nation"] == "BRAZIL"] + df = df[df["nation"] == var1] numerator: 
float = df["volume"].sum() return round(numerator / demonimator, 2) - total = total.groupby("o_year", as_index=False).apply(udf, include_groups=False) - total.columns = ["o_year", "mkt_share"] - total = total.sort_values(by=["o_year"], ascending=[True]) + gb = jn7.groupby("o_year", as_index=False) + agg = gb.apply(udf, include_groups=False) + agg.columns = ["o_year", "mkt_share"] + result_df = agg.sort_values("o_year") - return total + return result_df utils.run_query(Q_NUM, query) diff --git a/queries/pandas/q1.py b/queries/pandas/q1.py index f21a754..20c3f62 100644 --- a/queries/pandas/q1.py +++ b/queries/pandas/q1.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import date import pandas as pd diff --git a/queries/pandas/q2.py b/queries/pandas/q2.py index 7df0a18..05c59d6 100644 --- a/queries/pandas/q2.py +++ b/queries/pandas/q2.py @@ -1,7 +1,12 @@ -import pandas as pd +from __future__ import annotations + +from typing import TYPE_CHECKING from queries.pandas import utils +if TYPE_CHECKING: + import pandas as pd + Q_NUM = 2 diff --git a/queries/pandas/q3.py b/queries/pandas/q3.py index e582525..916b9ab 100644 --- a/queries/pandas/q3.py +++ b/queries/pandas/q3.py @@ -1,9 +1,13 @@ -from datetime import date +from __future__ import annotations -import pandas as pd +from datetime import date +from typing import TYPE_CHECKING from queries.pandas import utils +if TYPE_CHECKING: + import pandas as pd + Q_NUM = 3 diff --git a/queries/pandas/q4.py b/queries/pandas/q4.py index d517a27..efd2e92 100644 --- a/queries/pandas/q4.py +++ b/queries/pandas/q4.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import date import pandas as pd diff --git a/queries/pandas/q5.py b/queries/pandas/q5.py index f7c4a07..2da113c 100644 --- a/queries/pandas/q5.py +++ b/queries/pandas/q5.py @@ -1,9 +1,13 @@ -from datetime import date +from __future__ import annotations -import pandas as pd +from datetime import date +from typing import TYPE_CHECKING 
from queries.pandas import utils +if TYPE_CHECKING: + import pandas as pd + Q_NUM = 5 diff --git a/queries/pandas/q6.py b/queries/pandas/q6.py index 5565d6a..95a8fe8 100644 --- a/queries/pandas/q6.py +++ b/queries/pandas/q6.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import date import pandas as pd diff --git a/queries/pandas/q7.py b/queries/pandas/q7.py index 88fe749..a7b9ebb 100644 --- a/queries/pandas/q7.py +++ b/queries/pandas/q7.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import date import pandas as pd diff --git a/queries/pandas/q8.py b/queries/pandas/q8.py index 165c062..6c48cd5 100644 --- a/queries/pandas/q8.py +++ b/queries/pandas/q8.py @@ -1,9 +1,13 @@ -from datetime import date +from __future__ import annotations -import pandas as pd +from datetime import date +from typing import TYPE_CHECKING from queries.pandas import utils +if TYPE_CHECKING: + import pandas as pd + Q_NUM = 8