Generate raw data #54

Open · wants to merge 7 commits into base: main

176 changes: 176 additions & 0 deletions data/generate_raw_data.py
@@ -0,0 +1,176 @@
import os
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random


def generate_random_transactions(
    users_df: pd.DataFrame, max_transactions: int = 11, max_days_back: int = 365
) -> pd.DataFrame:
# Predefined lists of categories and locations
transaction_categories = [
"Groceries",
"Utilities",
"Entertainment",
"Dining",
"Travel",
"Health",
"Education",
"Shopping",
"Automotive",
"Rent",
]
cities_and_states = [
("New York", "NY"),
("Los Angeles", "CA"),
("Chicago", "IL"),
("Houston", "TX"),
("Phoenix", "AZ"),
("Philadelphia", "PA"),
("San Antonio", "TX"),
("San Diego", "CA"),
("Dallas", "TX"),
("San Jose", "CA"),
]
    transactions_list = []
    total_users = users_df.shape[0]
    # Guard against total_users < 10, which would make batch 0 and raise
    # ZeroDivisionError in the progress check below.
    batch = max(total_users // 10, 1)

for i, row in users_df.iterrows():
num_transactions = np.random.randint(1, max_transactions)
for j in range(num_transactions):
            # Random date between 10 and max_days_back (default 365) days ago
random_days = np.random.randint(10, max_days_back)
date_of_transaction = datetime.now() - timedelta(days=random_days)
city, state = random.choice(cities_and_states)
if j == (num_transactions - 1):
date_of_transaction = row["created"]

transactions_list.append(
{
"user_id": row["user_id"],
"created": date_of_transaction,
"updated": date_of_transaction,
"date_of_transaction": date_of_transaction,
"transaction_amount": round(np.random.uniform(10, 1000), 2),
"transaction_category": random.choice(transaction_categories),
"card_token": str(uuid.uuid4()),
"city": city,
"state": state,
}
)
if (i % batch) == 0:
formatted_i = f"{i:,}"
percent_complete = i / total_users * 100
print(
f"{formatted_i:>{len(f'{total_users:,}')}} of {total_users:,} "
f"({percent_complete:.0f}%) complete"
)

return pd.DataFrame(transactions_list)
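
A minimal usage sketch for the function above (the two-row users frame is hypothetical; the function only reads the "user_id" and "created" columns plus the row count, and the batch guard above keeps the progress print from dividing by zero):

demo_users = pd.DataFrame(
    {
        "user_id": ["user_0", "user_1"],
        "created": [pd.Timestamp.now(), pd.Timestamp.now()],
    }
)
demo_txns = generate_random_transactions(demo_users, max_transactions=3)
print(demo_txns[["user_id", "date_of_transaction", "transaction_amount"]])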


def calculate_point_in_time_features(
    label_dataset: pd.DataFrame, transactions_df: pd.DataFrame
) -> pd.DataFrame:
label_dataset["created"] = pd.to_datetime(label_dataset["created"])
transactions_df["transaction_timestamp"] = pd.to_datetime(
transactions_df["date_of_transaction"]
)

# Get all transactions before the created time
transactions_before = pd.merge(
label_dataset[["user_id", "created"]], transactions_df, on="user_id"
)
transactions_before = transactions_before[
transactions_before["transaction_timestamp"] < transactions_before["created_x"]
]
transactions_before["days_between_transactions"] = (
transactions_before["transaction_timestamp"] - transactions_before["created_x"]
    ).dt.days

Review comment (Contributor): This currently generates negative values, causing the "days_since_last_transaction" and "days_since_first_transaction" features to get mixed up. Suggested change:

    transactions_before["transaction_timestamp"] - transactions_before["created_x"]

becomes

    abs(transactions_before["transaction_timestamp"] - transactions_before["created_x"])
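
If that suggestion is applied, the whole statement would read as below (a sketch of the reviewer's fix, not the merged code; Series.abs() on a timedelta column is equivalent to the suggested abs() call):

    transactions_before["days_between_transactions"] = (
        transactions_before["transaction_timestamp"]
        - transactions_before["created_x"]
    ).abs().dt.days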

# Group by user_id and created to calculate features
features = (
transactions_before.groupby(["user_id", "created_x"])
.agg(
num_prev_transactions=("transaction_amount", "count"),
avg_prev_transaction_amount=("transaction_amount", "mean"),
max_prev_transaction_amount=("transaction_amount", "max"),
stdv_prev_transaction_amount=("transaction_amount", "std"),
days_since_last_transaction=("days_between_transactions", "min"),
days_since_first_transaction=("days_between_transactions", "max"),
)
.reset_index()
.fillna(0)
    )

Review comment (Contributor): Suggested change: remove the .fillna(0) from the chain above.

Review comment (Contributor @RHRolun, Feb 11, 2025): This still leaves NaNs in the dataframe; applying fillna(0) on the final_df fixes this issue.

final_df = (
pd.merge(
label_dataset,
features,
left_on=["user_id", "created"],
right_on=["user_id", "created_x"],
how="left",
)
.reset_index(drop=True)
.drop("created_x", axis=1)
)

Review comment (Contributor): Suggested change, before the return:

    final_df = final_df.fillna(0)
return final_df
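
To illustrate the NaN discussion above, a standalone sketch (hypothetical two-user frame, not from the PR): the std aggregate is NaN for a user with a single prior transaction, and the left join leaves NaN in every feature column for a user with no prior transactions at all; a fillna inside the groupby chain catches the first but not the second, which is why the reviewers move it to final_df.

labels = pd.DataFrame({"user_id": ["user_a", "user_b"]})
txns = pd.DataFrame({"user_id": ["user_a"], "transaction_amount": [42.0]})
feats = (
    txns.groupby("user_id")
    .agg(stdv_prev_transaction_amount=("transaction_amount", "std"))
    .reset_index()
)
merged = labels.merge(feats, on="user_id", how="left")
print(merged)            # NaN std for user_a; NaN features for user_b
print(merged.fillna(0))  # one fillna on the final frame clears both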


def main():
print("loading data...")
script_dir = os.path.dirname(os.path.abspath(__file__))
train = pd.read_csv(os.path.join(script_dir, "train.csv"))
test = pd.read_csv(os.path.join(script_dir, "test.csv"))
valid = pd.read_csv(os.path.join(script_dir, "validate.csv"))
train["set"] = "train"
test["set"] = "test"
valid["set"] = "valid"

df = pd.concat([train, test, valid], axis=0).reset_index(drop=True)

df["user_id"] = [f"user_{i}" for i in range(df.shape[0])]
df["transaction_id"] = [f"txn_{i}" for i in range(df.shape[0])]
Review comment (Contributor): This transaction_id seems not to be used again later and is not part of the output parquets. Should it be added as a column in transactions_list in generate_random_transactions?
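
If that suggestion is taken up, a hedged sketch of the shape the change could take (hypothetical, not in the PR; the "transaction_id" key mirrors the unused column above, and uuid/pd are the script's own imports):

# Hypothetical: mint a per-row id while building the rows in
# generate_random_transactions, mirroring the card_token pattern.
rows = [
    {"transaction_id": f"txn_{uuid.uuid4()}", "transaction_amount": 42.0}
    for _ in range(3)
]
print(pd.DataFrame(rows))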


for date_col in ["created", "updated"]:
df[date_col] = pd.Timestamp.now()

label_dataset = pd.DataFrame(
df[
[
"user_id",
"fraud",
"created",
"updated",
"set",
"distance_from_home",
"distance_from_last_transaction",
"ratio_to_median_purchase_price",
]
]
)

print("generating transaction level data...")
user_purchase_history = generate_random_transactions(
users_df=df[df["repeat_retailer"] == 1].reset_index(drop=True),
        max_transactions=5,
        max_days_back=365,
    )

Review comment (Contributor): I'm curious: why are we only using data from users who have purchased multiple times from the same retailer?
user_purchase_history.to_parquet(
os.path.join(script_dir, "raw_transaction_datasource.parquet")
)
print("calculating point in time features...")
finaldf = calculate_point_in_time_features(label_dataset, user_purchase_history)
print("merging final dataset...")
finaldf = finaldf.merge(
df[["user_id", "created", "used_chip", "used_pin_number", "online_order"]],
on=["user_id", "created"],
)
finaldf.to_parquet(os.path.join(script_dir, "final_data.parquet"))
print("...data processing complete.")


if __name__ == "__main__":
main()
Review comment (Contributor): Great PR! Just so I understand this correctly: do you think this should come in as its own data prep section, or should the parquet files this code produces exist ahead of time and just be used during training/inference?