-
Notifications
You must be signed in to change notification settings - Fork 95
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
148 additions
and
0 deletions.
There are no files selected for viewing
122 changes: 122 additions & 0 deletions
122
scripts/data_preparation/Gaussian/generate_training_data.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
import os | ||
import sys | ||
import shutil | ||
import pickle | ||
import argparse | ||
|
||
import numpy as np | ||
|
||
# TODO: remove it when basicts can be installed by pip | ||
sys.path.append(os.path.abspath(__file__ + "/../../../..")) | ||
from basicts.data.transform import standard_transform | ||
|
||
|
||
def generate_data(args: argparse.Namespace):
    """Preprocess the raw series and generate train/valid/test datasets.

    Loads the raw ``.npy`` time series, builds sliding-window
    (history_start, history_end, future_end) index triples, normalizes the
    data with ``standard_transform``, and pickles both the index splits and
    the processed data into ``args.output_dir``.

    Args:
        args (argparse.Namespace): configurations of preprocessing.

    Raises:
        ValueError: if the series is too short for even one sample window.
    """
    target_channel = args.target_channel
    future_seq_len = args.future_seq_len
    history_seq_len = args.history_seq_len
    output_dir = args.output_dir
    train_ratio = args.train_ratio
    valid_ratio = args.valid_ratio
    data_file_path = args.data_file_path
    norm_each_channel = args.norm_each_channel
    # If evaluating on rescaled data. See
    # `basicts.runner.base_tsf_runner.BaseTimeSeriesForecastingRunner.build_train_dataset`
    # for details.
    if_rescale = not norm_each_channel

    # read data and keep only the selected channel(s)
    data = np.load(data_file_path)
    data = data[..., target_channel]
    print("raw time series shape: {0}".format(data.shape))

    # split data; assumes shape (time, node, feature) — TODO confirm against raw file.
    # Renamed from `l, n, f`: `l` is ambiguous (E741) and `f` was later
    # shadowed by the file handles below.
    seq_len, _num_nodes, _num_feats = data.shape
    # each sample consumes history_seq_len + future_seq_len consecutive steps
    num_samples = seq_len - (history_seq_len + future_seq_len) + 1
    if num_samples <= 0:
        raise ValueError(
            "series of length {0} is too short for history {1} + future {2}".format(
                seq_len, history_seq_len, future_seq_len))
    train_num = round(num_samples * train_ratio)
    valid_num = round(num_samples * valid_ratio)
    test_num = num_samples - train_num - valid_num
    print("number of training samples:{0}".format(train_num))
    print("number of validation samples:{0}".format(valid_num))
    print("number of test samples:{0}".format(test_num))

    # (history_start, history_end, future_end) triple per sample
    index_list = [(t - history_seq_len, t, t + future_seq_len)
                  for t in range(history_seq_len, num_samples + history_seq_len)]

    # contiguous chronological split: train | valid | test
    train_index = index_list[:train_num]
    valid_index = index_list[train_num: train_num + valid_num]
    test_index = index_list[train_num + valid_num: train_num + valid_num + test_num]

    # normalize data; the scaler fits on the training slice only
    scaler = standard_transform
    data_norm = scaler(data, output_dir, train_index, history_seq_len,
                       future_seq_len, norm_each_channel=norm_each_channel)

    # add temporal feature (none for this dataset; kept for pipeline symmetry)
    feature_list = [data_norm]

    processed_data = np.concatenate(feature_list, axis=-1)

    # save index splits and processed data
    index = {"train": train_index, "valid": valid_index, "test": test_index}
    index_path = os.path.join(
        output_dir,
        "index_in_{0}_out_{1}_rescale_{2}.pkl".format(history_seq_len, future_seq_len, if_rescale))
    with open(index_path, "wb") as index_file:
        pickle.dump(index, index_file)

    data_path = os.path.join(
        output_dir,
        "data_in_{0}_out_{1}_rescale_{2}.pkl".format(history_seq_len, future_seq_len, if_rescale))
    with open(data_path, "wb") as data_file:
        pickle.dump({"processed_data": processed_data}, data_file)
|
||
|
||
if __name__ == "__main__":
    # sliding window size for generating history sequence and target sequence
    HISTORY_SEQ_LEN = 96
    FUTURE_SEQ_LEN = 96

    TRAIN_RATIO = 0.6
    VALID_RATIO = 0.2
    TARGET_CHANNEL = [0]  # target channel(s)

    DATASET_NAME = "Gaussian"

    OUTPUT_DIR = "datasets/" + DATASET_NAME
    DATA_FILE_PATH = "datasets/raw_data/{0}/{0}.npy".format(DATASET_NAME)

    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str,
                        default=OUTPUT_DIR, help="Output directory.")
    parser.add_argument("--data_file_path", type=str,
                        default=DATA_FILE_PATH, help="Raw traffic readings.")
    parser.add_argument("--history_seq_len", type=int,
                        default=HISTORY_SEQ_LEN, help="History sequence length.")
    parser.add_argument("--future_seq_len", type=int,
                        default=FUTURE_SEQ_LEN, help="Future sequence length.")
    # BUG FIX: `type=list` splits a string argument into individual characters
    # (argparse calls list("01") -> ['0', '1']); nargs="+" with type=int
    # parses "--target_channel 0 1" correctly. Default is unchanged.
    parser.add_argument("--target_channel", type=int, nargs="+",
                        default=TARGET_CHANNEL, help="Selected channels.")
    parser.add_argument("--train_ratio", type=float,
                        default=TRAIN_RATIO, help="Train ratio.")
    parser.add_argument("--valid_ratio", type=float,
                        default=VALID_RATIO, help="Validation ratio.")
    # BUG FIX: was `type=float` with a copy-pasted "Validate ratio." help
    # text; this is a boolean switch (and is unconditionally overwritten
    # below in any case).
    parser.add_argument("--norm_each_channel", action="store_true",
                        help="Normalize each channel independently.")
    args = parser.parse_args()

    # print args
    print("-"*(20+45+5))
    for key, value in sorted(vars(args).items()):
        print("|{0:>20} = {1:<45}|".format(key, str(value)))
    print("-"*(20+45+5))

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(args.output_dir, exist_ok=True)
    # generate both variants: per-channel and global normalization
    args.norm_each_channel = True
    generate_data(args)
    args.norm_each_channel = False
    generate_data(args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import os | ||
import sys | ||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
# Resolve the repository root (four directory levels above this script) and
# make it the working directory so the relative "datasets/..." paths used
# below resolve regardless of where the script is launched from.
PROJECT_DIR = os.path.abspath(__file__ + "/../../../..")
os.chdir(PROJECT_DIR)
||
def generate_gaussian_noise_sequence(duration):
    """Generate a standard Gaussian (mean 0, std 1) noise sequence.

    Args:
        duration: number of time steps to generate.

    Returns:
        tuple: (time_points, noise) — integer time axis 0..duration-1 and the
        sampled noise values, both 1-D arrays of length ``duration``.
    """
    time_axis = np.arange(duration)
    noise = np.random.normal(loc=0, scale=1, size=duration)
    return time_axis, noise
|
||
# hyper parameters
duration = 10000  # time series length

# generate gaussian sequence (non-seeded: output differs run to run)
time_points, gaussian_noise_sequence = generate_gaussian_noise_sequence(duration)

# save sequence with shape (duration, 1, 1) — (time, node, feature).
# IMPROVED: the original round-tripped through torch.Tensor purely to add two
# trailing axes, pulling in a heavyweight dependency; torch.Tensor also
# silently casts to float32, so keep that dtype to produce an identical file.
data = gaussian_noise_sequence.astype(np.float32)[:, np.newaxis, np.newaxis]
# exist_ok avoids the check-then-create race of os.path.exists + makedirs
os.makedirs('datasets/raw_data/Gaussian', exist_ok=True)
np.save('datasets/raw_data/Gaussian/Gaussian.npy', data)