From 0c60982ec46b50fe8c2b17bd778effa308a55ada Mon Sep 17 00:00:00 2001 From: blisky-li <2621142272@qq.com> Date: Thu, 26 Sep 2024 20:47:49 +0800 Subject: [PATCH] Add more baselines (#148) --- baselines/ETSformer/Electricity.py | 154 +++++++ baselines/ETSformer/arch/__init__.py | 3 + baselines/ETSformer/arch/decoder.py | 84 ++++ baselines/ETSformer/arch/encoder.py | 227 ++++++++++ baselines/ETSformer/arch/etsformer_arch.py | 116 +++++ .../ETSformer/arch/exponential_smoothing.py | 68 +++ baselines/ETSformer/arch/modules.py | 32 ++ baselines/FiLM/Electricity.py | 155 +++++++ baselines/FiLM/arch/__init__.py | 1 + baselines/FiLM/arch/film_arch.py | 154 +++++++ baselines/Koopa/Electricity.py | 152 +++++++ baselines/Koopa/arch/__init__.py | 2 + baselines/Koopa/arch/koopa_arch.py | 341 ++++++++++++++ baselines/LightTS/Electricity.py | 157 +++++++ baselines/LightTS/arch/__init__.py | 1 + baselines/LightTS/arch/lightts_arch.py | 135 ++++++ baselines/MTSMixer/Electricity.py | 164 +++++++ baselines/MTSMixer/arch/Invertible.py | 105 +++++ baselines/MTSMixer/arch/Projection.py | 25 ++ baselines/MTSMixer/arch/__init__.py | 1 + baselines/MTSMixer/arch/decomposition.py | 58 +++ baselines/MTSMixer/arch/mtsmixer_arch.py | 128 ++++++ .../Nonstationary_Transformer/Electricity.py | 157 +++++++ .../Nonstationary_Transformer/arch/Embed.py | 132 ++++++ .../arch/SelfAttention_Family.py | 172 +++++++ .../arch/Transformer_EncDec.py | 143 ++++++ .../arch/__init__.py | 1 + .../Nonstationary_Transformer/arch/masking.py | 26 ++ .../arch/nstransformer_arch.py | 161 +++++++ baselines/SegRNN/Electricity.py | 152 +++++++ baselines/SegRNN/arch/Autoformer_EncDec.py | 203 +++++++++ baselines/SegRNN/arch/__init__.py | 1 + baselines/SegRNN/arch/segrnn_arch.py | 83 ++++ baselines/SparseTSF/Electricity.py | 158 +++++++ baselines/SparseTSF/arch/Embed.py | 234 ++++++++++ baselines/SparseTSF/arch/__init__.py | 1 + baselines/SparseTSF/arch/sparsetsf_arch.py | 46 ++ baselines/TiDE/Electricity.py | 153 +++++++ baselines/TiDE/arch/__init__.py | 1 + baselines/TiDE/arch/tide_arch.py | 118 +++++ baselines/TimeMixer/Electricity.py | 162 +++++++ baselines/TimeMixer/arch/Autoformer_EncDec.py | 203 +++++++++ baselines/TimeMixer/arch/Embed.py | 234 ++++++++++ baselines/TimeMixer/arch/StandardNorm.py | 68 +++ baselines/TimeMixer/arch/__init__.py | 1 + baselines/TimeMixer/arch/timemixer_arch.py | 419 ++++++++++++++++++ baselines/UMixer/Electricity.py | 159 +++++++ baselines/UMixer/arch/Embed.py | 230 ++++++++++ baselines/UMixer/arch/RevIN.py | 103 +++++ baselines/UMixer/arch/__init__.py | 1 + baselines/UMixer/arch/umixer_arch.py | 234 ++++++++++ baselines/iTransformer/Electricity.py | 158 +++++++ baselines/iTransformer/arch/Embed.py | 143 ++++++ .../iTransformer/arch/SelfAttention_Family.py | 302 +++++++++++++ .../iTransformer/arch/Transformer_EncDec.py | 134 ++++++ baselines/iTransformer/arch/__init__.py | 1 + .../iTransformer/arch/itransformer_arch.py | 108 +++++ baselines/iTransformer/arch/masking.py | 26 ++ 58 files changed, 6891 insertions(+) create mode 100644 baselines/ETSformer/Electricity.py create mode 100644 baselines/ETSformer/arch/__init__.py create mode 100644 baselines/ETSformer/arch/decoder.py create mode 100644 baselines/ETSformer/arch/encoder.py create mode 100644 baselines/ETSformer/arch/etsformer_arch.py create mode 100644 baselines/ETSformer/arch/exponential_smoothing.py create mode 100644 baselines/ETSformer/arch/modules.py create mode 100644 baselines/FiLM/Electricity.py create mode 100644 baselines/FiLM/arch/__init__.py 
create mode 100644 baselines/FiLM/arch/film_arch.py create mode 100644 baselines/Koopa/Electricity.py create mode 100644 baselines/Koopa/arch/__init__.py create mode 100644 baselines/Koopa/arch/koopa_arch.py create mode 100644 baselines/LightTS/Electricity.py create mode 100644 baselines/LightTS/arch/__init__.py create mode 100644 baselines/LightTS/arch/lightts_arch.py create mode 100644 baselines/MTSMixer/Electricity.py create mode 100644 baselines/MTSMixer/arch/Invertible.py create mode 100644 baselines/MTSMixer/arch/Projection.py create mode 100644 baselines/MTSMixer/arch/__init__.py create mode 100644 baselines/MTSMixer/arch/decomposition.py create mode 100644 baselines/MTSMixer/arch/mtsmixer_arch.py create mode 100644 baselines/Nonstationary_Transformer/Electricity.py create mode 100644 baselines/Nonstationary_Transformer/arch/Embed.py create mode 100644 baselines/Nonstationary_Transformer/arch/SelfAttention_Family.py create mode 100644 baselines/Nonstationary_Transformer/arch/Transformer_EncDec.py create mode 100644 baselines/Nonstationary_Transformer/arch/__init__.py create mode 100644 baselines/Nonstationary_Transformer/arch/masking.py create mode 100644 baselines/Nonstationary_Transformer/arch/nstransformer_arch.py create mode 100644 baselines/SegRNN/Electricity.py create mode 100644 baselines/SegRNN/arch/Autoformer_EncDec.py create mode 100644 baselines/SegRNN/arch/__init__.py create mode 100644 baselines/SegRNN/arch/segrnn_arch.py create mode 100644 baselines/SparseTSF/Electricity.py create mode 100644 baselines/SparseTSF/arch/Embed.py create mode 100644 baselines/SparseTSF/arch/__init__.py create mode 100644 baselines/SparseTSF/arch/sparsetsf_arch.py create mode 100644 baselines/TiDE/Electricity.py create mode 100644 baselines/TiDE/arch/__init__.py create mode 100644 baselines/TiDE/arch/tide_arch.py create mode 100644 baselines/TimeMixer/Electricity.py create mode 100644 baselines/TimeMixer/arch/Autoformer_EncDec.py create mode 100644 baselines/TimeMixer/arch/Embed.py create mode 100644 baselines/TimeMixer/arch/StandardNorm.py create mode 100644 baselines/TimeMixer/arch/__init__.py create mode 100644 baselines/TimeMixer/arch/timemixer_arch.py create mode 100644 baselines/UMixer/Electricity.py create mode 100644 baselines/UMixer/arch/Embed.py create mode 100644 baselines/UMixer/arch/RevIN.py create mode 100644 baselines/UMixer/arch/__init__.py create mode 100644 baselines/UMixer/arch/umixer_arch.py create mode 100644 baselines/iTransformer/Electricity.py create mode 100644 baselines/iTransformer/arch/Embed.py create mode 100644 baselines/iTransformer/arch/SelfAttention_Family.py create mode 100644 baselines/iTransformer/arch/Transformer_EncDec.py create mode 100644 baselines/iTransformer/arch/__init__.py create mode 100644 baselines/iTransformer/arch/itransformer_arch.py create mode 100644 baselines/iTransformer/arch/masking.py diff --git a/baselines/ETSformer/Electricity.py b/baselines/ETSformer/Electricity.py new file mode 100644 index 0000000..304b240 --- /dev/null +++ b/baselines/ETSformer/Electricity.py @@ -0,0 +1,154 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import ETSformer + +############################## Hot Parameters 
############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = ETSformer +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 3, # attn factor + "d_model": 512, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 2, # num of decoder layers + "d_ff": 2048, + "K": 3, + "sigma" : 0.2, + "dropout": 0.2, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "sigmoid", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings 
+CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/ETSformer/arch/__init__.py b/baselines/ETSformer/arch/__init__.py new file mode 100644 index 0000000..e76c95d --- /dev/null +++ b/baselines/ETSformer/arch/__init__.py @@ -0,0 +1,3 @@ +from .etsformer_arch import ETSformer + +__all__ = ["ETSformer"] diff --git a/baselines/ETSformer/arch/decoder.py b/baselines/ETSformer/arch/decoder.py new file mode 100644 index 0000000..61496da --- /dev/null +++ b/baselines/ETSformer/arch/decoder.py @@ -0,0 +1,84 @@ +import torch +import torch.nn as nn +from einops import rearrange, reduce, repeat + + +class DampingLayer(nn.Module): + + def __init__(self, pred_len, nhead, dropout=0.1, output_attention=False): + super().__init__() + self.pred_len = pred_len + self.nhead = nhead + self.output_attention = output_attention + self._damping_factor = nn.Parameter(torch.randn(1, nhead)) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + x = repeat(x, 'b 1 d -> b t d', t=self.pred_len) + b, t, d = x.shape + + powers = torch.arange(self.pred_len).to(self._damping_factor.device) + 1 + powers = powers.view(self.pred_len, 1) + damping_factors = self.damping_factor ** powers + damping_factors = damping_factors.cumsum(dim=0) + x = x.view(b, t, self.nhead, -1) + x = self.dropout(x) * damping_factors.unsqueeze(-1) + x = x.view(b, t, d) + if self.output_attention: + return x, damping_factors + return x, None + + @property + def damping_factor(self): + return torch.sigmoid(self._damping_factor) + + +class DecoderLayer(nn.Module): + + def __init__(self, d_model, nhead, c_out, pred_len, dropout=0.1, output_attention=False): + super().__init__() + self.d_model = d_model + self.nhead = nhead + self.c_out = c_out + self.pred_len = pred_len + self.output_attention = output_attention + + self.growth_damping = DampingLayer(pred_len, nhead, dropout=dropout, output_attention=output_attention) + self.dropout1 = nn.Dropout(dropout) + + def forward(self, growth, season): + growth_horizon, growth_damping = self.growth_damping(growth[:, -1:]) + growth_horizon = self.dropout1(growth_horizon) + + seasonal_horizon = season[:, -self.pred_len:] + + if self.output_attention: + return growth_horizon, seasonal_horizon, growth_damping + return growth_horizon, seasonal_horizon, None + + +class Decoder(nn.Module): + + def __init__(self, layers): + super().__init__() + self.d_model = layers[0].d_model + self.c_out = layers[0].c_out + self.pred_len = 
layers[0].pred_len + self.nhead = layers[0].nhead + + self.layers = nn.ModuleList(layers) + self.pred = nn.Linear(self.d_model, self.c_out) + + def forward(self, growths, seasons): + growth_repr = [] + season_repr = [] + growth_dampings = [] + + for idx, layer in enumerate(self.layers): + growth_horizon, season_horizon, growth_damping = layer(growths[idx], seasons[idx]) + growth_repr.append(growth_horizon) + season_repr.append(season_horizon) + growth_dampings.append(growth_damping) + growth_repr = sum(growth_repr) + season_repr = sum(season_repr) + return self.pred(growth_repr), self.pred(season_repr), growth_dampings diff --git a/baselines/ETSformer/arch/encoder.py b/baselines/ETSformer/arch/encoder.py new file mode 100644 index 0000000..9c30eb7 --- /dev/null +++ b/baselines/ETSformer/arch/encoder.py @@ -0,0 +1,227 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.fft as fft + +import numpy as np +from einops import rearrange, reduce, repeat +import math, random + +from .modules import Feedforward +from .exponential_smoothing import ExponentialSmoothing + + +class GrowthLayer(nn.Module): + + def __init__(self, d_model, nhead, d_head=None, dropout=0.1, output_attention=False): + super().__init__() + self.d_head = d_head or (d_model // nhead) + self.d_model = d_model + self.nhead = nhead + self.output_attention = output_attention + + self.z0 = nn.Parameter(torch.randn(self.nhead, self.d_head)) + self.in_proj = nn.Linear(self.d_model, self.d_head * self.nhead) + self.es = ExponentialSmoothing(self.d_head, self.nhead, dropout=dropout) + self.out_proj = nn.Linear(self.d_head * self.nhead, self.d_model) + + assert self.d_head * self.nhead == self.d_model, "d_model must be divisible by nhead" + + def forward(self, inputs): + """ + :param inputs: shape: (batch, seq_len, dim) + :return: shape: (batch, seq_len, dim) + """ + b, t, d = inputs.shape + values = self.in_proj(inputs).view(b, t, self.nhead, -1) + values = torch.cat([repeat(self.z0, 'h d -> b 1 h d', b=b), values], dim=1) + values = values[:, 1:] - values[:, :-1] + out = self.es(values) + out = torch.cat([repeat(self.es.v0, '1 1 h d -> b 1 h d', b=b), out], dim=1) + out = rearrange(out, 'b t h d -> b t (h d)') + out = self.out_proj(out) + + if self.output_attention: + return out, self.es.get_exponential_weight(t)[1] + return out, None + + +class FourierLayer(nn.Module): + + def __init__(self, d_model, pred_len, k=None, low_freq=1, output_attention=False): + super().__init__() + self.d_model = d_model + self.pred_len = pred_len + self.k = k + self.low_freq = low_freq + self.output_attention = output_attention + + def forward(self, x): + """x: (b, t, d)""" + + if self.output_attention: + return self.dft_forward(x) + + b, t, d = x.shape + x_freq = fft.rfft(x, dim=1) + + if t % 2 == 0: + x_freq = x_freq[:, self.low_freq:-1] + f = fft.rfftfreq(t)[self.low_freq:-1] + else: + x_freq = x_freq[:, self.low_freq:] + f = fft.rfftfreq(t)[self.low_freq:] + + x_freq, index_tuple = self.topk_freq(x_freq) + + index_tuple = tuple(t.to(x_freq.device) for t in index_tuple) + + + f = repeat(f, 'f -> b f d', b=x_freq.size(0), d=x_freq.size(2)).to(x_freq.device) + f = rearrange(f[index_tuple], 'b f d -> b f () d').to(x_freq.device) + + return self.extrapolate(x_freq, f, t), None + + def extrapolate(self, x_freq, f, t): + x_freq = torch.cat([x_freq, x_freq.conj()], dim=1) + f = torch.cat([f, -f], dim=1) + t_val = rearrange(torch.arange(t + self.pred_len, dtype=torch.float), + 't -> () () t ()').to(x_freq.device) + + amp = 
rearrange(x_freq.abs() / t, 'b f d -> b f () d') + phase = rearrange(x_freq.angle(), 'b f d -> b f () d') + + x_time = amp * torch.cos(2 * math.pi * f * t_val + phase) + + return reduce(x_time, 'b f t d -> b t d', 'sum') + + def topk_freq(self, x_freq): + values, indices = torch.topk(x_freq.abs(), self.k, dim=1, largest=True, sorted=True) + mesh_a, mesh_b = torch.meshgrid(torch.arange(x_freq.size(0)), torch.arange(x_freq.size(2))) + index_tuple = (mesh_a.unsqueeze(1), indices, mesh_b.unsqueeze(1)) + x_freq = x_freq[index_tuple] + + return x_freq, index_tuple + + def dft_forward(self, x): + T = x.size(1) + + dft_mat = fft.fft(torch.eye(T)) + i, j = torch.meshgrid(torch.arange(self.pred_len + T), torch.arange(T)) + omega = np.exp(2 * math.pi * 1j / T) + idft_mat = (np.power(omega, i * j) / T).cfloat() + + x_freq = torch.einsum('ft,btd->bfd', [dft_mat, x.cfloat()]) + + if T % 2 == 0: + x_freq = x_freq[:, self.low_freq:T // 2] + else: + x_freq = x_freq[:, self.low_freq:T // 2 + 1] + + _, indices = torch.topk(x_freq.abs(), self.k, dim=1, largest=True, sorted=True) + indices = indices + self.low_freq + indices = torch.cat([indices, -indices], dim=1) + + dft_mat = repeat(dft_mat, 'f t -> b f t d', b=x.shape[0], d=x.shape[-1]) + idft_mat = repeat(idft_mat, 't f -> b t f d', b=x.shape[0], d=x.shape[-1]) + + mesh_a, mesh_b = torch.meshgrid(torch.arange(x.size(0)), torch.arange(x.size(2))) + + dft_mask = torch.zeros_like(dft_mat) + dft_mask[mesh_a, indices, :, mesh_b] = 1 + dft_mat = dft_mat * dft_mask + + idft_mask = torch.zeros_like(idft_mat) + idft_mask[mesh_a, :, indices, mesh_b] = 1 + idft_mat = idft_mat * idft_mask + + attn = torch.einsum('bofd,bftd->botd', [idft_mat, dft_mat]).real + return torch.einsum('botd,btd->bod', [attn, x]), rearrange(attn, 'b o t d -> b d o t') + + +class LevelLayer(nn.Module): + + def __init__(self, d_model, c_out, dropout=0.1): + super().__init__() + self.d_model = d_model + self.c_out = c_out + + self.es = ExponentialSmoothing(1, self.c_out, dropout=dropout, aux=True) + self.growth_pred = nn.Linear(self.d_model, self.c_out) + self.season_pred = nn.Linear(self.d_model, self.c_out) + + def forward(self, level, growth, season): + b, t, _ = level.shape + growth = self.growth_pred(growth).view(b, t, self.c_out, 1) + season = self.season_pred(season).view(b, t, self.c_out, 1) + growth = growth.view(b, t, self.c_out, 1) + season = season.view(b, t, self.c_out, 1) + level = level.view(b, t, self.c_out, 1) + out = self.es(level - season, aux_values=growth) + out = rearrange(out, 'b t h d -> b t (h d)') + return out + +class EncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, c_out, seq_len, pred_len, k, dim_feedforward=None, dropout=0.1, + activation='sigmoid', layer_norm_eps=1e-5, output_attention=False): + super().__init__() + self.d_model = d_model + self.nhead = nhead + self.c_out = c_out + self.seq_len = seq_len + self.pred_len = pred_len + dim_feedforward = dim_feedforward or 4 * d_model + self.dim_feedforward = dim_feedforward + + self.growth_layer = GrowthLayer(d_model, nhead, dropout=dropout, output_attention=output_attention) + self.seasonal_layer = FourierLayer(d_model, pred_len, k=k, output_attention=output_attention) + self.level_layer = LevelLayer(d_model, c_out, dropout=dropout) + + # Implementation of Feedforward model + self.ff = Feedforward(d_model, dim_feedforward, dropout=dropout, activation=activation) + self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps) + self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps) + + self.dropout1 = 
nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + def forward(self, res, level, attn_mask=None): + season, season_attn = self._season_block(res) + res = res - season[:, :-self.pred_len] + growth, growth_attn = self._growth_block(res) + res = self.norm1(res - growth[:, 1:]) + res = self.norm2(res + self.ff(res)) + + level = self.level_layer(level, growth[:, :-1], season[:, :-self.pred_len]) + + return res, level, growth, season, season_attn, growth_attn + + def _growth_block(self, x): + x, growth_attn = self.growth_layer(x) + return self.dropout1(x), growth_attn + + def _season_block(self, x): + x, season_attn = self.seasonal_layer(x) + return self.dropout2(x), season_attn + + +class Encoder(nn.Module): + + def __init__(self, layers): + super().__init__() + self.layers = nn.ModuleList(layers) + + def forward(self, res, level, attn_mask=None): + growths = [] + seasons = [] + season_attns = [] + growth_attns = [] + for layer in self.layers: + res, level, growth, season, season_attn, growth_attn = layer(res, level, attn_mask=None) + growths.append(growth) + seasons.append(season) + season_attns.append(season_attn) + growth_attns.append(growth_attn) + + return level, growths, seasons, season_attns, growth_attns diff --git a/baselines/ETSformer/arch/etsformer_arch.py b/baselines/ETSformer/arch/etsformer_arch.py new file mode 100644 index 0000000..217efbb --- /dev/null +++ b/baselines/ETSformer/arch/etsformer_arch.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +from einops import reduce + +from .modules import ETSEmbedding +from .encoder import EncoderLayer, Encoder +from .decoder import DecoderLayer, Decoder + + +class Transform: + def __init__(self, sigma): + self.sigma = sigma + + @torch.no_grad() + def transform(self, x): + return self.jitter(self.shift(self.scale(x))) + + def jitter(self, x): + return x + (torch.randn(x.shape).to(x.device) * self.sigma) + + def scale(self, x): + return x * (torch.randn(x.size(-1)).to(x.device) * self.sigma + 1) + + def shift(self, x): + return x + (torch.randn(x.size(-1)).to(x.device) * self.sigma) + + +class ETSformer(nn.Module): + + def __init__(self, **model_args): + super().__init__() + + + self.seq_len = model_args['seq_len'] + self.pred_len = model_args['pred_len'] + self.e_layers = model_args['e_layers'] + self.d_layers = model_args['d_layers'] + self.enc_in = model_args['enc_in'] + self.d_model = model_args['d_model'] + self.dropout = model_args['dropout'] + self.n_head = model_args['n_heads'] + self.c_out = model_args['c_out'] + self.K = model_args['K'] + self.d_ff = model_args['d_ff'] + self.sigma = model_args['sigma'] + self.activation = model_args['activation'] + self.output_attention = model_args['output_attention'] + + assert self.e_layers == self.d_layers, "Encoder and decoder layers must be equal" + + # Embedding + self.enc_embedding = ETSEmbedding(self.enc_in, self.d_model, dropout=self.dropout) + + # Encoder + self.encoder = Encoder( + [ + EncoderLayer( + self.d_model, self.n_head, self.c_out, self.seq_len, self.pred_len, self.K, + dim_feedforward=self.d_ff, + dropout=self.dropout, + activation=self.activation, + output_attention=self.output_attention, + ) for _ in range(self.e_layers) + ] + ) + + # Decoder + self.decoder = Decoder( + [ + DecoderLayer( + self.d_model, self.n_head, self.c_out, self.pred_len, + dropout=self.dropout, + output_attention=self.output_attention, + ) for _ in range(self.d_layers) + ], + ) + + self.transform = Transform(sigma=self.sigma) + + def forward(self, history_data: torch.Tensor, 
future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + enc_self_mask=None, + decomposed=False, attention=False): + """ + Args: + history_data (Tensor): Input data with shape: [B, L1, N, C] + future_data (Tensor): Future data with shape: [B, L2, N, C] + + Returns: + torch.Tensor: outputs with shape [B, L2, N, 1] + """ + x_enc = history_data[:,:,:,0] + with torch.no_grad(): + if self.training: + x_enc = self.transform.transform(x_enc) + res = self.enc_embedding(x_enc) + level, growths, seasons, season_attns, growth_attns = self.encoder(res, x_enc, attn_mask=enc_self_mask) + + growth, season, growth_dampings = self.decoder(growths, seasons) + + if decomposed: + return level[:, -1:], growth, season + + preds = level[:, -1:] + growth + season + + if attention: + decoder_growth_attns = [] + for growth_attn, growth_damping in zip(growth_attns, growth_dampings): + decoder_growth_attns.append(torch.einsum('bth,oh->bhot', [growth_attn.squeeze(-1), growth_damping])) + + season_attns = torch.stack(season_attns, dim=0)[:, :, -self.pred_len:] + season_attns = reduce(season_attns, 'l b d o t -> b o t', reduction='mean') + decoder_growth_attns = torch.stack(decoder_growth_attns, dim=0)[:, :, -self.pred_len:] + decoder_growth_attns = reduce(decoder_growth_attns, 'l b d o t -> b o t', reduction='mean') + return preds, season_attns, decoder_growth_attns + preds = preds.unsqueeze(-1) + return preds diff --git a/baselines/ETSformer/arch/exponential_smoothing.py b/baselines/ETSformer/arch/exponential_smoothing.py new file mode 100644 index 0000000..96c167d --- /dev/null +++ b/baselines/ETSformer/arch/exponential_smoothing.py @@ -0,0 +1,68 @@ +import math + +import torch +import torch.nn as nn +import torch.fft as fft + +from einops import rearrange, reduce, repeat +from scipy.fftpack import next_fast_len + + +def conv1d_fft(f, g, dim=-1): + N = f.size(dim) + M = g.size(dim) + + fast_len = next_fast_len(N + M - 1) + + F_f = fft.rfft(f, fast_len, dim=dim) + F_g = fft.rfft(g, fast_len, dim=dim) + + F_fg = F_f * F_g.conj() + out = fft.irfft(F_fg, fast_len, dim=dim) + out = out.roll((-1,), dims=(dim,)) + idx = torch.as_tensor(range(fast_len - N, fast_len)).to(out.device) + out = out.index_select(dim, idx) + + return out + + +class ExponentialSmoothing(nn.Module): + + def __init__(self, dim, nhead, dropout=0.1, aux=False): + super().__init__() + self._smoothing_weight = nn.Parameter(torch.randn(nhead, 1)) + self.v0 = nn.Parameter(torch.randn(1, 1, nhead, dim)) + self.dropout = nn.Dropout(dropout) + if aux: + self.aux_dropout = nn.Dropout(dropout) + + def forward(self, values, aux_values=None): + b, t, h, d = values.shape + + init_weight, weight = self.get_exponential_weight(t) + output = conv1d_fft(self.dropout(values), weight, dim=1) + output = init_weight * self.v0 + output + + if aux_values is not None: + aux_weight = weight / (1 - self.weight) * self.weight + aux_output = conv1d_fft(self.aux_dropout(aux_values), aux_weight) + output = output + aux_output + + return output + + def get_exponential_weight(self, T): + # Generate array [0, 1, ..., T-1] + powers = torch.arange(T, dtype=torch.float, device=self.weight.device) + + # (1 - \alpha) * \alpha^t, for all t = T-1, T-2, ..., 0] + weight = (1 - self.weight) * (self.weight ** torch.flip(powers, dims=(0,))) + + # \alpha^t for all t = 1, 2, ..., T + init_weight = self.weight ** (powers + 1) + + return rearrange(init_weight, 'h t -> 1 t h 1'), \ + rearrange(weight, 'h t -> 1 t h 1') + + @property + def weight(self): + return 
torch.sigmoid(self._smoothing_weight) diff --git a/baselines/ETSformer/arch/modules.py b/baselines/ETSformer/arch/modules.py new file mode 100644 index 0000000..fd2572c --- /dev/null +++ b/baselines/ETSformer/arch/modules.py @@ -0,0 +1,32 @@ +import torch.nn as nn +import torch.nn.functional as F + + +class ETSEmbedding(nn.Module): + def __init__(self, c_in, d_model, dropout=0.1): + super().__init__() + self.conv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=2, bias=False) + self.dropout = nn.Dropout(p=dropout) + nn.init.kaiming_normal_(self.conv.weight) + + def forward(self, x,): + + x = self.conv(x.permute(0,2,1))[..., :-2] + + return self.dropout(x.transpose(1,2)) + + +class Feedforward(nn.Module): + def __init__(self, d_model, dim_feedforward, dropout=0.1, activation='sigmoid'): + # Implementation of Feedforward model + super().__init__() + self.linear1 = nn.Linear(d_model, dim_feedforward, bias=False) + self.dropout1 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model, bias=False) + self.dropout2 = nn.Dropout(dropout) + self.activation = getattr(F, activation) + + def forward(self, x): + x = self.linear2(self.dropout1(self.activation(self.linear1(x)))) + return self.dropout2(x) diff --git a/baselines/FiLM/Electricity.py b/baselines/FiLM/Electricity.py new file mode 100644 index 0000000..1ff99a9 --- /dev/null +++ b/baselines/FiLM/Electricity.py @@ -0,0 +1,155 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import FiLM + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = FiLM +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 1, # attn factor + "d_model": 512, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. 
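+    # multiscale and window_size below configure the HiPPO-LegT projections and SpectralConv1d blocks (see arch/film_arch.py)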
+ "n_heads": 8, + "e_layers": 2, # num of encoder layers + "ratio": 0.5, + "multiscale" : [1, 2, 4], + "window_size" : [256], + "dropout": 0.05, + "freq": 'h', + "use_norm" : False, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 8 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 8 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 8 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use 
GPU for evaluation. Default: True diff --git a/baselines/FiLM/arch/__init__.py b/baselines/FiLM/arch/__init__.py new file mode 100644 index 0000000..e297243 --- /dev/null +++ b/baselines/FiLM/arch/__init__.py @@ -0,0 +1 @@ +from .film_arch import FiLM \ No newline at end of file diff --git a/baselines/FiLM/arch/film_arch.py b/baselines/FiLM/arch/film_arch.py new file mode 100644 index 0000000..efdfecb --- /dev/null +++ b/baselines/FiLM/arch/film_arch.py @@ -0,0 +1,154 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from scipy import signal +from scipy import special as ss + + + +def transition(N): + Q = np.arange(N, dtype=np.float64) + R = (2 * Q + 1)[:, None] # / theta + j, i = np.meshgrid(Q, Q) + A = np.where(i < j, -1, (-1.) ** (i - j + 1)) * R + B = (-1.) ** Q[:, None] * R + return A, B + + +class HiPPO_LegT(nn.Module): + def __init__(self, N, dt=1.0, discretization='bilinear'): + """ + N: the order of the HiPPO projection + dt: discretization step size - should be roughly inverse to the length of the sequence + """ + super(HiPPO_LegT, self).__init__() + self.N = N + A, B = transition(N) + C = np.ones((1, N)) + D = np.zeros((1,)) + A, B, _, _, _ = signal.cont2discrete((A, B, C, D), dt=dt, method=discretization) + + B = B.squeeze(-1) + + self.register_buffer('A', torch.Tensor(A)) + self.register_buffer('B', torch.Tensor(B)) + vals = np.arange(0.0, 1.0, dt) + self.register_buffer('eval_matrix', torch.Tensor( + ss.eval_legendre(np.arange(N)[:, None], 1 - 2 * vals).T)) + + def forward(self, inputs): + """ + inputs : (length, ...) + output : (length, ..., N) where N is the order of the HiPPO projection + """ + c = torch.zeros(inputs.shape[:-1] + tuple([self.N])).to(inputs.device) + cs = [] + for f in inputs.permute([-1, 0, 1]): + f = f.unsqueeze(-1) + new = f @ self.B.unsqueeze(0) + new = new.to(inputs.device) + c = F.linear(c, self.A.to(inputs.device)) + new + cs.append(c) + return torch.stack(cs, dim=0) + + def reconstruct(self, c): + return (self.eval_matrix @ c.unsqueeze(-1)).squeeze(-1) + + +class SpectralConv1d(nn.Module): + def __init__(self, in_channels, out_channels, seq_len, ratio=0.5): + """ + 1D Fourier layer. It does FFT, linear transform, and Inverse FFT. 
+ """ + super(SpectralConv1d, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.ratio = ratio + self.modes = min(32, seq_len // 2) + self.index = list(range(0, self.modes)) + + self.scale = (1 / (in_channels * out_channels)) + self.weights_real = nn.Parameter( + self.scale * torch.rand(in_channels, out_channels, len(self.index), dtype=torch.float)) + self.weights_imag = nn.Parameter( + self.scale * torch.rand(in_channels, out_channels, len(self.index), dtype=torch.float)) + + def compl_mul1d(self, order, x, weights_real, weights_imag): + return torch.complex(torch.einsum(order, x.real, weights_real) - torch.einsum(order, x.imag, weights_imag), + torch.einsum(order, x.real, weights_imag) + torch.einsum(order, x.imag, weights_real)) + + def forward(self, x): + B, H, E, N = x.shape + x_ft = torch.fft.rfft(x) + out_ft = torch.zeros(B, H, self.out_channels, x.size(-1) // 2 + 1, device=x.device, dtype=torch.cfloat) + a = x_ft[:, :, :, :self.modes] + out_ft[:, :, :, :self.modes] = self.compl_mul1d("bjix,iox->bjox", a, self.weights_real, self.weights_imag) + x = torch.fft.irfft(out_ft, n=x.size(-1)) + return x + + +class FiLM(nn.Module): + """ + Paper link: https://arxiv.org/abs/2205.08897 + """ + def __init__(self, **model_args): + super(FiLM, self).__init__() + + self.seq_len = model_args['seq_len'] + self.pred_len = self.seq_len if model_args['pred_len'] == 0 else model_args['pred_len'] + + self.output_attention = model_args['output_attention'] + self.layers = model_args['e_layers'] + self.enc_in = model_args['enc_in'] + self.e_layers = model_args['e_layers'] + # b, s, f means b, f + self.affine_weight = nn.Parameter(torch.ones(1, 1, model_args['enc_in'])) + self.affine_bias = nn.Parameter(torch.zeros(1, 1, model_args['enc_in'])) + + self.multiscale = model_args['multiscale'] # 1 2 4 + self.window_size = model_args['window_size'] # 256 + self.ratio = model_args['ratio'] + self.legts = nn.ModuleList( + [HiPPO_LegT(N=n, dt=1. 
/ self.pred_len / i) for n in self.window_size for i in self.multiscale]) + self.spec_conv_1 = nn.ModuleList([SpectralConv1d(in_channels=n, out_channels=n, + seq_len=min(self.pred_len, self.seq_len), + ratio=self.ratio) for n in + self.window_size for _ in range(len(self.multiscale))]) + self.mlp = nn.Linear(len(self.multiscale) * len(self.window_size), 1) + + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs): + # Normalization from Non-stationary Transformer + x_enc = history_data[:, :, :, 0] + means = x_enc.mean(1, keepdim=True).detach() + x_enc = x_enc - means + stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5).detach() + x_enc /= stdev + + x_enc = x_enc * self.affine_weight + self.affine_bias + x_decs = [] + jump_dist = 0 + for i in range(0, len(self.multiscale) * len(self.window_size)): + x_in_len = self.multiscale[i % len(self.multiscale)] * self.pred_len + x_in = x_enc[:, -x_in_len:] + legt = self.legts[i] + x_in_c = legt(x_in.transpose(1, 2)).permute([1, 2, 3, 0])[:, :, :, jump_dist:] + out1 = self.spec_conv_1[i](x_in_c) + if self.seq_len >= self.pred_len: + x_dec_c = out1.transpose(2, 3)[:, :, self.pred_len - 1 - jump_dist, :] + else: + x_dec_c = out1.transpose(2, 3)[:, :, -1, :] + x_dec = x_dec_c @ legt.eval_matrix[-self.pred_len:, :].T + x_decs.append(x_dec) + x_dec = torch.stack(x_decs, dim=-1) + x_dec = self.mlp(x_dec).squeeze(-1).permute(0, 2, 1) + + # De-Normalization from Non-stationary Transformer + x_dec = x_dec - self.affine_bias + x_dec = x_dec / (self.affine_weight + 1e-10) + x_dec = x_dec * stdev + x_dec = x_dec + means + return x_dec.unsqueeze(-1) + diff --git a/baselines/Koopa/Electricity.py b/baselines/Koopa/Electricity.py new file mode 100644 index 0000000..b569f78 --- /dev/null +++ b/baselines/Koopa/Electricity.py @@ -0,0 +1,152 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import Koopa + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = Koopa +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "seg_len": 168, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "dynamic_dim": 512,#128 + "hidden_dim": 512, #256 # window size of moving average. This is a CRUCIAL hyper-parameter. 
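+    # dynamic_dim is the Koopman embedding size; hidden_dim/hidden_layers size the MLP encoder/decoder (see arch/koopa_arch.py)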
+ "hidden_layers": 3, + "num_blocks": 1, # num of encoder layers + "alpha" : 0.2, + "dropout": 0.05, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "multistep" : False, + "activation": "sigmoid", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. 
Default: True
diff --git a/baselines/Koopa/arch/__init__.py b/baselines/Koopa/arch/__init__.py
new file mode 100644
index 0000000..96273e7
--- /dev/null
+++ b/baselines/Koopa/arch/__init__.py
@@ -0,0 +1,2 @@
+from .koopa_arch import Koopa
+
diff --git a/baselines/Koopa/arch/koopa_arch.py b/baselines/Koopa/arch/koopa_arch.py
new file mode 100644
index 0000000..cdf7752
--- /dev/null
+++ b/baselines/Koopa/arch/koopa_arch.py
@@ -0,0 +1,341 @@
+import math
+import torch
+import torch.nn as nn
+
+
+class FourierFilter(nn.Module):
+    """
+    Fourier filter: decomposes the input into a time-variant and a time-invariant term
+    """
+    def __init__(self, mask_spectrum):
+        super(FourierFilter, self).__init__()
+        self.mask_spectrum = mask_spectrum
+
+    def forward(self, x):
+        xf = torch.fft.rfft(x, dim=1)
+        mask = torch.ones_like(xf)
+        mask[:, self.mask_spectrum, :] = 0
+        x_var = torch.fft.irfft(xf*mask, dim=1)
+        x_inv = x - x_var
+
+        return x_var, x_inv
+
+
+class MLP(nn.Module):
+    '''
+    Multilayer perceptron to encode/decode high-dimensional representations of sequential data
+    '''
+    def __init__(self,
+                 f_in,
+                 f_out,
+                 hidden_dim=128,
+                 hidden_layers=2,
+                 dropout=0.05,
+                 activation='tanh'):
+        super(MLP, self).__init__()
+        self.f_in = f_in
+        self.f_out = f_out
+        self.hidden_dim = hidden_dim
+        self.hidden_layers = hidden_layers
+        self.dropout = dropout
+        if activation == 'relu':
+            self.activation = nn.ReLU()
+        elif activation == 'tanh':
+            self.activation = nn.Tanh()
+        else:
+            raise NotImplementedError
+
+        layers = [nn.Linear(self.f_in, self.hidden_dim),
+                  self.activation, nn.Dropout(self.dropout)]
+        for i in range(self.hidden_layers-2):
+            layers += [nn.Linear(self.hidden_dim, self.hidden_dim),
+                       self.activation, nn.Dropout(dropout)]
+
+        layers += [nn.Linear(hidden_dim, f_out)]
+        self.layers = nn.Sequential(*layers)
+
+    def forward(self, x):
+        # x: B x S x f_in
+        # y: B x S x f_out
+        y = self.layers(x)
+        return y
+
+
+class KPLayer(nn.Module):
+    """
+    Finds the one-step transition of a linear system by DMD, applied iteratively for multi-step prediction
+    """
+    def __init__(self):
+        super(KPLayer, self).__init__()
+
+        self.K = None  # B E E
+
+    def one_step_forward(self, z, return_rec=False, return_K=False):
+        B, input_len, E = z.shape
+        assert input_len > 1, 'snapshots number should be larger than 1'
+        x, y = z[:, :-1], z[:, 1:]
+
+        # solve linear system
+        self.K = torch.linalg.lstsq(x, y).solution  # B E E
+        if torch.isnan(self.K).any():
+            print('Encounter K with nan, replace K by identity matrix')
+            self.K = torch.eye(self.K.shape[1]).to(self.K.device).unsqueeze(0).repeat(B, 1, 1)
+
+        z_pred = torch.bmm(z[:, -1:], self.K)
+        if return_rec:
+            z_rec = torch.cat((z[:, :1], torch.bmm(x, self.K)), dim=1)
+            return z_rec, z_pred
+
+        return z_pred
+
+    def forward(self, z, pred_len=1):
+        assert pred_len >= 1, 'prediction length should not be less than 1'
+        z_rec, z_pred = self.one_step_forward(z, return_rec=True)
+        z_preds = [z_pred]
+        for i in range(1, pred_len):
+            z_pred = torch.bmm(z_pred, self.K)
+            z_preds.append(z_pred)
+        z_preds = torch.cat(z_preds, dim=1)
+        return z_rec, z_preds
+
+
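Reviewer note: KPLayer above fits a per-sample Koopman operator K by batched least squares (DMD): given snapshots z, it solves the least-squares problem x @ K = y with x = z[:, :-1] and y = z[:, 1:], then advances the last snapshot with K. A minimal, self-contained sketch of that step; the shapes and the synthetic system are illustrative, not part of the PR:

import torch

B, T, E = 2, 8, 4                                # batch, snapshots, embedding dim (illustrative)
K_true = torch.eye(E) + 0.1 * torch.randn(E, E)  # synthetic linear dynamics z_{t+1} = z_t @ K_true
snaps = [torch.randn(B, E)]
for _ in range(T - 1):
    snaps.append(snaps[-1] @ K_true)
z = torch.stack(snaps, dim=1)                    # B x T x E

x, y = z[:, :-1], z[:, 1:]                       # consecutive snapshot pairs
K = torch.linalg.lstsq(x, y).solution            # B x E x E, one operator per sample
z_next = torch.bmm(z[:, -1:], K)                 # one-step forecast, B x 1 x E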
+class KPLayerApprox(nn.Module):
+    """
+    Finds the Koopman transition of a linear system by DMD, with a multistep K approximation
+    """
+    def __init__(self):
+        super(KPLayerApprox, self).__init__()
+
+        self.K = None       # B E E
+        self.K_step = None  # B E E
+
+    def forward(self, z, pred_len=1):
+        # z:      B L E, koopman invariance space representation
+        # z_rec:  B L E, reconstructed representation
+        # z_pred: B S E, forecasting representation
+        B, input_len, E = z.shape
+        assert input_len > 1, 'snapshots number should be larger than 1'
+        x, y = z[:, :-1], z[:, 1:]
+
+        # solve linear system
+        self.K = torch.linalg.lstsq(x, y).solution  # B E E
+
+        if torch.isnan(self.K).any():
+            print('Encounter K with nan, replace K by identity matrix')
+            self.K = torch.eye(self.K.shape[1]).to(self.K.device).unsqueeze(0).repeat(B, 1, 1)
+
+        z_rec = torch.cat((z[:, :1], torch.bmm(x, self.K)), dim=1)  # B L E
+
+        if pred_len <= input_len:
+            self.K_step = torch.linalg.matrix_power(self.K, pred_len)
+            if torch.isnan(self.K_step).any():
+                print('Encounter multistep K with nan, replace it by identity matrix')
+                self.K_step = torch.eye(self.K_step.shape[1]).to(self.K_step.device).unsqueeze(0).repeat(B, 1, 1)
+            z_pred = torch.bmm(z[:, -pred_len:, :], self.K_step)
+        else:
+            self.K_step = torch.linalg.matrix_power(self.K, input_len)
+            if torch.isnan(self.K_step).any():
+                print('Encounter multistep K with nan, replace it by identity matrix')
+                self.K_step = torch.eye(self.K_step.shape[1]).to(self.K_step.device).unsqueeze(0).repeat(B, 1, 1)
+            temp_z_pred, all_pred = z, []
+            for _ in range(math.ceil(pred_len / input_len)):
+                temp_z_pred = torch.bmm(temp_z_pred, self.K_step)
+                all_pred.append(temp_z_pred)
+            z_pred = torch.cat(all_pred, dim=1)[:, :pred_len, :]
+
+        return z_rec, z_pred
+
+
+class TimeVarKP(nn.Module):
+    """
+    Koopman predictor with DMD (analytical solution of the Koopman operator)
+    Utilizes local variations within individual sliding windows to predict the future of the time-variant term
+    """
+    def __init__(self,
+                 enc_in=8,
+                 input_len=96,
+                 pred_len=96,
+                 seg_len=24,
+                 dynamic_dim=128,
+                 encoder=None,
+                 decoder=None,
+                 multistep=False,
+                 ):
+        super(TimeVarKP, self).__init__()
+        self.input_len = input_len
+        self.pred_len = pred_len
+        self.enc_in = enc_in
+        self.seg_len = seg_len
+        self.dynamic_dim = dynamic_dim
+        self.multistep = multistep
+        self.encoder, self.decoder = encoder, decoder
+        self.freq = math.ceil(self.input_len / self.seg_len)  # segment number of input
+        self.step = math.ceil(self.pred_len / self.seg_len)   # segment number of output
+        self.padding_len = self.seg_len * self.freq - self.input_len
+        # Approximate multistep K by KPLayerApprox when pred_len is large
+        self.dynamics = KPLayerApprox() if self.multistep else KPLayer()
+
+    def forward(self, x):
+        # x: B L C
+        B, L, C = x.shape
+
+        res = torch.cat((x[:, L-self.padding_len:, :], x), dim=1)
+
+        res = res.chunk(self.freq, dim=1)                        # F x B P C, P means seg_len
+        res = torch.stack(res, dim=1).reshape(B, self.freq, -1)  # B F PC
+
+        res = self.encoder(res)                                  # B F H
+        x_rec, x_pred = self.dynamics(res, self.step)            # B F H, B S H
+
+        x_rec = self.decoder(x_rec)                              # B F PC
+        x_rec = x_rec.reshape(B, self.freq, self.seg_len, self.enc_in)
+        x_rec = x_rec.reshape(B, -1, self.enc_in)[:, :self.input_len, :]  # B L C
+
+        x_pred = self.decoder(x_pred)                            # B S PC
+        x_pred = x_pred.reshape(B, self.step, self.seg_len, self.enc_in)
+        x_pred = x_pred.reshape(B, -1, self.enc_in)[:, :self.pred_len, :]  # B S C
+
+        return x_rec, x_pred
+
+
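Reviewer note: TimeVarKP's forward left-pads the lookback window with its own tail so its length becomes a multiple of seg_len, then chunks it into freq segments and flattens each one before encoding. A minimal sketch of that reshape; the values are illustrative:

import math
import torch

B, L, C, seg_len = 2, 96, 3, 24                           # illustrative shapes
x = torch.randn(B, L, C)

freq = math.ceil(L / seg_len)                             # number of segments
padding_len = seg_len * freq - L                          # 0 here, since 96 is a multiple of 24
res = torch.cat((x[:, L - padding_len:, :], x), dim=1)    # left-pad by reusing the tail
res = torch.stack(res.chunk(freq, dim=1), dim=1)          # B x freq x seg_len x C
res = res.reshape(B, freq, seg_len * C)                   # one flat vector per segment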
+class TimeInvKP(nn.Module):
+    """
+    Koopman predictor with a learnable Koopman operator
+    Utilizes lookback and forecast window snapshots to predict the future of the time-invariant term
+    """
+    def __init__(self,
+                 input_len=96,
+                 pred_len=96,
+                 dynamic_dim=128,
+                 encoder=None,
+                 decoder=None):
+        super(TimeInvKP, self).__init__()
+        self.dynamic_dim = dynamic_dim
+        self.input_len = input_len
+        self.pred_len = pred_len
+        self.encoder = encoder
+        self.decoder = decoder
+
+        K_init = torch.randn(self.dynamic_dim, self.dynamic_dim)
+        U, _, V = torch.svd(K_init)  # stable initialization
+        self.K = nn.Linear(self.dynamic_dim, self.dynamic_dim, bias=False)
+        self.K.weight.data = torch.mm(U, V.t())
+
+    def forward(self, x):
+        # x: B L C
+        res = x.transpose(1, 2)    # B C L
+        res = self.encoder(res)    # B C H
+        res = self.K(res)          # B C H
+        res = self.decoder(res)    # B C S
+        res = res.transpose(1, 2)  # B S C
+
+        return res
+
+
+class Koopa(nn.Module):
+    '''
+    Koopman Forecasting Model
+    '''
+    def __init__(self, **model_args):
+        super(Koopa, self).__init__()
+        self.mask_spectrum = None         # initialized to None; computed from training data
+        self.alpha = model_args['alpha']  # assumes the config provides an 'alpha' entry
+        self.enc_in = model_args['enc_in']
+        self.input_len = model_args['seq_len']
+        self.pred_len = model_args['pred_len']
+        self.seg_len = model_args['seg_len']
+        self.num_blocks = model_args['num_blocks']
+        self.dynamic_dim = model_args['dynamic_dim']
+        self.hidden_dim = model_args['hidden_dim']
+        self.hidden_layers = model_args['hidden_layers']
+        self.multistep = model_args['multistep']
+        self.amps = 0.0
+        self.disentanglement = FourierFilter(self.mask_spectrum)
+
+        # shared encoder/decoder to make koopman embedding consistent
+        self.time_inv_encoder = MLP(f_in=self.input_len, f_out=self.dynamic_dim, activation='relu',
+                                    hidden_dim=self.hidden_dim, hidden_layers=self.hidden_layers)
+        self.time_inv_decoder = MLP(f_in=self.dynamic_dim, f_out=self.pred_len, activation='relu',
+                                    hidden_dim=self.hidden_dim, hidden_layers=self.hidden_layers)
+        self.time_inv_kps = self.time_var_kps = nn.ModuleList([
+                                TimeInvKP(input_len=self.input_len,
+                                          pred_len=self.pred_len,
+                                          dynamic_dim=self.dynamic_dim,
+                                          encoder=self.time_inv_encoder,
+                                          decoder=self.time_inv_decoder)
+                                for _ in range(self.num_blocks)])
+
+        # shared encoder/decoder to make koopman embedding consistent
+        self.time_var_encoder = MLP(f_in=self.seg_len*self.enc_in, f_out=self.dynamic_dim, activation='tanh',
+                                    hidden_dim=self.hidden_dim, hidden_layers=self.hidden_layers)
+        self.time_var_decoder = MLP(f_in=self.dynamic_dim, f_out=self.seg_len*self.enc_in, activation='tanh',
+                                    hidden_dim=self.hidden_dim, hidden_layers=self.hidden_layers)
+        self.time_var_kps = nn.ModuleList([
+                                TimeVarKP(enc_in=self.enc_in,
+                                          input_len=self.input_len,
+                                          pred_len=self.pred_len,
+                                          seg_len=self.seg_len,
+                                          dynamic_dim=self.dynamic_dim,
+                                          encoder=self.time_var_encoder,
+                                          decoder=self.time_var_decoder,
+                                          multistep=self.multistep)
+                                for _ in range(self.num_blocks)])
+
+    def _get_mask_spectrum(self, train_loader):
+        """
+        get shared frequency spectra
+        """
+
+        for data in train_loader:
+            lookback_window = data
+            self.amps += abs(torch.fft.rfft(lookback_window, dim=1)).mean(dim=0).mean(dim=1)
+        mask_spectrum = self.amps.topk(int(self.amps.shape[0]*self.alpha)).indices
+
+        return mask_spectrum
+
+    def train_model(self, train_loader):
+        # compute the mask_spectrum during the training stage
+        self.mask_spectrum = self._get_mask_spectrum(train_loader)
+        self.disentanglement = FourierFilter(self.mask_spectrum)
+        # continue with the training procedure...
+
+    def test_model(self):
+        if self.mask_spectrum is None:
+            raise ValueError("Model has not been trained yet.")
+        # at test time, directly reuse the mask_spectrum computed during training
+        # continue with the testing procedure...
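Reviewer note: _get_mask_spectrum keeps a running sum in self.amps across calls and selects the alpha fraction of frequency bins with the largest average amplitude; FourierFilter then zeroes those bins, so x_var is the remainder and x_inv = x - x_var carries the dominant frequencies. A minimal sketch of one such split; alpha = 0.2 and the shapes are illustrative:

import torch

x = torch.randn(2, 96, 3)                        # B x L x C lookback windows (illustrative)
amps = torch.fft.rfft(x, dim=1).abs().mean(dim=0).mean(dim=1)  # amplitude per frequency bin
mask_spectrum = amps.topk(int(amps.shape[0] * 0.2)).indices    # alpha = 0.2

xf = torch.fft.rfft(x, dim=1)
mask = torch.ones_like(xf)
mask[:, mask_spectrum, :] = 0                    # zero out the dominant frequencies
x_var = torch.fft.irfft(xf * mask, dim=1)        # time-variant term
x_inv = x - x_var                                # time-invariant term (dominant frequencies)
print(torch.allclose(x_var + x_inv, x))          # True by construction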
+
+    def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, **kwargs):
+        # x_enc: B L C
+        x_enc = history_data[:, :, :, 0]
+
+        # Series Stationarization adopted from NSformer
+        mean_enc = x_enc.mean(1, keepdim=True).detach()  # B x 1 x E
+        x_enc = x_enc - mean_enc
+        std_enc = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5).detach()
+        x_enc = x_enc / std_enc
+
+        # compute the mask_spectrum from the current batch during training
+        if train:
+            self.mask_spectrum = self._get_mask_spectrum([history_data[:, :, :, 0]])
+            self.disentanglement = FourierFilter(self.mask_spectrum)
+
+        # Koopman Forecasting
+        residual, forecast = x_enc, None
+        for i in range(self.num_blocks):
+            time_var_input, time_inv_input = self.disentanglement(residual)
+            time_inv_output = self.time_inv_kps[i](time_inv_input)
+            time_var_backcast, time_var_output = self.time_var_kps[i](time_var_input)
+            residual = residual - time_var_backcast
+            if forecast is None:
+                forecast = (time_inv_output + time_var_output)
+            else:
+                forecast += (time_inv_output + time_var_output)
+
+        # Series De-stationarization adopted from NSformer
+        res = forecast * std_enc + mean_enc
+        res = res.unsqueeze(-1)
+        return res
+
diff --git a/baselines/LightTS/Electricity.py b/baselines/LightTS/Electricity.py
new file mode 100644
index 0000000..91387f4
--- /dev/null
+++ b/baselines/LightTS/Electricity.py
@@ -0,0 +1,157 @@
+import os
+import sys
+from easydict import EasyDict
+sys.path.append(os.path.abspath(__file__ + '/../../..'))
+from basicts.metrics import masked_mae, masked_mse
+from basicts.data import TimeSeriesForecastingDataset
+from basicts.runners import SimpleTimeSeriesForecastingRunner
+from basicts.scaler import ZScoreScaler
+from basicts.utils import get_regular_settings
+
+from .arch import LightTS
+
+############################## Hot Parameters ##############################
+# Dataset & Metrics configuration
+DATA_NAME = 'Electricity'  # Dataset name
+regular_settings = get_regular_settings(DATA_NAME)
+INPUT_LEN = regular_settings['INPUT_LEN']  # Length of input sequence
+OUTPUT_LEN = regular_settings['OUTPUT_LEN']  # Length of output sequence
+TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO']  # Train/Validation/Test split ratios
+NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL']  # Whether to normalize each channel of the data
+RESCALE = regular_settings['RESCALE']  # Whether to rescale the data
+NULL_VAL = regular_settings['NULL_VAL']  # Null value in the data
+# Model architecture and parameters
+MODEL_ARCH = LightTS
+NUM_NODES = 321
+MODEL_PARAM = {
+    "enc_in": NUM_NODES,  # num nodes
+    "dec_in": NUM_NODES,
+    "c_out": NUM_NODES,
+    "seq_len": INPUT_LEN,
+    "label_len": INPUT_LEN/2,  # start token length used in decoder
+    "pred_len": OUTPUT_LEN,  # prediction sequence length
+    "factor": 3,  # attn factor
+    "chunk_size": 24,
+    "d_model": 32,  # 512
+    "moving_avg": 25,  # window size of moving average. This is a CRUCIAL hyper-parameter.
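+    # chunk_size above reshapes the length-seq_len input into [num_chunks, chunk_size] (see arch/lightts_arch.py)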
+ "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 64, # 2048 + "distil": True, + "sigma" : 0.2, + "dropout": 0.3, + "freq": 'h', + "use_norm" : True, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] 
+CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/LightTS/arch/__init__.py b/baselines/LightTS/arch/__init__.py new file mode 100644 index 0000000..dbf4711 --- /dev/null +++ b/baselines/LightTS/arch/__init__.py @@ -0,0 +1 @@ +from .lightts_arch import LightTS \ No newline at end of file diff --git a/baselines/LightTS/arch/lightts_arch.py b/baselines/LightTS/arch/lightts_arch.py new file mode 100644 index 0000000..eb6809d --- /dev/null +++ b/baselines/LightTS/arch/lightts_arch.py @@ -0,0 +1,135 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class IEBlock(nn.Module): + def __init__(self, input_dim, hid_dim, output_dim, num_node): + super(IEBlock, self).__init__() + + self.input_dim = input_dim + self.hid_dim = hid_dim + self.output_dim = output_dim + self.num_node = num_node + + self._build() + + def _build(self): + self.spatial_proj = nn.Sequential( + nn.Linear(self.input_dim, self.hid_dim), + nn.LeakyReLU(), + nn.Linear(self.hid_dim, self.hid_dim // 4) + ) + + self.channel_proj = nn.Linear(self.num_node, self.num_node) + torch.nn.init.eye_(self.channel_proj.weight) + + self.output_proj = nn.Linear(self.hid_dim // 4, self.output_dim) + + def forward(self, x): + x = self.spatial_proj(x.permute(0, 2, 1)) + x = x.permute(0, 2, 1) + self.channel_proj(x.permute(0, 2, 1)) + x = self.output_proj(x.permute(0, 2, 1)) + + x = x.permute(0, 2, 1) + + return x + + +class LightTS(nn.Module): + """ + Paper link: https://arxiv.org/abs/2207.01186 + """ + + def __init__(self, **model_args): + """ + chunk_size: int, reshape T into [num_chunks, chunk_size] + """ + super(LightTS, self).__init__() + + self.pred_len = model_args['pred_len'] + self.seq_len = model_args['seq_len'] + chunk_size = model_args['chunk_size'] + self.chunk_size = min(self.pred_len, self.seq_len, chunk_size) + + assert (self.seq_len % self.chunk_size == 0) + self.num_chunks = self.seq_len // self.chunk_size + + self.d_model = model_args['d_model'] + self.enc_in = model_args['enc_in'] + self.dropout = model_args["dropout"] + self._build() + + def _build(self): + self.layer_1 = IEBlock( + input_dim=self.chunk_size, + hid_dim=self.d_model // 4, + output_dim=self.d_model // 4, + num_node=self.num_chunks + ) + + self.chunk_proj_1 = nn.Linear(self.num_chunks, 1) + + self.layer_2 = IEBlock( + input_dim=self.chunk_size, + hid_dim=self.d_model // 4, + output_dim=self.d_model // 4, + num_node=self.num_chunks + ) + + self.chunk_proj_2 = nn.Linear(self.num_chunks, 1) + + self.layer_3 = IEBlock( + input_dim=self.d_model // 2, + hid_dim=self.d_model // 2, + output_dim=self.pred_len, + num_node=self.enc_in + ) + + self.ar = nn.Linear(self.seq_len, self.pred_len) + + def encoder(self, x): + B, T, N = x.size() + + highway = self.ar(x.permute(0, 2, 1)) + highway = highway.permute(0, 2, 1) + + # continuous sampling + x1 = x.reshape(B, self.num_chunks, self.chunk_size, N) + x1 = x1.permute(0, 3, 2, 1) + x1 = x1.reshape(-1, self.chunk_size, self.num_chunks) + x1 = self.layer_1(x1) + x1 = self.chunk_proj_1(x1).squeeze(dim=-1) + + # interval sampling + x2 = x.reshape(B, self.chunk_size, self.num_chunks, N) + x2 = x2.permute(0, 3, 1, 2) + x2 = x2.reshape(-1, self.chunk_size, self.num_chunks) + x2 = self.layer_2(x2) + x2 = self.chunk_proj_2(x2).squeeze(dim=-1) + + x3 = torch.cat([x1, x2], dim=-1) + + x3 = x3.reshape(B, N, -1) + x3 = x3.permute(0, 2, 1) + + out = self.layer_3(x3) + + out = out + highway + return out + + def forward(self, history_data: torch.Tensor, future_data: 
torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + """ + + Args: + history_data (Tensor): Input data with shape: [B, L1, N, C] + future_data (Tensor): Future data with shape: [B, L2, N, C] + + Returns: + torch.Tensor: outputs with shape [B, L2, N, 1] + """ + x_enc = history_data[:, :, :, 0] + dec_out = self.encoder(x_enc) + return dec_out[:, -self.pred_len:, :].unsqueeze(-1) # [B, L2, N, 1] + diff --git a/baselines/MTSMixer/Electricity.py b/baselines/MTSMixer/Electricity.py new file mode 100644 index 0000000..fe5de90 --- /dev/null +++ b/baselines/MTSMixer/Electricity.py @@ -0,0 +1,164 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import MTSMixer + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = MTSMixer +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 3, # attn factor + "p_hidden_dims": [128, 128], + "p_hidden_layers": 2, + "d_model": 512, + "individual": False, + "seg" : 20, + "fac_T" : False, + "sampling" : 2, + "fac_C" : False, + "rev" : True, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter.
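+ # NOTE: MTSMixer itself reads seq_len, pred_len, enc_in, dec_in, d_model, d_ff, use_norm, e_layers, + # fac_T, fac_C, sampling, individual and rev (see mtsmixer_arch.py below); the other fields are + # template filler shared across baseline configs.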
+ "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 16, + "distil": True, + "sigma" : 0.2, + "dropout": 0.05, + "freq": 'h', + "use_norm" : True, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 16 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = 
True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/MTSMixer/arch/Invertible.py b/baselines/MTSMixer/arch/Invertible.py new file mode 100644 index 0000000..6492b9a --- /dev/null +++ b/baselines/MTSMixer/arch/Invertible.py @@ -0,0 +1,105 @@ +import torch +import torch.nn as nn + +class ModifiedLayerNorm(nn.Module): + """ + Modified Layer Normalization normalizes vectors along the channel and temporal dimensions. + Input: tensor in shape [B, L, D] + """ + def __init__(self, num_channels, eps=1e-05): + super().__init__() + # The shape of the learnable affine parameters is also [num_channels, ], the same as in vanilla Layer Normalization. + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.eps = eps + + def forward(self, x): + x = x.transpose(1, 2) + u = x.mean([1, 2], keepdim=True) # mean over the channel and temporal dimensions + s = (x - u).pow(2).mean([1, 2], keepdim=True) # variance over the channel and temporal dimensions + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight.unsqueeze(-1) * x + self.bias.unsqueeze(-1) + + return x.transpose(1, 2) + + +class RevIN(nn.Module): + def __init__(self, num_features: int, eps=1e-5, affine=True): + """ + :param num_features: the number of features or channels + :param eps: a value added for numerical stability + :param affine: if True, RevIN has learnable affine parameters + """ + super(RevIN, self).__init__() + + self.num_features = num_features + self.eps = eps + self.affine = affine + + if self.affine: + self._init_params() + + def forward(self, x, mode:str): + if mode == 'norm': + self._get_statistics(x) + x = self._normalize(x) + + elif mode == 'denorm': + x = self._denormalize(x) + + else: raise NotImplementedError + + return x + + def _init_params(self): + # initialize RevIN params: (C,) + self.affine_weight = nn.Parameter(torch.ones(self.num_features)) + self.affine_bias = nn.Parameter(torch.zeros(self.num_features)) + + def _get_statistics(self, x): + dim2reduce = tuple(range(1, x.ndim-1)) + self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach() + self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach() + + def _normalize(self, x): + x = x - self.mean + x = x / self.stdev + if self.affine: + x = x * self.affine_weight + x = x + self.affine_bias + + return x + + def _denormalize(self, x): + if self.affine: + x = x - self.affine_bias + x = x / (self.affine_weight + self.eps*self.eps) + x = x * self.stdev + x = x + self.mean + + return x + + +class InvDiff(nn.Module): + def __init__(self, num_features: int): + super(InvDiff, self).__init__() + + self.num_features = num_features + self.pivot = None + + def forward(self, x, mode): + if mode == 'diff': + self.pivot = x[:, -1] + x = torch.diff(x, dim=1) + + return x + + elif mode == 'restore': + y = torch.zeros_like(x) + # undo the differencing cumulatively: y[0] = x[0] + pivot, y[t] = x[t] + y[t-1] + y[:, 0] = x[:, 0] + self.pivot + for idx in range(1, y.shape[1]): + y[:, idx] = x[:, idx] + y[:, idx-1] + + return y + + else: raise NotImplementedError diff --git a/baselines/MTSMixer/arch/Projection.py b/baselines/MTSMixer/arch/Projection.py new file mode 100644 index 0000000..94c955c --- /dev/null +++ b/baselines/MTSMixer/arch/Projection.py @@ -0,0 +1,25 @@ +import torch +import torch.nn as nn + +class ChannelProjection(nn.Module): + def __init__(self, seq_len, pred_len, num_channel, individual): + super().__init__() + + self.linears = nn.ModuleList([ + nn.Linear(seq_len, pred_len) for _ in range(num_channel) + ]) if
individual else nn.Linear(seq_len, pred_len) + self.individual = individual + + def forward(self, x): + # x: [B, L, D] + x_out = [] + if self.individual: + for idx in range(x.shape[-1]): + x_out.append(self.linears[idx](x[:, :, idx])) + + x = torch.stack(x_out, dim=-1) + + else: x = self.linears(x.transpose(1, 2)).transpose(1, 2) + + return x diff --git a/baselines/MTSMixer/arch/__init__.py b/baselines/MTSMixer/arch/__init__.py new file mode 100644 index 0000000..104d220 --- /dev/null +++ b/baselines/MTSMixer/arch/__init__.py @@ -0,0 +1 @@ +from .mtsmixer_arch import MTSMixer \ No newline at end of file diff --git a/baselines/MTSMixer/arch/decomposition.py b/baselines/MTSMixer/arch/decomposition.py new file mode 100644 index 0000000..08f76b3 --- /dev/null +++ b/baselines/MTSMixer/arch/decomposition.py @@ -0,0 +1,58 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import repeat, rearrange +from contextlib import contextmanager + +def svd_denoise(x, cut): + x_ = x.clone().detach() + U, S, V = torch.linalg.svd(x_, full_matrices=False) + S[:, cut:] = 0 + + return U @ torch.diag(S[0, :]) @ V + +@contextmanager +def null_context(): + yield + +def exists(val): + return val is not None + +def default(val, d): + return val if exists(val) else d + +class NMF(nn.Module): + def __init__(self, dim, n, ratio=8, K=6, eps=2e-8): + super().__init__() + r = dim // ratio + + D = torch.zeros(dim, r).uniform_(0, 1) + C = torch.zeros(r, n).uniform_(0, 1) + + self.K = K + self.D = nn.Parameter(D) + self.C = nn.Parameter(C) + + self.eps = eps + + def forward(self, x): + b, D, C, eps = x.shape[0], self.D, self.C, self.eps + + # x is made non-negative with ReLU, as proposed in the paper + x = F.relu(x) + + D = repeat(D, 'd r -> b d r', b = b) + C = repeat(C, 'r n -> b r n', b = b) + + # helper to transpose the last two dims + t = lambda tensor: rearrange(tensor, 'b i j -> b j i') + + for k in reversed(range(self.K)): + # only compute gradients on the last step, following the paper's 'one-step gradient' scheme + context = null_context if k == 0 else torch.no_grad + with context(): + C_new = C * ((t(D) @ x) / ((t(D) @ D @ C) + eps)) + D_new = D * ((x @ t(C)) / ((D @ C @ t(C)) + eps)) + C, D = C_new, D_new + + return D @ C \ No newline at end of file diff --git a/baselines/MTSMixer/arch/mtsmixer_arch.py b/baselines/MTSMixer/arch/mtsmixer_arch.py new file mode 100644 index 0000000..435f8bf --- /dev/null +++ b/baselines/MTSMixer/arch/mtsmixer_arch.py @@ -0,0 +1,128 @@ +import torch +import torch.nn as nn +from .Invertible import RevIN +from .Projection import ChannelProjection +from .decomposition import svd_denoise, NMF + +class MLPBlock(nn.Module): + def __init__(self, input_dim, mlp_dim): + super().__init__() + self.fc1 = nn.Linear(input_dim, mlp_dim) + self.gelu = nn.GELU() + self.fc2 = nn.Linear(mlp_dim, input_dim) + + def forward(self, x): + # [B, L, D] or [B, D, L] + return self.fc2(self.gelu(self.fc1(x))) + + +class FactorizedTemporalMixing(nn.Module): + def __init__(self, input_dim, mlp_dim, sampling): + super().__init__() + + assert sampling in [1, 2, 3, 4, 6, 8, 12] + self.sampling = sampling + self.temporal_fac = nn.ModuleList([ + MLPBlock(input_dim // sampling, mlp_dim) for _ in range(sampling) + ]) + + def merge(self, shape, x_list): + y = torch.zeros(shape, device=x_list[0].device) + for idx, x_pad in enumerate(x_list): + y[:, :, idx::self.sampling] = x_pad + + return y + + def forward(self, x): + x_samp = [] + for idx, samp in enumerate(self.temporal_fac):
x_samp.append(samp(x[:, :, idx::self.sampling])) + + x = self.merge(x.shape, x_samp) + + return x + + +class FactorizedChannelMixing(nn.Module): + def __init__(self, input_dim, factorized_dim): + super().__init__() + + assert input_dim > factorized_dim + self.channel_mixing = MLPBlock(input_dim, factorized_dim) + + def forward(self, x): + + return self.channel_mixing(x) + + +class MixerBlock(nn.Module): + def __init__(self, tokens_dim, channels_dim, tokens_hidden_dim, channels_hidden_dim, fac_T, fac_C, sampling, norm_flag): + super().__init__() + self.tokens_mixing = FactorizedTemporalMixing(tokens_dim, tokens_hidden_dim, sampling) if fac_T else MLPBlock(tokens_dim, tokens_hidden_dim) + self.channels_mixing = FactorizedChannelMixing(channels_dim, channels_hidden_dim) if fac_C else None + self.norm = nn.LayerNorm(channels_dim) if norm_flag else None + + def forward(self, x): + # token-mixing [B, D, #tokens] + y = self.norm(x) if self.norm else x + y = self.tokens_mixing(y.transpose(1, 2)).transpose(1, 2) + + # channel-mixing [B, #tokens, D] + if self.channels_mixing: + y += x + res = y + y = self.norm(y) if self.norm else y + y = res + self.channels_mixing(y) + + return y + + +class MTSMixer(nn.Module): + def __init__(self, **model_args): + super(MTSMixer, self).__init__() + self.pred_len = model_args['pred_len'] + self.seq_len = model_args['seq_len'] + self.enc_in = model_args['enc_in'] + self.dec_in = model_args['dec_in'] + self.d_model = model_args['d_model'] + self.d_ff = model_args['d_ff'] + self.norm = model_args['use_norm'] + self.e_layers = model_args['e_layers'] + self.fac_T = model_args['fac_T'] + self.fac_C = model_args['fac_C'] + self.sampling = model_args['sampling'] + self.individual = model_args['individual'] + self.rev = model_args['rev'] + self.mlp_blocks = nn.ModuleList([ + MixerBlock(self.seq_len, self.enc_in, self.d_model, self.d_ff, self.fac_T, self.fac_C, self.sampling, self.norm) + for _ in range(self.e_layers) + ]) + self.norm = nn.LayerNorm(self.enc_in) if self.norm else None + self.projection = ChannelProjection(self.seq_len, self.pred_len, self.enc_in, self.individual) + self.rev = RevIN(self.enc_in) if self.rev else None + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + """ + + Args: + history_data (Tensor): Input data with shape: [B, L1, N, C] + future_data (Tensor): Future data with shape: [B, L2, N, C] + + Returns: + torch.Tensor: outputs with shape [B, L2, N, 1] + """ + x = history_data[:, :, :, 0] + x = self.rev(x, 'norm') if self.rev else x + + for block in self.mlp_blocks: + x = block(x) + + x = self.norm(x) if self.norm else x + x = self.projection(x) + x = self.rev(x, 'denorm') if self.rev else x + + return x.unsqueeze(-1) diff --git a/baselines/Nonstationary_Transformer/Electricity.py b/baselines/Nonstationary_Transformer/Electricity.py new file mode 100644 index 0000000..971592e --- /dev/null +++ b/baselines/Nonstationary_Transformer/Electricity.py @@ -0,0 +1,157 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import
SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import Nonstationary_Transformer + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = Nonstationary_Transformer +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 3, # attn factor + "p_hidden_dims": [128, 128], + "p_hidden_layers": 2, + "d_model": 512, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 2048, + "distil": True, + "sigma" : 0.2, + "dropout": 0.05, + "freq": 'h', + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration 
############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/Nonstationary_Transformer/arch/Embed.py b/baselines/Nonstationary_Transformer/arch/Embed.py new file mode 100644 index 0000000..6ef5503 --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/Embed.py @@ -0,0 +1,132 @@ +import torch +import torch.nn as nn +import math + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. 
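+ # PE(pos, 2i) = sin(pos / 10000^(2i/d_model)); PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))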
+ pe = torch.zeros(max_len, d_model).float() + pe.requires_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.requires_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + + minute_x = self.minute_embed(x[:, :, 4]) if hasattr(self, 'minute_embed') else 0.
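+ # time-mark columns are assumed to be ordered [month, day, weekday, hour, minute]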
+ hour_x = self.hour_embed(x[:, :, 3]) + weekday_x = self.weekday_embed(x[:, :, 2]) + day_x = self.day_embed(x[:, :, 1]) + month_x = self.month_embed(x[:, :, 0]) + + return hour_x + weekday_x + day_x + month_x + minute_x + + +class TimeFeatureEmbedding(nn.Module): + def __init__(self, d_model, embed_type='timeF', freq='h'): + super(TimeFeatureEmbedding, self).__init__() + + freq_map = {'h': 4, 't': 5, 's': 6, 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3} + d_inp = freq_map[freq] + self.embed = nn.Linear(d_inp, d_model, bias=False) + + def forward(self, x): + return self.embed(x) + + +class DataEmbedding(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + x = self.value_embedding(x) + self.temporal_embedding(x_mark) + self.position_embedding(x) + return self.dropout(x) + + +class DataEmbedding_wo_pos(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding_wo_pos, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + x = self.value_embedding(x) + self.temporal_embedding(x_mark) + return self.dropout(x) diff --git a/baselines/Nonstationary_Transformer/arch/SelfAttention_Family.py b/baselines/Nonstationary_Transformer/arch/SelfAttention_Family.py new file mode 100644 index 0000000..0e25b6f --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/SelfAttention_Family.py @@ -0,0 +1,172 @@ +import torch +import torch.nn as nn +import numpy as np +from math import sqrt +from .masking import TriangularCausalMask, ProbMask + + +class DSAttention(nn.Module): + '''De-stationary Attention''' + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(DSAttention, self).__init__() + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, H, E = queries.shape + _, S, _, D = values.shape + scale = self.scale or 1. 
/ sqrt(E) + + tau = 1.0 if tau is None else tau.unsqueeze(1).unsqueeze(1) # B x 1 x 1 x 1 + delta = 0.0 if delta is None else delta.unsqueeze(1).unsqueeze(1) # B x 1 x 1 x S + + # De-stationary Attention, rescaling pre-softmax score with learned de-stationary factors + scores = torch.einsum("blhe,bshe->bhls", queries, keys) * tau + delta + + if self.mask_flag: + if attn_mask is None: + attn_mask = TriangularCausalMask(B, L, device=queries.device) + + scores.masked_fill_(attn_mask.mask, -np.inf) + + A = self.dropout(torch.softmax(scale * scores, dim=-1)) + V = torch.einsum("bhls,bshd->blhd", A, values) + + if self.output_attention: + return (V.contiguous(), A) + else: + return (V.contiguous(), None) + + +class DSProbAttention(nn.Module): + '''De-stationary ProbAttention for Informer''' + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(DSProbAttention, self).__init__() + self.factor = factor + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def _prob_QK(self, Q, K, sample_k, n_top): # n_top: c*ln(L_q) + # Q [B, H, L, D] + B, H, L_K, E = K.shape + _, _, L_Q, _ = Q.shape + + # calculate the sampled Q_K + K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E) + index_sample = torch.randint(L_K, (L_Q, sample_k)) # real U = U_part(factor*ln(L_k))*L_q + K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :] + Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze() + + # find the top-k queries with the sparsity measurement + M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K) + M_top = M.topk(n_top, sorted=False)[1] + + # use the reduced Q to calculate Q_K + Q_reduce = Q[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + M_top, :] # factor*ln(L_q) + Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1)) # factor*ln(L_q)*L_k + + return Q_K, M_top + + def _get_initial_context(self, V, L_Q): + B, H, L_V, D = V.shape + if not self.mask_flag: + V_sum = V.mean(dim=-2) + contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone() + else: # use mask + assert (L_Q == L_V) # requires that L_Q == L_V, i.e.
for self-attention only + contex = V.cumsum(dim=-2) + return contex + + def _update_context(self, context_in, V, scores, index, L_Q, attn_mask): + B, H, L_V, D = V.shape + + if self.mask_flag: + attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device) + scores.masked_fill_(attn_mask.mask, -np.inf) + + attn = torch.softmax(scores, dim=-1) # nn.Softmax(dim=-1)(scores) + + context_in[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + index, :] = torch.matmul(attn, V).type_as(context_in) + if self.output_attention: + attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device) + attns[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = attn + return (context_in, attns) + else: + return (context_in, None) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L_Q, H, D = queries.shape + _, L_K, _, _ = keys.shape + + queries = queries.transpose(2, 1) + keys = keys.transpose(2, 1) + values = values.transpose(2, 1) + + U_part = self.factor * np.ceil(np.log(L_K)).astype('int').item() # c*ln(L_k) + u = self.factor * np.ceil(np.log(L_Q)).astype('int').item() # c*ln(L_q) + + U_part = U_part if U_part < L_K else L_K + u = u if u < L_Q else L_Q + + scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u) + + tau = 1.0 if tau is None else tau.unsqueeze(1).unsqueeze(1) # B x 1 x 1 x 1 + delta = 0.0 if delta is None else delta.unsqueeze(1).unsqueeze(1) # B x 1 x 1 x S + scores_top = scores_top * tau + delta + + # add scale factor + scale = self.scale or 1. / sqrt(D) + if scale is not None: + scores_top = scores_top * scale + # get the context + context = self._get_initial_context(values, L_Q) + # update the context with selected top_k queries + context, attn = self._update_context(context, values, scores_top, index, L_Q, attn_mask) + + return context.contiguous(), attn + + +class AttentionLayer(nn.Module): + def __init__(self, attention, d_model, n_heads, d_keys=None, + d_values=None): + super(AttentionLayer, self).__init__() + + d_keys = d_keys or (d_model // n_heads) + d_values = d_values or (d_model // n_heads) + + self.inner_attention = attention + self.query_projection = nn.Linear(d_model, d_keys * n_heads) + self.key_projection = nn.Linear(d_model, d_keys * n_heads) + self.value_projection = nn.Linear(d_model, d_values * n_heads) + self.out_projection = nn.Linear(d_values * n_heads, d_model) + self.n_heads = n_heads + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, _ = queries.shape + _, S, _ = keys.shape + H = self.n_heads + + queries = self.query_projection(queries).view(B, L, H, -1) + keys = self.key_projection(keys).view(B, S, H, -1) + values = self.value_projection(values).view(B, S, H, -1) + + out, attn = self.inner_attention( + queries, + keys, + values, + attn_mask, + tau, delta + ) + out = out.view(B, L, -1) + + return self.out_projection(out), attn + diff --git a/baselines/Nonstationary_Transformer/arch/Transformer_EncDec.py b/baselines/Nonstationary_Transformer/arch/Transformer_EncDec.py new file mode 100644 index 0000000..0e63f5e --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/Transformer_EncDec.py @@ -0,0 +1,143 @@ +import torch.nn as nn +import torch.nn.functional as F + + +class ConvLayer(nn.Module): + def __init__(self, c_in): + super(ConvLayer, self).__init__() + self.downConv = nn.Conv1d(in_channels=c_in, + out_channels=c_in, + kernel_size=3, + padding=2, + padding_mode='circular') + self.norm = nn.BatchNorm1d(c_in) + 
self.activation = nn.ELU() + self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1) + + def forward(self, x): + x = self.downConv(x.permute(0, 2, 1)) # BxExS + x = self.norm(x) + x = self.activation(x) + x = self.maxPool(x) + x = x.transpose(1, 2) + return x + + +class EncoderLayer(nn.Module): + def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.attention = attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, attn_mask=None, tau=None, delta=None): + new_x, attn = self.attention( + x, x, x, + attn_mask=attn_mask, + tau=tau, delta=delta + ) + x = x + self.dropout(new_x) + + y = x = self.norm1(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm2(x + y), attn + + +class Encoder(nn.Module): + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.norm = norm_layer + + def forward(self, x, attn_mask=None, tau=None, delta=None): + # x [B, L, D] + attns = [] + if self.conv_layers is not None: + # delta is only passed to the first attention block of the encoder: with Informer-style + # conv distillation, the attention length shrinks at every layer and no longer matches the + # initial sequence length, so delta cannot be added row-wise in deeper layers and is set + # to None (i.e. 0.0) there; see Appendix E.2 of the Non-stationary Transformer paper. + for i, (attn_layer, conv_layer) in enumerate(zip(self.attn_layers, self.conv_layers)): + delta = delta if i==0 else None + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + x = conv_layer(x) + attns.append(attn) + x, attn = self.attn_layers[-1](x, tau=tau, delta=None) + attns.append(attn) + else: + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + + return x, attns + + +class DecoderLayer(nn.Module): + def __init__(self, self_attention, cross_attention, d_model, d_ff=None, + dropout=0.1, activation="relu"): + super(DecoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + # Note: delta is only used for self-attention over x_enc (in the encoder) and for + # cross-attention of x_dec over x_enc; it is not applied to self-attention over x_dec + + x = x + self.dropout(self.self_attention( + x, x, x, + attn_mask=x_mask, + tau=tau, delta=None + )[0]) + x = self.norm1(x) + + x = x +
self.dropout(self.cross_attention( + x, cross, cross, + attn_mask=cross_mask, + tau=tau, delta=delta + )[0]) + + y = x = self.norm2(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm3(x + y) + + +class Decoder(nn.Module): + def __init__(self, layers, norm_layer=None, projection=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + for layer in self.layers: + x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask, tau=tau, delta=delta) + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) + return x diff --git a/baselines/Nonstationary_Transformer/arch/__init__.py b/baselines/Nonstationary_Transformer/arch/__init__.py new file mode 100644 index 0000000..c0a0fd5 --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/__init__.py @@ -0,0 +1 @@ +from .nstransformer_arch import Nonstationary_Transformer \ No newline at end of file diff --git a/baselines/Nonstationary_Transformer/arch/masking.py b/baselines/Nonstationary_Transformer/arch/masking.py new file mode 100644 index 0000000..a19cbf6 --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/masking.py @@ -0,0 +1,26 @@ +import torch + + +class TriangularCausalMask(): + def __init__(self, B, L, device="cpu"): + mask_shape = [B, 1, L, L] + with torch.no_grad(): + self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device) + + @property + def mask(self): + return self._mask + + +class ProbMask(): + def __init__(self, B, H, L, index, scores, device="cpu"): + _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1) + _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1]) + indicator = _mask_ex[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + index, :].to(device) + self._mask = indicator.view(scores.shape).to(device) + + @property + def mask(self): + return self._mask diff --git a/baselines/Nonstationary_Transformer/arch/nstransformer_arch.py b/baselines/Nonstationary_Transformer/arch/nstransformer_arch.py new file mode 100644 index 0000000..402d282 --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/nstransformer_arch.py @@ -0,0 +1,161 @@ +import torch +import torch.nn as nn +from .Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer +from .SelfAttention_Family import DSAttention, AttentionLayer +from .Embed import DataEmbedding +from basicts.utils import data_transformation_4_xformer +class Projector(nn.Module): + ''' + MLP to learn the De-stationary factors + ''' + + def __init__(self, enc_in, seq_len, hidden_dims, hidden_layers, output_dim, kernel_size=3): + super(Projector, self).__init__() + + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.series_conv = nn.Conv1d(in_channels=seq_len, out_channels=1, kernel_size=kernel_size, padding=padding, + padding_mode='circular', bias=False) + + layers = [nn.Linear(2 * enc_in, hidden_dims[0]), nn.ReLU()] + for i in range(hidden_layers - 1): + layers += [nn.Linear(hidden_dims[i], hidden_dims[i + 1]), nn.ReLU()] + + layers += [nn.Linear(hidden_dims[-1], output_dim, bias=False)] + self.backbone = nn.Sequential(*layers) + + def forward(self, x, stats): + # x: B x S x E + # stats: B x 1 x E + # y: B x O + batch_size = x.shape[0] + x = self.series_conv(x) # B x 1 x E + x = 
torch.cat([x, stats], dim=1) # B x 2 x E + x = x.view(batch_size, -1) # B x 2E + y = self.backbone(x) # B x O + + return y + + +class Nonstationary_Transformer(nn.Module): + """ + Non-stationary Transformer + """ + + def __init__(self, **model_args): + super(Nonstationary_Transformer, self).__init__() + self.pred_len = model_args['pred_len'] + self.seq_len = model_args['seq_len'] + self.label_len = int(model_args['label_len']) + self.output_attention = model_args['output_attention'] + self.enc_in = model_args['enc_in'] + self.dec_in = model_args['dec_in'] + self.c_out = model_args['c_out'] + self.factor = model_args["factor"] + self.d_model = model_args['d_model'] + self.n_heads = model_args['n_heads'] + self.d_ff = model_args['d_ff'] + self.embed = model_args['embed'] + self.freq = model_args["freq"] + self.dropout = model_args["dropout"] + self.activation = model_args['activation'] + self.e_layers = model_args['e_layers'] + self.d_layers = model_args['d_layers'] + self.p_hidden_dims = model_args['p_hidden_dims'] + self.p_hidden_layers = model_args['p_hidden_layers'] + # Embedding + self.enc_embedding = DataEmbedding(self.enc_in, self.d_model, self.embed, self.freq, + self.dropout) + self.dec_embedding = DataEmbedding(self.dec_in, self.d_model, self.embed, self.freq, + self.dropout) + # Encoder + self.encoder = Encoder( + [ + EncoderLayer( + AttentionLayer( + DSAttention(False, self.factor, attention_dropout=self.dropout, + output_attention=self.output_attention), self.d_model, self.n_heads), + self.d_model, + self.d_ff, + dropout=self.dropout, + activation=self.activation + ) for l in range(self.e_layers) + ], + norm_layer=torch.nn.LayerNorm(self.d_model) + ) + # Decoder + self.decoder = Decoder( + [ + DecoderLayer( + AttentionLayer( + DSAttention(True, self.factor, attention_dropout=self.dropout, output_attention=False), + self.d_model, self.n_heads), + AttentionLayer( + DSAttention(False, self.factor, attention_dropout=self.dropout, output_attention=False), + self.d_model, self.n_heads), + self.d_model, + self.d_ff, + dropout=self.dropout, + activation=self.activation, + ) + for l in range(self.d_layers) + ], + norm_layer=torch.nn.LayerNorm(self.d_model), + projection=nn.Linear(self.d_model, self.c_out, bias=True) + ) + + self.tau_learner = Projector(enc_in=self.enc_in, seq_len=self.seq_len, hidden_dims=self.p_hidden_dims, + hidden_layers=self.p_hidden_layers, output_dim=1) + self.delta_learner = Projector(enc_in=self.enc_in, seq_len=self.seq_len, + hidden_dims=self.p_hidden_dims, hidden_layers=self.p_hidden_layers, + output_dim=self.seq_len) + + def forward_xformer(self, x_enc: torch.Tensor, x_mark_enc: torch.Tensor, x_dec: torch.Tensor, + x_mark_dec: torch.Tensor, + enc_self_mask: torch.Tensor = None, dec_self_mask: torch.Tensor = None, + dec_enc_mask: torch.Tensor = None) -> torch.Tensor: + x_raw = x_enc.clone().detach() + + # Normalization + mean_enc = x_enc.mean(1, keepdim=True).detach() # B x 1 x E + x_enc = x_enc - mean_enc + std_enc = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5).detach() # B x 1 x E + x_enc = x_enc / std_enc + + x_dec_new = torch.cat([x_enc[:, -self.label_len:, :], torch.zeros_like(x_dec[:, -self.pred_len:, :])], + dim=1).to(x_enc.device).clone() + + tau = self.tau_learner(x_raw, std_enc).exp() # B x S x E, B x 1 x E -> B x 1, positive scalar + delta = self.delta_learner(x_raw, mean_enc) # B x S x E, B x 1 x E -> B x S + + # Model Inference + enc_out = self.enc_embedding(x_enc, x_mark_enc) + enc_out, attns = self.encoder(enc_out, 
attn_mask=enc_self_mask, tau=tau, delta=delta) + + dec_out = self.dec_embedding(x_dec_new, x_mark_dec) + dec_out = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask, tau=tau, delta=delta) + + # De-normalization + dec_out = dec_out * std_enc + mean_enc + + if self.output_attention: + return dec_out[:, -self.pred_len:, :].unsqueeze(-1), attns + else: + return dec_out[:, -self.pred_len:, :].unsqueeze(-1) # [B, L, D] + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + """ + + Args: + history_data (Tensor): Input data with shape: [B, L1, N, C] + future_data (Tensor): Future data with shape: [B, L2, N, C] + + Returns: + torch.Tensor: outputs with shape [B, L2, N, 1] + """ + + x_enc, x_mark_enc, x_dec, x_mark_dec = data_transformation_4_xformer(history_data=history_data, + future_data=future_data, + start_token_len=self.label_len) + prediction = self.forward_xformer(x_enc=x_enc, x_mark_enc=x_mark_enc, x_dec=x_dec, x_mark_dec=x_mark_dec) + return prediction \ No newline at end of file diff --git a/baselines/SegRNN/Electricity.py b/baselines/SegRNN/Electricity.py new file mode 100644 index 0000000..740a584 --- /dev/null +++ b/baselines/SegRNN/Electricity.py @@ -0,0 +1,152 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import SegRNN + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = SegRNN +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "seg_len": 48, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 1, # attn factor + "d_model": 512, # 512 + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. 
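+ # NOTE: SegRNN itself is expected to read only seq_len, pred_len, seg_len, d_model, dropout and enc_in + # (see segrnn_arch.py below); the remaining fields are template filler shared across baseline configs.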
+ "dropout": 0.1, + "freq": 'h', + "use_norm" : False, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0005, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation.
Default: True diff --git a/baselines/SegRNN/arch/Autoformer_EncDec.py b/baselines/SegRNN/arch/Autoformer_EncDec.py new file mode 100644 index 0000000..6fce4bc --- /dev/null +++ b/baselines/SegRNN/arch/Autoformer_EncDec.py @@ -0,0 +1,203 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class my_Layernorm(nn.Module): + """ + Special designed layernorm for the seasonal part + """ + + def __init__(self, channels): + super(my_Layernorm, self).__init__() + self.layernorm = nn.LayerNorm(channels) + + def forward(self, x): + x_hat = self.layernorm(x) + bias = torch.mean(x_hat, dim=1).unsqueeze(1).repeat(1, x.shape[1], 1) + return x_hat - bias + + +class moving_avg(nn.Module): + """ + Moving average block to highlight the trend of time series + """ + + def __init__(self, kernel_size, stride): + super(moving_avg, self).__init__() + self.kernel_size = kernel_size + self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0) + + def forward(self, x): + # padding on the both ends of time series + front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1) + end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1) + x = torch.cat([front, x, end], dim=1) + x = self.avg(x.permute(0, 2, 1)) + x = x.permute(0, 2, 1) + return x + + +class series_decomp(nn.Module): + """ + Series decomposition block + """ + + def __init__(self, kernel_size): + super(series_decomp, self).__init__() + self.moving_avg = moving_avg(kernel_size, stride=1) + + def forward(self, x): + moving_mean = self.moving_avg(x) + res = x - moving_mean + return res, moving_mean + + +class series_decomp_multi(nn.Module): + """ + Multiple Series decomposition block from FEDformer + """ + + def __init__(self, kernel_size): + super(series_decomp_multi, self).__init__() + self.kernel_size = kernel_size + self.series_decomp = [series_decomp(kernel) for kernel in kernel_size] + + def forward(self, x): + moving_mean = [] + res = [] + for func in self.series_decomp: + sea, moving_avg = func(x) + moving_mean.append(moving_avg) + res.append(sea) + + sea = sum(res) / len(res) + moving_mean = sum(moving_mean) / len(moving_mean) + return sea, moving_mean + + +class EncoderLayer(nn.Module): + """ + Autoformer encoder layer with the progressive decomposition architecture + """ + + def __init__(self, attention, d_model, d_ff=None, moving_avg=25, dropout=0.1, activation="relu"): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.attention = attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.decomp1 = series_decomp(moving_avg) + self.decomp2 = series_decomp(moving_avg) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, attn_mask=None): + new_x, attn = self.attention( + x, x, x, + attn_mask=attn_mask + ) + x = x + self.dropout(new_x) + x, _ = self.decomp1(x) + y = x + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + res, _ = self.decomp2(x + y) + return res, attn + + +class Encoder(nn.Module): + """ + Autoformer encoder + """ + + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.norm = norm_layer + + def 
forward(self, x, attn_mask=None): + attns = [] + if self.conv_layers is not None: + for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers): + x, attn = attn_layer(x, attn_mask=attn_mask) + x = conv_layer(x) + attns.append(attn) + x, attn = self.attn_layers[-1](x) + attns.append(attn) + else: + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + + return x, attns + + +class DecoderLayer(nn.Module): + """ + Autoformer decoder layer with the progressive decomposition architecture + """ + + def __init__(self, self_attention, cross_attention, d_model, c_out, d_ff=None, + moving_avg=25, dropout=0.1, activation="relu"): + super(DecoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.decomp1 = series_decomp(moving_avg) + self.decomp2 = series_decomp(moving_avg) + self.decomp3 = series_decomp(moving_avg) + self.dropout = nn.Dropout(dropout) + self.projection = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=3, stride=1, padding=1, + padding_mode='circular', bias=False) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None): + x = x + self.dropout(self.self_attention( + x, x, x, + attn_mask=x_mask + )[0]) + x, trend1 = self.decomp1(x) + x = x + self.dropout(self.cross_attention( + x, cross, cross, + attn_mask=cross_mask + )[0]) + x, trend2 = self.decomp2(x) + y = x + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + x, trend3 = self.decomp3(x + y) + + residual_trend = trend1 + trend2 + trend3 + residual_trend = self.projection(residual_trend.permute(0, 2, 1)).transpose(1, 2) + return x, residual_trend + + +class Decoder(nn.Module): + """ + Autoformer encoder + """ + + def __init__(self, layers, norm_layer=None, projection=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection + + def forward(self, x, cross, x_mask=None, cross_mask=None, trend=None): + for layer in self.layers: + x, residual_trend = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask) + trend = trend + residual_trend + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) + return x, trend diff --git a/baselines/SegRNN/arch/__init__.py b/baselines/SegRNN/arch/__init__.py new file mode 100644 index 0000000..0b5ca8e --- /dev/null +++ b/baselines/SegRNN/arch/__init__.py @@ -0,0 +1 @@ +from .segrnn_arch import SegRNN \ No newline at end of file diff --git a/baselines/SegRNN/arch/segrnn_arch.py b/baselines/SegRNN/arch/segrnn_arch.py new file mode 100644 index 0000000..3769833 --- /dev/null +++ b/baselines/SegRNN/arch/segrnn_arch.py @@ -0,0 +1,83 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from .Autoformer_EncDec import series_decomp + + +class SegRNN(nn.Module): + """ + Paper link: https://arxiv.org/abs/2308.11200.pdf + """ + + def __init__(self, **model_args): + super(SegRNN, self).__init__() + + # get parameters + self.pred_len = model_args['pred_len'] + self.seq_len = model_args['seq_len'] + self.d_model = model_args['d_model'] + 
self.dropout = model_args["dropout"] + self.seg_len = model_args['seg_len'] + self.seg_num_x = self.seq_len // self.seg_len + self.seg_num_y = self.pred_len // self.seg_len + self.enc_in = model_args['enc_in'] + + # building model + self.valueEmbedding = nn.Sequential( + nn.Linear(self.seg_len, self.d_model), + nn.ReLU() + ) + self.rnn = nn.GRU(input_size=self.d_model, hidden_size=self.d_model, num_layers=1, bias=True, + batch_first=True, bidirectional=False) + self.pos_emb = nn.Parameter(torch.randn(self.seg_num_y, self.d_model // 2)) + self.channel_emb = nn.Parameter(torch.randn(self.enc_in, self.d_model // 2)) + + self.predict = nn.Sequential( + nn.Dropout(self.dropout), + nn.Linear(self.d_model, self.seg_len) + ) + + def encoder(self, x): + # b:batch_size c:channel_size s:seq_len s:seq_len + # d:d_model w:seg_len n:seg_num_x m:seg_num_y + batch_size = x.size(0) + + # normalization and permute b,s,c -> b,c,s + seq_last = x[:, -1:, :].detach() + x = (x - seq_last).permute(0, 2, 1) # b,c,s + + # segment and embedding b,c,s -> bc,n,w -> bc,n,d + x = self.valueEmbedding(x.reshape(-1, self.seg_num_x, self.seg_len)) + + # encoding + _, hn = self.rnn(x) # bc,n,d 1,bc,d + + # m,d//2 -> 1,m,d//2 -> c,m,d//2 + # c,d//2 -> c,1,d//2 -> c,m,d//2 + # c,m,d -> cm,1,d -> bcm, 1, d + pos_emb = torch.cat([ + self.pos_emb.unsqueeze(0).repeat(self.enc_in, 1, 1), + self.channel_emb.unsqueeze(1).repeat(1, self.seg_num_y, 1) + ], dim=-1).view(-1, 1, self.d_model).repeat(batch_size,1,1) + + _, hy = self.rnn(pos_emb, hn.repeat(1, 1, self.seg_num_y).view(1, -1, self.d_model)) # bcm,1,d 1,bcm,d + + # 1,bcm,d -> 1,bcm,w -> b,c,s + y = self.predict(hy).view(-1, self.enc_in, self.pred_len) + + # permute and denorm + y = y.permute(0, 2, 1) + seq_last + return y + + def forecast(self, x_enc): + # Encoder + return self.encoder(x_enc) + + + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + x_enc = history_data[:, :, :, 0] + dec_out = self.forecast(x_enc) + return dec_out[:, -self.pred_len:, :].unsqueeze(-1) # [B, L, D] + diff --git a/baselines/SparseTSF/Electricity.py b/baselines/SparseTSF/Electricity.py new file mode 100644 index 0000000..94e01ee --- /dev/null +++ b/baselines/SparseTSF/Electricity.py @@ -0,0 +1,158 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import SparseTSF +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = SparseTSF +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + 
"dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "period_len" : 24, + "factor": 1, # attn factor + "p_hidden_dims": [128, 128], + "p_hidden_layers": 2, + "d_model": 512, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 2, # num of decoder layers + "d_ff": 2048, + "distil": True, + "sigma" : 0.2, + "dropout": 0.1, + "freq": 'h', + "use_norm" : False, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.02, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + 
+############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/SparseTSF/arch/Embed.py b/baselines/SparseTSF/arch/Embed.py new file mode 100644 index 0000000..1202616 --- /dev/null +++ b/baselines/SparseTSF/arch/Embed.py @@ -0,0 +1,234 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import weight_norm +import math + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. + pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + minute_x = self.minute_embed(x[:, :, 4]) if hasattr( + self, 'minute_embed') else 0. 
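+        # x_mark channel order assumed here: [month, day, weekday, hour, minute]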
+        hour_x = self.hour_embed(x[:, :, 3])
+        weekday_x = self.weekday_embed(x[:, :, 2])
+        day_x = self.day_embed(x[:, :, 1])
+        month_x = self.month_embed(x[:, :, 0])
+
+        return hour_x + weekday_x + day_x + month_x + minute_x
+
+
+class TimeFeatureEmbedding(nn.Module):
+    def __init__(self, d_model, embed_type='timeF', freq='h'):
+        super(TimeFeatureEmbedding, self).__init__()
+
+        freq_map = {'h': 4, 't': 5, 's': 6,
+                    'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3}
+        d_inp = freq_map[freq]
+        self.embed = nn.Linear(d_inp, d_model, bias=False)
+
+    def forward(self, x):
+        return self.embed(x)
+
+
+class DataEmbedding(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding, self).__init__()
+        self.c_in = c_in
+        self.d_model = d_model
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        _, _, N = x.size()
+        if N == self.c_in:
+            if x_mark is None:
+                x = self.value_embedding(x) + self.position_embedding(x)
+            else:
+                x = self.value_embedding(
+                    x) + self.temporal_embedding(x_mark) + self.position_embedding(x)
+        elif N == self.d_model:
+            if x_mark is None:
+                x = x + self.position_embedding(x)
+            else:
+                x = x + self.temporal_embedding(x_mark) + self.position_embedding(x)
+
+        return self.dropout(x)
+
+
+class DataEmbedding_ms(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding_ms, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=1, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        B, T, N = x.shape
+        # embed each series independently: [B, T, N] -> [B*N, T, 1] -> [B, T, N, d_model]
+        x1 = self.value_embedding(x.permute(0, 2, 1).reshape(B * N, T).unsqueeze(-1)).reshape(B, N, T, -1).permute(0, 2,
+                                                                                                                   1, 3)
+        if x_mark is None:
+            x = x1
+        else:
+            x = x1 + self.temporal_embedding(x_mark)
+        return self.dropout(x)
+
+
+class DataEmbedding_wo_pos(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding_wo_pos, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        if x is None and x_mark is not None:
+            return self.temporal_embedding(x_mark)
+        if x_mark is None:
+            x = self.value_embedding(x)
+        else:
+            x = self.value_embedding(x) + self.temporal_embedding(x_mark)
+        return self.dropout(x)
+
+
+class PatchEmbedding_crossformer(nn.Module):
+    def __init__(self, d_model, patch_len, stride, padding, dropout):
+        super(PatchEmbedding_crossformer, self).__init__()
+        # Patching
+        self.patch_len = patch_len
+        self.stride = stride
+        self.padding_patch_layer = nn.ReplicationPad1d((0, padding))
+
+        # 
Backbone, Input encoding: projection of feature vectors onto a d-dim vector space + self.value_embedding = nn.Linear(patch_len, d_model, bias=False) + + # Positional embedding + self.position_embedding = PositionalEmbedding(d_model) + + # Residual dropout + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x), n_vars + + +class PatchEmbedding(nn.Module): + def __init__(self, d_model, patch_len, stride, dropout): + super(PatchEmbedding, self).__init__() + # Patching + self.patch_len = patch_len + self.stride = stride + self.padding_patch_layer = nn.ReplicationPad1d((0, stride)) + + # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space + self.value_embedding = TokenEmbedding(patch_len, d_model) + + # Positional embedding + self.position_embedding = PositionalEmbedding(d_model) + + # Residual dropout + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x), n_vars diff --git a/baselines/SparseTSF/arch/__init__.py b/baselines/SparseTSF/arch/__init__.py new file mode 100644 index 0000000..245c6b1 --- /dev/null +++ b/baselines/SparseTSF/arch/__init__.py @@ -0,0 +1 @@ +from .sparsetsf_arch import SparseTSF diff --git a/baselines/SparseTSF/arch/sparsetsf_arch.py b/baselines/SparseTSF/arch/sparsetsf_arch.py new file mode 100644 index 0000000..9afe450 --- /dev/null +++ b/baselines/SparseTSF/arch/sparsetsf_arch.py @@ -0,0 +1,46 @@ +import torch +import torch.nn as nn +from .Embed import PositionalEmbedding + +class SparseTSF(nn.Module): + def __init__(self, **model_args): + super(SparseTSF, self).__init__() + + # get parameters + self.seq_len = model_args['seq_len'] + self.pred_len = model_args['pred_len'] + self.enc_in = model_args['enc_in'] + self.period_len = model_args['period_len'] + + self.seg_num_x = self.seq_len // self.period_len + self.seg_num_y = self.pred_len // self.period_len + + self.conv1d = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=1 + 2 * (self.period_len // 2), + stride=1, padding=self.period_len // 2, padding_mode="zeros", bias=False) + + self.linear = nn.Linear(self.seg_num_x, self.seg_num_y, bias=False) + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + x = history_data[:, :, :, 0] + batch_size = x.shape[0] + # normalization and permute b,s,c -> b,c,s + seq_mean = torch.mean(x, dim=1).unsqueeze(1) + x = (x - seq_mean).permute(0, 2, 1) + + # 1D convolution aggregation + x = self.conv1d(x.reshape(-1, 1, self.seq_len)).reshape(-1, self.enc_in, self.seq_len) + x + + # downsampling: b,c,s -> bc,n,w -> bc,w,n + x = x.reshape(-1, self.seg_num_x, self.period_len).permute(0, 2, 1) + + # sparse forecasting + y = self.linear(x) # bc,w,m + + # upsampling: bc,w,m -> bc,m,w -> b,c,s + y = y.permute(0, 2, 1).reshape(batch_size, self.enc_in, self.pred_len) + + # permute and denorm + y = y.permute(0, 2, 1) + seq_mean + + 
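+        # [B, L, N] -> [B, L, N, 1]: restore the feature dimension the BasicTS runner expects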
return y.unsqueeze(-1) diff --git a/baselines/TiDE/Electricity.py b/baselines/TiDE/Electricity.py new file mode 100644 index 0000000..aff06f7 --- /dev/null +++ b/baselines/TiDE/Electricity.py @@ -0,0 +1,153 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import TiDE + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = TiDE +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "d_model": 256,# 256 + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "bias": True, + "feature_encode_dim": 2, + "e_layers": 2, # num of encoder layers + "d_layers": 2, # num of decoder layers + "d_ff": 256, # 256 + "dropout": 0.3, + "freq": 'h', + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + 
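+# feature channels: 0 = raw series, 1-4 = the four time features listed in MODEL_PARAM (time of day, day of week, day of month, day of year)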
+############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 24 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 24 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 24 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/TiDE/arch/__init__.py b/baselines/TiDE/arch/__init__.py new file mode 100644 index 0000000..2f24eb7 --- /dev/null +++ b/baselines/TiDE/arch/__init__.py @@ -0,0 +1 @@ +from .tide_arch import TiDE \ No newline at end of file diff --git a/baselines/TiDE/arch/tide_arch.py b/baselines/TiDE/arch/tide_arch.py new file mode 100644 index 0000000..3a8f386 --- /dev/null +++ b/baselines/TiDE/arch/tide_arch.py @@ -0,0 +1,118 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from basicts.utils import data_transformation_4_xformer + +class LayerNorm(nn.Module): + """ LayerNorm but with an optional bias. 
PyTorch doesn't support simply bias=False """ + + def __init__(self, ndim, bias): + super().__init__() + self.weight = nn.Parameter(torch.ones(ndim)) + self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None + + def forward(self, input): + return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5) + + +class ResBlock(nn.Module): + def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.1, bias=True): + super().__init__() + + self.fc1 = nn.Linear(input_dim, hidden_dim, bias=bias) + self.fc2 = nn.Linear(hidden_dim, output_dim, bias=bias) + self.fc3 = nn.Linear(input_dim, output_dim, bias=bias) + self.dropout = nn.Dropout(dropout) + self.relu = nn.ReLU() + self.ln = LayerNorm(output_dim, bias=bias) + + def forward(self, x): + out = self.fc1(x) + out = self.relu(out) + out = self.fc2(out) + out = self.dropout(out) + out = out + self.fc3(x) + out = self.ln(out) + return out + + +# TiDE +class TiDE(nn.Module): + """ + paper: https://arxiv.org/pdf/2304.08424.pdf + """ + + def __init__(self, **model_args): + super(TiDE, self).__init__() + + self.seq_len = model_args['seq_len'] # L + self.label_len = int(model_args['label_len']) + self.pred_len = model_args['pred_len'] # H + self.hidden_dim = model_args['d_model'] + self.res_hidden = model_args['d_model'] + self.encoder_num = model_args['e_layers'] + self.decoder_num = model_args['d_layers'] + self.freq = model_args["freq"] + self.bias = model_args["bias"] + self.feature_encode_dim = model_args["feature_encode_dim"] + self.decode_dim = model_args['c_out'] + self.temporalDecoderHidden = model_args['d_ff'] + dropout = model_args["dropout"] + + freq_map = {'h': 4, 't': 5, 's': 6, + 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3} + + self.feature_dim = freq_map[self.freq] + + flatten_dim = self.seq_len + (self.seq_len + self.pred_len) * self.feature_encode_dim + + self.feature_encoder = ResBlock(self.feature_dim, self.res_hidden, self.feature_encode_dim, dropout, self.bias) + self.encoders = nn.Sequential(ResBlock(flatten_dim, self.res_hidden, self.hidden_dim, dropout, self.bias), *( + [ResBlock(self.hidden_dim, self.res_hidden, self.hidden_dim, dropout, self.bias)] * (self.encoder_num - 1))) + + self.decoders = nn.Sequential(*( + [ResBlock(self.hidden_dim, self.res_hidden, self.hidden_dim, dropout, self.bias)] * ( + self.decoder_num - 1)), + ResBlock(self.hidden_dim, self.res_hidden, self.decode_dim * self.pred_len, + dropout, self.bias)) + self.temporalDecoder = ResBlock(self.decode_dim + self.feature_encode_dim, self.temporalDecoderHidden, 1, + dropout, self.bias) + self.residual_proj = nn.Linear(self.seq_len, self.pred_len, bias=self.bias) + + + def forward_xformer(self, x_enc, x_mark_enc, x_dec, batch_y_mark) -> torch.Tensor: + # Normalization + means = x_enc.mean(1, keepdim=True).detach() + x_enc = x_enc - means + stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5) + x_enc /= stdev + + feature = self.feature_encoder(batch_y_mark) + hidden = self.encoders(torch.cat([x_enc, feature.reshape(feature.shape[0], -1)], dim=-1)) + decoded = self.decoders(hidden).reshape(hidden.shape[0], self.pred_len, self.decode_dim) + dec_out = self.temporalDecoder(torch.cat([feature[:, self.seq_len:], decoded], dim=-1)).squeeze( + -1) + self.residual_proj(x_enc) + + # De-Normalization + dec_out = dec_out * (stdev[:, 0].unsqueeze(1).repeat(1, self.pred_len)) + dec_out = dec_out + (means[:, 0].unsqueeze(1).repeat(1, self.pred_len)) + return dec_out + + + + def forward(self, history_data: torch.Tensor, future_data: 
torch.Tensor, batch_seen: int, epoch: int, train: bool,
+                **kwargs) -> torch.Tensor:
+        '''x_mark_enc is the exogenous dynamic feature described in the original paper'''
+        x_enc, x_mark_enc, x_dec, batch_y_mark = data_transformation_4_xformer(history_data=history_data,
+                                                                               future_data=future_data,
+                                                                               start_token_len=0)
+
+        batch_y_mark = torch.concat([x_mark_enc, batch_y_mark[:, -self.pred_len:, :]], dim=1)
+        # TiDE is applied channel-by-channel: one forward pass per series, then restack the channel dim
+        dec_out = torch.stack([self.forward_xformer(x_enc[:, :, feature], x_mark_enc, x_dec, batch_y_mark) for feature in
+                               range(x_enc.shape[-1])], dim=-1)
+        return dec_out.unsqueeze(-1)  # [B, L, N, 1]
+
+
diff --git a/baselines/TimeMixer/Electricity.py b/baselines/TimeMixer/Electricity.py
new file mode 100644
index 0000000..57b0b36
--- /dev/null
+++ b/baselines/TimeMixer/Electricity.py
@@ -0,0 +1,162 @@
+import os
+import sys
+from easydict import EasyDict
+sys.path.append(os.path.abspath(__file__ + '/../../..'))
+from basicts.metrics import masked_mae, masked_mse
+from basicts.data import TimeSeriesForecastingDataset
+from basicts.runners import SimpleTimeSeriesForecastingRunner
+from basicts.scaler import ZScoreScaler
+from basicts.utils import get_regular_settings
+
+from .arch import TimeMixer
+
+############################## Hot Parameters ##############################
+# Dataset & Metrics configuration
+DATA_NAME = 'Electricity'  # Dataset name
+regular_settings = get_regular_settings(DATA_NAME)
+INPUT_LEN = regular_settings['INPUT_LEN']  # Length of input sequence
+OUTPUT_LEN = regular_settings['OUTPUT_LEN']  # Length of output sequence
+TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO']  # Train/Validation/Test split ratios
+NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL']  # Whether to normalize each channel of the data
+RESCALE = regular_settings['RESCALE']  # Whether to rescale the data
+NULL_VAL = regular_settings['NULL_VAL']  # Null value in the data
+# Model architecture and parameters
+MODEL_ARCH = TimeMixer
+NUM_NODES = 321
+MODEL_PARAM = {
+    "enc_in": NUM_NODES,  # num nodes
+    "dec_in": NUM_NODES,
+    "c_out": NUM_NODES,
+    "seq_len": INPUT_LEN,
+    "label_len": INPUT_LEN/2,  # start token length used in decoder
+    "pred_len": OUTPUT_LEN,  # prediction sequence length
+    "factor": 1,  # attn factor
+    "down_sampling_window": 2,
+    "down_sampling_layers": 3,
+    "top_k": 5,
+    "down_sampling_method": 'avg',
+    "channel_independence": True,
+    "d_model": 16,
+    "moving_avg": 25,  # window size of moving average. This is a CRUCIAL hyper-parameter.
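+    # moving_avg is the kernel of series_decomp, used when decomp_method == 'moving_avg'; ignored for 'dft_decomp'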
+ "n_heads": 8, + "e_layers": 3, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 32, + "distil": True, + "sigma" : 0.2, + "dropout": 0.1, + "freq": 'h', + "use_norm" : 0, + "decomp_method" : 'moving_avg', # dft_decomp or moving_avg + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.01, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 16 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters 
+CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/TimeMixer/arch/Autoformer_EncDec.py b/baselines/TimeMixer/arch/Autoformer_EncDec.py new file mode 100644 index 0000000..6fce4bc --- /dev/null +++ b/baselines/TimeMixer/arch/Autoformer_EncDec.py @@ -0,0 +1,203 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class my_Layernorm(nn.Module): + """ + Special designed layernorm for the seasonal part + """ + + def __init__(self, channels): + super(my_Layernorm, self).__init__() + self.layernorm = nn.LayerNorm(channels) + + def forward(self, x): + x_hat = self.layernorm(x) + bias = torch.mean(x_hat, dim=1).unsqueeze(1).repeat(1, x.shape[1], 1) + return x_hat - bias + + +class moving_avg(nn.Module): + """ + Moving average block to highlight the trend of time series + """ + + def __init__(self, kernel_size, stride): + super(moving_avg, self).__init__() + self.kernel_size = kernel_size + self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0) + + def forward(self, x): + # padding on the both ends of time series + front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1) + end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1) + x = torch.cat([front, x, end], dim=1) + x = self.avg(x.permute(0, 2, 1)) + x = x.permute(0, 2, 1) + return x + + +class series_decomp(nn.Module): + """ + Series decomposition block + """ + + def __init__(self, kernel_size): + super(series_decomp, self).__init__() + self.moving_avg = moving_avg(kernel_size, stride=1) + + def forward(self, x): + moving_mean = self.moving_avg(x) + res = x - moving_mean + return res, moving_mean + + +class series_decomp_multi(nn.Module): + """ + Multiple Series decomposition block from FEDformer + """ + + def __init__(self, kernel_size): + super(series_decomp_multi, self).__init__() + self.kernel_size = kernel_size + self.series_decomp = [series_decomp(kernel) for kernel in kernel_size] + + def forward(self, x): + moving_mean = [] + res = [] + for func in self.series_decomp: + sea, moving_avg = func(x) + moving_mean.append(moving_avg) + res.append(sea) + + sea = sum(res) / len(res) + moving_mean = sum(moving_mean) / len(moving_mean) + return sea, moving_mean + + +class EncoderLayer(nn.Module): + """ + Autoformer encoder layer with the progressive decomposition architecture + """ + + def __init__(self, attention, d_model, d_ff=None, moving_avg=25, dropout=0.1, activation="relu"): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.attention = attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.decomp1 = series_decomp(moving_avg) + self.decomp2 = series_decomp(moving_avg) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, attn_mask=None): + new_x, attn = self.attention( + x, x, x, + attn_mask=attn_mask + ) + x = x + self.dropout(new_x) + x, _ = self.decomp1(x) + y = x + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + res, _ = self.decomp2(x + y) + return res, attn + + +class Encoder(nn.Module): + """ + Autoformer encoder + """ + + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + 
self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.norm = norm_layer + + def forward(self, x, attn_mask=None): + attns = [] + if self.conv_layers is not None: + for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers): + x, attn = attn_layer(x, attn_mask=attn_mask) + x = conv_layer(x) + attns.append(attn) + x, attn = self.attn_layers[-1](x) + attns.append(attn) + else: + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + + return x, attns + + +class DecoderLayer(nn.Module): + """ + Autoformer decoder layer with the progressive decomposition architecture + """ + + def __init__(self, self_attention, cross_attention, d_model, c_out, d_ff=None, + moving_avg=25, dropout=0.1, activation="relu"): + super(DecoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.decomp1 = series_decomp(moving_avg) + self.decomp2 = series_decomp(moving_avg) + self.decomp3 = series_decomp(moving_avg) + self.dropout = nn.Dropout(dropout) + self.projection = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=3, stride=1, padding=1, + padding_mode='circular', bias=False) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None): + x = x + self.dropout(self.self_attention( + x, x, x, + attn_mask=x_mask + )[0]) + x, trend1 = self.decomp1(x) + x = x + self.dropout(self.cross_attention( + x, cross, cross, + attn_mask=cross_mask + )[0]) + x, trend2 = self.decomp2(x) + y = x + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + x, trend3 = self.decomp3(x + y) + + residual_trend = trend1 + trend2 + trend3 + residual_trend = self.projection(residual_trend.permute(0, 2, 1)).transpose(1, 2) + return x, residual_trend + + +class Decoder(nn.Module): + """ + Autoformer encoder + """ + + def __init__(self, layers, norm_layer=None, projection=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection + + def forward(self, x, cross, x_mask=None, cross_mask=None, trend=None): + for layer in self.layers: + x, residual_trend = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask) + trend = trend + residual_trend + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) + return x, trend diff --git a/baselines/TimeMixer/arch/Embed.py b/baselines/TimeMixer/arch/Embed.py new file mode 100644 index 0000000..1202616 --- /dev/null +++ b/baselines/TimeMixer/arch/Embed.py @@ -0,0 +1,234 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import weight_norm +import math + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. 
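+        # pe[pos, 2i] = sin(pos / 10000^(2i/d_model)), pe[pos, 2i+1] = cos(pos / 10000^(2i/d_model))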
+ pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + minute_x = self.minute_embed(x[:, :, 4]) if hasattr( + self, 'minute_embed') else 0. 
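+        # x_mark channel order assumed here: [month, day, weekday, hour, minute]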
+        hour_x = self.hour_embed(x[:, :, 3])
+        weekday_x = self.weekday_embed(x[:, :, 2])
+        day_x = self.day_embed(x[:, :, 1])
+        month_x = self.month_embed(x[:, :, 0])
+
+        return hour_x + weekday_x + day_x + month_x + minute_x
+
+
+class TimeFeatureEmbedding(nn.Module):
+    def __init__(self, d_model, embed_type='timeF', freq='h'):
+        super(TimeFeatureEmbedding, self).__init__()
+
+        freq_map = {'h': 4, 't': 5, 's': 6,
+                    'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3}
+        d_inp = freq_map[freq]
+        self.embed = nn.Linear(d_inp, d_model, bias=False)
+
+    def forward(self, x):
+        return self.embed(x)
+
+
+class DataEmbedding(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding, self).__init__()
+        self.c_in = c_in
+        self.d_model = d_model
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        _, _, N = x.size()
+        if N == self.c_in:
+            if x_mark is None:
+                x = self.value_embedding(x) + self.position_embedding(x)
+            else:
+                x = self.value_embedding(
+                    x) + self.temporal_embedding(x_mark) + self.position_embedding(x)
+        elif N == self.d_model:
+            if x_mark is None:
+                x = x + self.position_embedding(x)
+            else:
+                x = x + self.temporal_embedding(x_mark) + self.position_embedding(x)
+
+        return self.dropout(x)
+
+
+class DataEmbedding_ms(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding_ms, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=1, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        B, T, N = x.shape
+        # embed each series independently: [B, T, N] -> [B*N, T, 1] -> [B, T, N, d_model]
+        x1 = self.value_embedding(x.permute(0, 2, 1).reshape(B * N, T).unsqueeze(-1)).reshape(B, N, T, -1).permute(0, 2,
+                                                                                                                   1, 3)
+        if x_mark is None:
+            x = x1
+        else:
+            x = x1 + self.temporal_embedding(x_mark)
+        return self.dropout(x)
+
+
+class DataEmbedding_wo_pos(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding_wo_pos, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        if x is None and x_mark is not None:
+            return self.temporal_embedding(x_mark)
+        if x_mark is None:
+            x = self.value_embedding(x)
+        else:
+            x = self.value_embedding(x) + self.temporal_embedding(x_mark)
+        return self.dropout(x)
+
+
+class PatchEmbedding_crossformer(nn.Module):
+    def __init__(self, d_model, patch_len, stride, padding, dropout):
+        super(PatchEmbedding_crossformer, self).__init__()
+        # Patching
+        self.patch_len = patch_len
+        self.stride = stride
+        self.padding_patch_layer = nn.ReplicationPad1d((0, padding))
+
+        # 
Backbone, Input encoding: projection of feature vectors onto a d-dim vector space + self.value_embedding = nn.Linear(patch_len, d_model, bias=False) + + # Positional embedding + self.position_embedding = PositionalEmbedding(d_model) + + # Residual dropout + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x), n_vars + + +class PatchEmbedding(nn.Module): + def __init__(self, d_model, patch_len, stride, dropout): + super(PatchEmbedding, self).__init__() + # Patching + self.patch_len = patch_len + self.stride = stride + self.padding_patch_layer = nn.ReplicationPad1d((0, stride)) + + # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space + self.value_embedding = TokenEmbedding(patch_len, d_model) + + # Positional embedding + self.position_embedding = PositionalEmbedding(d_model) + + # Residual dropout + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x), n_vars diff --git a/baselines/TimeMixer/arch/StandardNorm.py b/baselines/TimeMixer/arch/StandardNorm.py new file mode 100644 index 0000000..990d0fd --- /dev/null +++ b/baselines/TimeMixer/arch/StandardNorm.py @@ -0,0 +1,68 @@ +import torch +import torch.nn as nn + + +class Normalize(nn.Module): + def __init__(self, num_features: int, eps=1e-5, affine=False, subtract_last=False, non_norm=False): + """ + :param num_features: the number of features or channels + :param eps: a value added for numerical stability + :param affine: if True, RevIN has learnable affine parameters + """ + super(Normalize, self).__init__() + self.num_features = num_features + self.eps = eps + self.affine = affine + self.subtract_last = subtract_last + self.non_norm = non_norm + if self.affine: + self._init_params() + + def forward(self, x, mode: str): + if mode == 'norm': + self._get_statistics(x) + x = self._normalize(x) + elif mode == 'denorm': + x = self._denormalize(x) + else: + raise NotImplementedError + return x + + def _init_params(self): + # initialize RevIN params: (C,) + self.affine_weight = nn.Parameter(torch.ones(self.num_features)) + self.affine_bias = nn.Parameter(torch.zeros(self.num_features)) + + def _get_statistics(self, x): + dim2reduce = tuple(range(1, x.ndim - 1)) + if self.subtract_last: + self.last = x[:, -1, :].unsqueeze(1) + else: + self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach() + self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach() + + def _normalize(self, x): + if self.non_norm: + return x + if self.subtract_last: + x = x - self.last + else: + x = x - self.mean + x = x / self.stdev + if self.affine: + x = x * self.affine_weight + x = x + self.affine_bias + return x + + def _denormalize(self, x): + if self.non_norm: + return x + if self.affine: + x = x - self.affine_bias + x = x / (self.affine_weight + self.eps * self.eps) + x = x * self.stdev + if self.subtract_last: + x = x + self.last + else: + x = x + 
self.mean
+        return x
diff --git a/baselines/TimeMixer/arch/__init__.py b/baselines/TimeMixer/arch/__init__.py
new file mode 100644
index 0000000..1495113
--- /dev/null
+++ b/baselines/TimeMixer/arch/__init__.py
@@ -0,0 +1 @@
+from .timemixer_arch import TimeMixer
\ No newline at end of file
diff --git a/baselines/TimeMixer/arch/timemixer_arch.py b/baselines/TimeMixer/arch/timemixer_arch.py
new file mode 100644
index 0000000..11505bc
--- /dev/null
+++ b/baselines/TimeMixer/arch/timemixer_arch.py
@@ -0,0 +1,419 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .Autoformer_EncDec import series_decomp
+from .Embed import DataEmbedding_wo_pos
+from .StandardNorm import Normalize
+from basicts.utils import data_transformation_4_xformer
+
+class DFT_series_decomp(nn.Module):
+    """
+    Series decomposition block
+    """
+
+    def __init__(self, top_k=5):
+        super(DFT_series_decomp, self).__init__()
+        self.top_k = top_k
+
+    def forward(self, x):
+        xf = torch.fft.rfft(x)
+        freq = abs(xf)
+        freq[0] = 0
+        # keep only the top_k strongest frequencies as the seasonal part
+        top_k_freq, top_list = torch.topk(freq, self.top_k)
+        xf[freq <= top_k_freq.min()] = 0
+        x_season = torch.fft.irfft(xf)
+        x_trend = x - x_season
+        return x_season, x_trend
+
+
+class MultiScaleSeasonMixing(nn.Module):
+    """
+    Bottom-up mixing season pattern
+    """
+
+    def __init__(self, **model_args):
+        super(MultiScaleSeasonMixing, self).__init__()
+        self.seq_len = model_args['seq_len']
+        self.down_sampling_window = model_args['down_sampling_window']
+        self.down_sampling_layers = model_args['down_sampling_layers']
+        self.down_sampling_layers = torch.nn.ModuleList(
+            [
+                nn.Sequential(
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** i),
+                        self.seq_len // (self.down_sampling_window ** (i + 1)),
+                    ),
+                    nn.GELU(),
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** (i + 1)),
+                        self.seq_len // (self.down_sampling_window ** (i + 1)),
+                    ),
+
+                )
+                for i in range(self.down_sampling_layers)
+            ]
+        )
+
+    def forward(self, season_list):
+
+        # mixing high->low
+        out_high = season_list[0]
+        out_low = season_list[1]
+        out_season_list = [out_high.permute(0, 2, 1)]
+
+        for i in range(len(season_list) - 1):
+            out_low_res = self.down_sampling_layers[i](out_high)
+            out_low = out_low + out_low_res
+            out_high = out_low
+            if i + 2 <= len(season_list) - 1:
+                out_low = season_list[i + 2]
+            out_season_list.append(out_high.permute(0, 2, 1))
+
+        return out_season_list
+
+
+class MultiScaleTrendMixing(nn.Module):
+    """
+    Top-down mixing trend pattern
+    """
+
+    def __init__(self, **model_args):
+        super(MultiScaleTrendMixing, self).__init__()
+        self.seq_len = model_args['seq_len']
+        self.down_sampling_window = model_args['down_sampling_window']
+        self.down_sampling_layers = model_args['down_sampling_layers']
+
+        self.up_sampling_layers = torch.nn.ModuleList(
+            [
+                nn.Sequential(
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** (i + 1)),
+                        self.seq_len // (self.down_sampling_window ** i),
+                    ),
+                    nn.GELU(),
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** i),
+                        self.seq_len // (self.down_sampling_window ** i),
+                    ),
+                )
+                for i in reversed(range(self.down_sampling_layers))
+            ])
+
+    def forward(self, trend_list):
+
+        # mixing low->high
+        trend_list_reverse = trend_list.copy()
+        trend_list_reverse.reverse()
+        out_low = trend_list_reverse[0]
+        out_high = trend_list_reverse[1]
+        out_trend_list = [out_low.permute(0, 2, 1)]
+
+        for i in range(len(trend_list_reverse) - 1):
+            out_high_res = self.up_sampling_layers[i](out_low)
+            out_high = out_high + out_high_res
+class MultiScaleTrendMixing(nn.Module):
+    """
+    Top-down mixing trend pattern
+    """
+
+    def __init__(self, **model_args):
+        super(MultiScaleTrendMixing, self).__init__()
+        self.seq_len = model_args['seq_len']
+        self.down_sampling_window = model_args['down_sampling_window']
+        self.down_sampling_layers = model_args['down_sampling_layers']
+
+        self.up_sampling_layers = torch.nn.ModuleList(
+            [
+                nn.Sequential(
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** (i + 1)),
+                        self.seq_len // (self.down_sampling_window ** i),
+                    ),
+                    nn.GELU(),
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** i),
+                        self.seq_len // (self.down_sampling_window ** i),
+                    ),
+                )
+                for i in reversed(range(self.down_sampling_layers))
+            ])
+
+    def forward(self, trend_list):
+
+        # mixing low->high
+        trend_list_reverse = trend_list.copy()
+        trend_list_reverse.reverse()
+        out_low = trend_list_reverse[0]
+        out_high = trend_list_reverse[1]
+        out_trend_list = [out_low.permute(0, 2, 1)]
+
+        for i in range(len(trend_list_reverse) - 1):
+            out_high_res = self.up_sampling_layers[i](out_low)
+            out_high = out_high + out_high_res
+            out_low = out_high
+            if i + 2 <= len(trend_list_reverse) - 1:
+                out_high = trend_list_reverse[i + 2]
+            out_trend_list.append(out_low.permute(0, 2, 1))
+
+        out_trend_list.reverse()
+        return out_trend_list
+
+
+class PastDecomposableMixing(nn.Module):
+    def __init__(self, **model_args):
+        super(PastDecomposableMixing, self).__init__()
+        self.seq_len = model_args['seq_len']
+        self.pred_len = model_args['pred_len']
+        self.moving_avg = model_args['moving_avg']
+        self.down_sampling_window = model_args['down_sampling_window']
+        self.down_sampling_layers = model_args['down_sampling_layers']
+        self.channel_independence = model_args['channel_independence']
+        self.d_model = model_args['d_model']
+        self.d_ff = model_args['d_ff']
+        self.dp = model_args['dropout']
+
+        self.layer_norm = nn.LayerNorm(self.d_model)
+        self.dropout = nn.Dropout(self.dp)
+        self.decomp_method = model_args['decomp_method']
+        self.top_k = model_args['top_k']
+        if self.decomp_method == 'moving_avg':
+            self.decomposition = series_decomp(self.moving_avg)
+        elif self.decomp_method == "dft_decomp":
+            self.decomposition = DFT_series_decomp(self.top_k)
+        else:
+            raise ValueError(f'unknown decomposition method: {self.decomp_method}')
+
+        if not self.channel_independence:
+            self.cross_layer = nn.Sequential(
+                nn.Linear(in_features=self.d_model, out_features=self.d_ff),
+                nn.GELU(),
+                nn.Linear(in_features=self.d_ff, out_features=self.d_model),
+            )
+
+        # Mixing season
+        self.mixing_multi_scale_season = MultiScaleSeasonMixing(**model_args)
+
+        # Mixing trend
+        self.mixing_multi_scale_trend = MultiScaleTrendMixing(**model_args)
+
+        self.out_cross_layer = nn.Sequential(
+            nn.Linear(in_features=self.d_model, out_features=self.d_ff),
+            nn.GELU(),
+            nn.Linear(in_features=self.d_ff, out_features=self.d_model),
+        )
+
+    def forward(self, x_list):
+        length_list = []
+        for x in x_list:
+            _, T, _ = x.size()
+            length_list.append(T)
+
+        # Decompose to obtain the season and trend
+        season_list = []
+        trend_list = []
+        for x in x_list:
+            season, trend = self.decomposition(x)
+            if not self.channel_independence:
+                season = self.cross_layer(season)
+                trend = self.cross_layer(trend)
+            season_list.append(season.permute(0, 2, 1))
+            trend_list.append(trend.permute(0, 2, 1))
+
+        # bottom-up season mixing
+        out_season_list = self.mixing_multi_scale_season(season_list)
+        # top-down trend mixing
+        out_trend_list = self.mixing_multi_scale_trend(trend_list)
+
+        out_list = []
+        for ori, out_season, out_trend, length in zip(x_list, out_season_list, out_trend_list,
+                                                      length_list):
+            out = out_season + out_trend
+            if self.channel_independence:
+                out = ori + self.out_cross_layer(out)
+            out_list.append(out[:, :length, :])
+        return out_list
+
+
+class TimeMixer(nn.Module):
+
+    def __init__(self, **model_args):
+        super(TimeMixer, self).__init__()
+
+        self.seq_len = model_args['seq_len']
+        self.pred_len = model_args['pred_len']
+        self.e_layers = model_args['e_layers']
+        self.moving_avg = model_args['moving_avg']
+        self.down_sampling_window = model_args['down_sampling_window']
+        self.down_sampling_layers = model_args['down_sampling_layers']
+        self.channel_independence = model_args['channel_independence']
+        self.d_model = model_args['d_model']
+        self.dec_in = model_args['dec_in']
+        self.enc_in = model_args['enc_in']
+        self.c_out = model_args['c_out']
+        self.freq = model_args['freq']
+        self.dropout = model_args['dropout']
+        self.pdm_blocks = nn.ModuleList([PastDecomposableMixing(**model_args)
+                                         for _ in range(self.e_layers)])
+        self.down_sampling_method = model_args["down_sampling_method"]
+
self.preprocess = series_decomp(self.moving_avg) + self.embed = model_args['embed'] + self.enc_in = model_args['enc_in'] + self.use_norm = model_args['use_norm'] + if self.channel_independence: + self.enc_embedding = DataEmbedding_wo_pos(1, self.d_model, self.embed, self.freq, + self.dropout) + else: + self.enc_embedding = DataEmbedding_wo_pos(self.enc_in, self.d_model, self.embed, self.freq, + self.dropout) + + self.layer = self.e_layers + + self.normalize_layers = torch.nn.ModuleList( + [ + Normalize(self.enc_in, affine=True, non_norm=True if self.use_norm == 0 else False) + for i in range(self.down_sampling_layers + 1) + ] + ) + + + self.predict_layers = torch.nn.ModuleList( + [ + torch.nn.Linear( + self.seq_len // (self.down_sampling_window ** i), + self.pred_len, + ) + for i in range(self.down_sampling_layers + 1) + ] + ) + + if self.channel_independence: + self.projection_layer = nn.Linear( + self.d_model, 1, bias=True) + else: + self.projection_layer = nn.Linear( + self.d_model, self.c_out, bias=True) + + self.out_res_layers = torch.nn.ModuleList( + [ + torch.nn.Linear( + self.seq_len // (self.down_sampling_window ** i), + self.seq_len // (self.down_sampling_window ** i), + ) + for i in range(self.down_sampling_layers + 1) + ] + ) + + self.regression_layers = torch.nn.ModuleList( + [ + torch.nn.Linear( + self.seq_len // (self.down_sampling_window ** i), + self.pred_len, + ) + for i in range(self.down_sampling_layers + 1) + ] + ) + + + + def out_projection(self, dec_out, i, out_res): + dec_out = self.projection_layer(dec_out) + out_res = out_res.permute(0, 2, 1) + out_res = self.out_res_layers[i](out_res) + out_res = self.regression_layers[i](out_res).permute(0, 2, 1) + dec_out = dec_out + out_res + return dec_out + + def pre_enc(self, x_list): + if self.channel_independence: + return (x_list, None) + else: + out1_list = [] + out2_list = [] + for x in x_list: + x_1, x_2 = self.preprocess(x) + out1_list.append(x_1) + out2_list.append(x_2) + return (out1_list, out2_list) + + def __multi_scale_process_inputs(self, x_enc, x_mark_enc): + if self.down_sampling_method == 'max': + down_pool = torch.nn.MaxPool1d(self.down_sampling_window, return_indices=False) + elif self.down_sampling_method == 'avg': + down_pool = torch.nn.AvgPool1d(self.down_sampling_window) + elif self.down_sampling_method == 'conv': + padding = 1 if torch.__version__ >= '1.5.0' else 2 + down_pool = nn.Conv1d(in_channels=self.enc_in, out_channels=self.enc_in, + kernel_size=3, padding=padding, + stride=self.down_sampling_window, + padding_mode='circular', + bias=False) + else: + return x_enc, x_mark_enc + # B,T,C -> B,C,T + x_enc = x_enc.permute(0, 2, 1) + + x_enc_ori = x_enc + x_mark_enc_mark_ori = x_mark_enc + + x_enc_sampling_list = [] + x_mark_sampling_list = [] + x_enc_sampling_list.append(x_enc.permute(0, 2, 1)) + x_mark_sampling_list.append(x_mark_enc) + + for i in range(self.down_sampling_layers): + x_enc_sampling = down_pool(x_enc_ori) + + x_enc_sampling_list.append(x_enc_sampling.permute(0, 2, 1)) + x_enc_ori = x_enc_sampling + + if x_mark_enc is not None: + x_mark_sampling_list.append(x_mark_enc_mark_ori[:, ::self.down_sampling_window, :]) + x_mark_enc_mark_ori = x_mark_enc_mark_ori[:, ::self.down_sampling_window, :] + + x_enc = x_enc_sampling_list + x_mark_enc = x_mark_sampling_list if x_mark_enc is not None else None + + return x_enc, x_mark_enc + + def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec): + + x_enc, x_mark_enc = self.__multi_scale_process_inputs(x_enc, x_mark_enc) + + x_list = [] + 
x_mark_list = [] + if x_mark_enc is not None: + for i, x, x_mark in zip(range(len(x_enc)), x_enc, x_mark_enc): + B, T, N = x.size() + x = self.normalize_layers[i](x, 'norm') + if self.channel_independence: + x = x.permute(0, 2, 1).contiguous().reshape(B * N, T, 1) + x_list.append(x) + x_mark = x_mark.repeat(N, 1, 1) + x_mark_list.append(x_mark) + else: + x_list.append(x) + x_mark_list.append(x_mark) + else: + for i, x in zip(range(len(x_enc)), x_enc, ): + B, T, N = x.size() + x = self.normalize_layers[i](x, 'norm') + if self.channel_independence: + x = x.permute(0, 2, 1).contiguous().reshape(B * N, T, 1) + x_list.append(x) + + # embedding + enc_out_list = [] + x_list = self.pre_enc(x_list) + if x_mark_enc is not None: + for i, x, x_mark in zip(range(len(x_list[0])), x_list[0], x_mark_list): + enc_out = self.enc_embedding(x, x_mark) # [B,T,C] + enc_out_list.append(enc_out) + else: + for i, x in zip(range(len(x_list[0])), x_list[0]): + enc_out = self.enc_embedding(x, None) # [B,T,C] + enc_out_list.append(enc_out) + + # Past Decomposable Mixing as encoder for past + for i in range(self.layer): + enc_out_list = self.pdm_blocks[i](enc_out_list) + + # Future Multipredictor Mixing as decoder for future + dec_out_list = self.future_multi_mixing(B, enc_out_list, x_list) + + dec_out = torch.stack(dec_out_list, dim=-1).sum(-1) + dec_out = self.normalize_layers[0](dec_out, 'denorm') + return dec_out + + def future_multi_mixing(self, B, enc_out_list, x_list): + dec_out_list = [] + if self.channel_independence: + x_list = x_list[0] + for i, enc_out in zip(range(len(x_list)), enc_out_list): + dec_out = self.predict_layers[i](enc_out.permute(0, 2, 1)).permute( + 0, 2, 1) # align temporal dimension + dec_out = self.projection_layer(dec_out) + dec_out = dec_out.reshape(B, self.c_out, self.pred_len).permute(0, 2, 1).contiguous() + dec_out_list.append(dec_out) + + else: + for i, enc_out, out_res in zip(range(len(x_list[0])), enc_out_list, x_list[1]): + dec_out = self.predict_layers[i](enc_out.permute(0, 2, 1)).permute( + 0, 2, 1) # align temporal dimension + dec_out = self.out_projection(dec_out, i, out_res) + dec_out_list.append(dec_out) + + return dec_out_list + + + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + + x_enc, x_mark_enc, x_dec, x_mark_dec = data_transformation_4_xformer(history_data=history_data, + future_data=future_data, + start_token_len=0) + + dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec) + return dec_out.unsqueeze(-1) + diff --git a/baselines/UMixer/Electricity.py b/baselines/UMixer/Electricity.py new file mode 100644 index 0000000..fb7ce10 --- /dev/null +++ b/baselines/UMixer/Electricity.py @@ -0,0 +1,159 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import UMixer + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = 
regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = UMixer +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 3, # attn factor + "p_hidden_dims": [128, 128], + "p_hidden_layers": 2, + "d_model": 32, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 32, + "stride": 8, + "patch_len": 16, + "distil": True, + "sigma" : 0.2, + "dropout": 0.1, + "freq": 'h', + "use_norm" : False, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = 
"MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/UMixer/arch/Embed.py b/baselines/UMixer/arch/Embed.py new file mode 100644 index 0000000..1a1c95b --- /dev/null +++ b/baselines/UMixer/arch/Embed.py @@ -0,0 +1,230 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import weight_norm +import math + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. + pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = 
x.long()
+        minute_x = self.minute_embed(x[:, :, 4]) if hasattr(
+            self, 'minute_embed') else 0.
+        hour_x = self.hour_embed(x[:, :, 3])
+        weekday_x = self.weekday_embed(x[:, :, 2])
+        day_x = self.day_embed(x[:, :, 1])
+        month_x = self.month_embed(x[:, :, 0])
+
+        return hour_x + weekday_x + day_x + month_x + minute_x
+
+
+class TimeFeatureEmbedding(nn.Module):
+    def __init__(self, d_model, embed_type='timeF', freq='h'):
+        super(TimeFeatureEmbedding, self).__init__()
+
+        freq_map = {'h': 4, 't': 5, 's': 6,
+                    'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3}
+        d_inp = freq_map[freq]
+        self.embed = nn.Linear(d_inp, d_model, bias=False)
+
+    def forward(self, x):
+        return self.embed(x)
+
+
+class DataEmbedding(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        if x_mark is None:
+            x = self.value_embedding(x) + self.position_embedding(x)
+        else:
+            x = self.value_embedding(
+                x) + self.temporal_embedding(x_mark) + self.position_embedding(x)
+        return self.dropout(x)
+
+
+class TokenEmbed(nn.Module):
+    def __init__(self, d_model, embed_type='fixed', freq='h'):
+        super(TokenEmbed, self).__init__()
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+
+    def forward(self, x_mark):
+        return self.temporal_embedding(x_mark)
+
+
+class DataEmbedding_wo_pos(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding_wo_pos, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        if x_mark is None:
+            x = self.value_embedding(x)
+        else:
+            x = self.value_embedding(x) + self.temporal_embedding(x_mark)
+        return self.dropout(x)
+
+
+class PatchEmbedding(nn.Module):
+    def __init__(self, d_model, patch_len, stride, dropout):
+        super(PatchEmbedding, self).__init__()
+        # Patching
+        self.patch_len = patch_len
+        self.stride = stride
+        self.padding_patch_layer = nn.ReplicationPad1d((0, stride))
+
+        # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space
+        self.value_embedding = TokenEmbedding(patch_len, d_model)
+
+        # Positional embedding
+        self.position_embedding = PositionalEmbedding(d_model)
+
+        # Residual dropout
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        # do patching
+        n_vars = x.shape[1]
+        x = self.padding_patch_layer(x)
+        x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride)  # slide a window of size patch_len with step stride over the last dimension
+        x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3]))
+        # Input encoding
+        x = self.value_embedding(x) + self.position_embedding(x)
+        return self.dropout(x), n_vars
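For reference, the patch count these modules produce: ReplicationPad1d((0, stride)) extends the last dimension by stride, and unfold then yields (L + stride - patch_len) // stride + 1 windows. A minimal sketch with illustrative sizes (not taken from the configs):

import torch
import torch.nn as nn

B, n_vars, L, patch_len, stride = 2, 7, 96, 16, 8
x = torch.randn(B, n_vars, L)
x = nn.ReplicationPad1d((0, stride))(x)                    # [2, 7, 104]
x = x.unfold(dimension=-1, size=patch_len, step=stride)    # [2, 7, 12, 16]
print(x.shape)                                             # (104 - 16) // 8 + 1 = 12 patches per variable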
+class PosiEmbed(nn.Module):
+    def __init__(self, d_model, patch_len, stride):
+        super(PosiEmbed, self).__init__()
+        self.patch_len = patch_len
+        self.stride = stride
+        self.padding_patch_layer = nn.ReplicationPad1d((0, stride))
+        self.position_embedding = PositionalEmbedding(d_model)
+
+    def forward(self, x):
+        # do patching
+        x = self.padding_patch_layer(x)
+        x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride)  # slide a window of size patch_len with step stride over the last dimension
+        x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3]))
+        # Input encoding
+        x = self.position_embedding(x)
+        return x
+
+
+class PatchEmbedding_wopos(nn.Module):
+    def __init__(self, d_model, patch_len, stride, dropout):
+        super(PatchEmbedding_wopos, self).__init__()
+        # Patching
+        self.patch_len = patch_len
+        self.stride = stride
+        self.padding_patch_layer = nn.ReplicationPad1d((0, stride))
+
+        # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space
+        self.value_embedding = TokenEmbedding(patch_len, d_model)
+
+        # Positional embedding
+        # self.position_embedding = PositionalEmbedding(d_model)
+
+        # Residual dropout
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        # do patching
+        n_vars = x.shape[1]
+        x = self.padding_patch_layer(x)
+        x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride)  # slide a window of size patch_len with step stride over the last dimension
+        x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3]))
+        # Input encoding
+        x = self.value_embedding(x)
+        return self.dropout(x), n_vars
\ No newline at end of file
diff --git a/baselines/UMixer/arch/RevIN.py b/baselines/UMixer/arch/RevIN.py
new file mode 100644
index 0000000..f780f3c
--- /dev/null
+++ b/baselines/UMixer/arch/RevIN.py
@@ -0,0 +1,103 @@
+import torch
+import torch.nn as nn
+
+
+class RevIN(nn.Module):
+    def __init__(self, num_features: int, eps=1e-5, affine=True):
+        """
+        :param num_features: the number of features or channels
+        :param eps: a value added for numerical stability
+        :param affine: if True, RevIN has learnable affine parameters
+        """
+        super(RevIN, self).__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.affine = affine
+        if self.affine:
+            self._init_params()
+
+    def forward(self, x, mode: str):
+        if mode == 'norm':
+            self._get_statistics(x)
+            x = self._normalize(x)
+        elif mode == 'denorm':
+            x = self._denormalize(x)
+        else: raise NotImplementedError
+        return x
+
+    def _init_params(self):
+        # initialize RevIN params: (C,)
+        self.affine_weight = nn.Parameter(torch.ones(self.num_features))
+        self.affine_bias = nn.Parameter(torch.zeros(self.num_features))
+
+    def _get_statistics(self, x):
+        dim2reduce = tuple(range(1, x.ndim-1))
+        self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach()
+        self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach()
+
+    def _normalize(self, x):
+        x = x - self.mean
+        x = x / self.stdev
+        if self.affine:
+            x = x * self.affine_weight
+            x = x + self.affine_bias
+        return x
+
+    def _denormalize(self, x):
+        if self.affine:
+            x = x - self.affine_bias
+            x = x / (self.affine_weight + self.eps*self.eps)
+        x = x * self.stdev
+        x = x + self.mean
+        return x
+
+#
+# class RevIN_loc(nn.Module):
+#     def __init__(self, num_features: int, eps=1e-5, affine=True):
+#         """
+#         :param num_features: the number of features or channels
+#         :param eps: a value added for numerical stability
+#         :param affine: if True, RevIN has learnable affine parameters
+#         """
+#         super(RevIN_loc, self).__init__()
+#         self.num_features_l =
num_features +# self.eps_l = eps +# self.affine_l = affine +# if self.affine_l: +# self._init_params_l() +# +# def forward(self, x, mode:str): +# if mode == 'norm': +# self._get_statistics_l(x) +# x = self._normalize_l(x) +# elif mode == 'denorm': +# x = self._denormalize_l(x) +# else: raise NotImplementedError +# return x +# +# def _init_params_l(self): +# # initialize RevIN params: (C,) +# self.affine_weight_l = nn.Parameter(torch.ones(self.num_features_l)) +# self.affine_bias_l = nn.Parameter(torch.zeros(self.num_features_l)) +# +# def _get_statistics_l(self, x): +# dim2reduce_l = tuple(range(1, x.ndim-1)) +# self.mean_l = torch.mean(x, dim=dim2reduce_l, keepdim=True).detach() +# self.stdev_l = torch.sqrt(torch.var(x, dim=dim2reduce_l, keepdim=True, unbiased=False) + self.eps_l).detach() +# +# def _normalize_l(self, x): +# x = x - self.mean_l +# x = x / self.stdev_l +# if self.affine_l: +# x = x * self.affine_weight_l +# x = x + self.affine_bias_l +# return x +# +# def _denormalize_l(self, x): +# if self.affine_l: +# x = x - self.affine_bias_l +# x = x / (self.affine_weight_l + self.eps_l*self.eps_l) +# x = x * self.stdev_l +# x = x + self.mean_l +# return x +# diff --git a/baselines/UMixer/arch/__init__.py b/baselines/UMixer/arch/__init__.py new file mode 100644 index 0000000..9bc84af --- /dev/null +++ b/baselines/UMixer/arch/__init__.py @@ -0,0 +1 @@ +from .umixer_arch import UMixer \ No newline at end of file diff --git a/baselines/UMixer/arch/umixer_arch.py b/baselines/UMixer/arch/umixer_arch.py new file mode 100644 index 0000000..9396911 --- /dev/null +++ b/baselines/UMixer/arch/umixer_arch.py @@ -0,0 +1,234 @@ +import torch +import torch.nn as nn +import torch.fft +from .Embed import PatchEmbedding +from .RevIN import RevIN + + +def S_Correction(x, x_pre): + x_fft = torch.fft.rfft(x,dim=1,norm='ortho') + x_pre_fft = torch.fft.rfft(x_pre, dim=1, norm='ortho') + x_fft = x_fft * torch.conj(x_fft) + x_pre_fft = x_pre_fft * torch.conj(x_pre_fft) + x_ifft = torch.fft.irfft(x_fft, dim=1) # + x_pre_ifft = torch.fft.irfft(x_pre_fft, dim=1) + x_ifft = torch.clamp(x_ifft,min=0) + x_pre_ifft = torch.clamp(x_pre_ifft,min=0) + alpha = torch.sum(x_ifft*x_pre_ifft,dim=1,keepdim=True)/(torch.sum(x_pre_ifft*x_pre_ifft,dim=1,keepdim=True)+0.001) + #alpha = (x_ifft * x_pre_ifft) / (x_pre_ifft * x_pre_ifft + 0.001) + return torch.sqrt(alpha) + + +class Flatten_Head(nn.Module): + def __init__(self, n_vars, nf, target_window, head_dropout=0): + super().__init__() + self.n_vars = n_vars + self.flatten = nn.Flatten(start_dim=-2) + self.linear = nn.Linear(nf, target_window) + self.dropout = nn.Dropout(head_dropout) + + def forward(self, x): # x: [bs x nvars x d_model x patch_num] + x = self.flatten(x) + x = self.linear(x) + x = self.dropout(x) + return x + + +class moving_avg(nn.Module): + """ + Moving average block to highlight the trend of time series + """ + + def __init__(self, kernel_size, stride): + super(moving_avg, self).__init__() + self.kernel_size = kernel_size + self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0) + + def forward(self, x): + # x shape: batch,seq_len,channels + # padding on the both ends of time series + front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1) + end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1) + x = torch.cat([front, x, end], dim=1) + x = self.avg(x.permute(0, 2, 1)) + x = x.permute(0, 2, 1) + return x + + +class series_decomp(nn.Module): + """ + Series decomposition block + """ + def __init__(self, kernel_size): + 
super(series_decomp, self).__init__()
+        self.moving_avg = moving_avg(kernel_size, stride=1)
+
+    def forward(self, x):
+        moving_mean = self.moving_avg(x)
+        res = x - moving_mean
+        return res, moving_mean
+
+
+class series_decomp_multi(nn.Module):
+    """
+    Series decomposition block
+    """
+
+    def __init__(self, kernel_size):
+        super(series_decomp_multi, self).__init__()
+        self.kernel_size = kernel_size
+        self.moving_avg = [moving_avg(kernel, stride=1) for kernel in kernel_size]
+
+    def forward(self, x):
+        moving_mean = []
+        res = []
+        for func in self.moving_avg:
+            moving_avg = func(x)
+            moving_mean.append(moving_avg)
+            sea = x - moving_avg
+            res.append(sea)
+
+        sea = sum(res) / len(res)
+        moving_mean = sum(moving_mean) / len(moving_mean)
+        return sea, moving_mean
+
+
+class channelMix_CI_pat(nn.Module):
+    def __init__(self, **model_args):
+        super(channelMix_CI_pat, self).__init__()
+        self.pred_len = model_args['pred_len']
+        self.seq_len = model_args['seq_len']
+        self.stride = model_args['stride']
+        self.patch_len = model_args['patch_len']
+        self.d_model = model_args['d_model']
+        self.dropout = model_args["dropout"]
+        self.Pnum = int((self.pred_len + self.seq_len - self.patch_len) / self.stride + 2)
+
+        self.conv1 = nn.ModuleList(nn.Linear(self.Pnum, self.Pnum) for _ in range(self.d_model))
+        self.conv2 = nn.ModuleList(nn.Linear(self.Pnum, self.Pnum) for _ in range(self.d_model))
+        self.gelu = nn.GELU()
+        self.drop = nn.Dropout(self.dropout)
+        self.norm = nn.LayerNorm(self.d_model)
+        self.channels = self.d_model
+
+    def forward(self, x):
+        # allocate on the input's device instead of hard-coding 'cuda:0'
+        o = torch.zeros(x.shape, dtype=x.dtype, device=x.device)
+        for i in range(self.channels):
+            o[:, :, i] = self.drop(self.conv2[i](self.gelu(self.conv1[i](x[:, :, i]))))
+        res = o + x
+        res = self.norm(res)
+        return res
+
+
+class tempolMix_CI_pat(nn.Module):
+    def __init__(self, **model_args):
+        super(tempolMix_CI_pat, self).__init__()
+        self.pred_len = model_args['pred_len']
+        self.seq_len = model_args['seq_len']
+        self.stride = model_args['stride']
+        self.patch_len = model_args['patch_len']
+        self.d_model = model_args['d_model']
+        self.dropout = model_args["dropout"]
+        self.Pnum = int((self.pred_len + self.seq_len - self.patch_len) / self.stride + 2)
+
+        self.conv1 = nn.ModuleList(nn.Linear(self.d_model, self.d_model) for _ in range(self.Pnum))
+        self.conv2 = nn.ModuleList(nn.Linear(self.d_model, self.d_model) for _ in range(self.Pnum))
+        self.gelu = nn.GELU()
+        self.drop = nn.Dropout(self.dropout)
+        self.norm = nn.LayerNorm(self.d_model)
+        self.channels = self.Pnum
+
+    def forward(self, x):
+        # allocate on the input's device instead of hard-coding 'cuda:0'
+        o = torch.zeros(x.shape, dtype=x.dtype, device=x.device)
+        for i in range(self.channels):
+            o[:, i, :] = self.drop(self.conv2[i](self.gelu(self.conv1[i](x[:, i, :]))))
+        res = o + x
+        res = self.norm(res)
+        return res
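The Pnum formula used by these mixers should agree with what PatchEmbedding emits for the length-(seq_len + pred_len) input produced by predict_linear in the UMixer class below; the two coincide whenever (seq_len + pred_len - patch_len) is divisible by stride. A small check with illustrative lengths (the real ones come from get_regular_settings):

seq_len, pred_len, patch_len, stride = 336, 336, 16, 8
Pnum = int((pred_len + seq_len - patch_len) / stride + 2)   # 84
padded = seq_len + pred_len + stride                        # after ReplicationPad1d((0, stride))
n_patches = (padded - patch_len) // stride + 1              # 84
assert Pnum == n_patches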
+class UMixer(nn.Module):
+    def __init__(self, **model_args):
+        super(UMixer, self).__init__()
+        self.pred_len = model_args['pred_len']
+        self.seq_len = model_args['seq_len']
+        self.stride = model_args['stride']
+        self.patch_len = model_args['patch_len']
+        self.d_model = model_args['d_model']
+        self.dropout = model_args["dropout"]
+        self.Pnum = int((self.pred_len + self.seq_len - self.patch_len) / self.stride + 2)
+
+        self.layer = model_args['e_layers']
+
+        self.layer_norm = nn.LayerNorm(self.d_model)
+        self.predict_linear = nn.Linear(self.seq_len, self.pred_len+self.seq_len)
+        self.e_layers = model_args['e_layers']
+        self.d_layers = model_args['d_layers']
+        self.enc_in = model_args['enc_in']
+        self.dropout = model_args["dropout"]
+        self.c_out = model_args['c_out']
+
+        self.mlp_tempmix_md = nn.ModuleList([tempolMix_CI_pat(**model_args)
+                                             for _ in range(self.e_layers)])
+        self.mlp_chanmix_md = nn.ModuleList([channelMix_CI_pat(**model_args)
+                                             for _ in range(self.e_layers)])
+        self.mlp_tempmix_mu = nn.ModuleList([tempolMix_CI_pat(**model_args)
+                                             for _ in range(self.e_layers)])
+        self.mlp_chanmix_mu = nn.ModuleList([channelMix_CI_pat(**model_args)
+                                             for _ in range(self.e_layers)])
+
+        self.mlp_trend_ci = nn.ModuleList(nn.Linear(self.pred_len, self.d_model) for _ in range(self.c_out))
+        self.mlp_trend2_ci = nn.ModuleList(nn.Linear(self.d_model, self.pred_len) for _ in range(self.c_out))
+
+        self.revin = RevIN(self.enc_in)
+        self.patch_embedding = PatchEmbedding(
+            self.d_model, self.patch_len, self.stride, self.dropout)
+        self.head = Flatten_Head(self.enc_in, self.d_model * self.Pnum, self.pred_len,
+                                 head_dropout=self.dropout)
+        self.comb = nn.Linear(self.e_layers, 1)
+
+    def forecast(self, x_input):
+        x_ori = x_input.contiguous()
+        x_input = self.revin(x_input, 'norm')
+        x_input = self.predict_linear(x_input.permute(0, 2, 1))
+        x_input, n_vars = self.patch_embedding(x_input)
+
+        x_old, _ = self.patch_embedding(x_ori.permute(0, 2, 1))
+
+        # allocate on the input's device instead of hard-coding 'cuda:0'
+        x_all = torch.zeros([x_input.shape[0], x_input.shape[1], x_input.shape[2], self.layer],
+                            device=x_input.device)
+        for i in range(self.layer):
+            x_ud = self.mlp_tempmix_md[i](x_input)
+            x_ud = self.mlp_chanmix_md[i](x_ud)
+            # use a separate index for the upward pass; the inner loop previously
+            # reused `i`, so the write below always targeted slot 0
+            for j in range(i, -1, -1):
+                x_ud = self.mlp_tempmix_mu[j](x_ud)
+                x_ud = self.mlp_chanmix_mu[j](x_ud)
+            x_all[:, :, :, i] = x_ud
+        x_input = self.comb(x_all).squeeze(-1)
+        x_input = S_Correction(self.layer_norm(x_old), self.layer_norm(x_input[:, :x_old.shape[1], :])) * x_input
+        x_input = torch.reshape(
+            x_input, (-1, n_vars, x_input.shape[-2], x_input.shape[-1]))
+        x_input = x_input.permute(0, 1, 3, 2)
+
+        x_input = self.head(x_input)
+        x_input = x_input.permute(0, 2, 1)
+        x_input = self.revin(x_input, 'denorm')
+
+        x = x_input[:, -self.pred_len:, :]
+
+        return x
+
+    def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool,
+                **kwargs) -> torch.Tensor:
+        """
+
+        Args:
+            history_data (Tensor): Input data with shape: [B, L1, N, C]
+            future_data (Tensor): Future data with shape: [B, L2, N, C]
+
+        Returns:
+            torch.Tensor: outputs with shape [B, L2, N, 1]
+        """
+        x_input = history_data[:, :, :, 0]
+
+        out = self.forecast(x_input)
+        return out.unsqueeze(-1)  # [B, L2, N, 1]
diff --git a/baselines/iTransformer/Electricity.py b/baselines/iTransformer/Electricity.py
new file mode 100644
index 0000000..6729510
--- /dev/null
+++ b/baselines/iTransformer/Electricity.py
@@ -0,0 +1,158 @@
+import os
+import sys
+from easydict import EasyDict
+sys.path.append(os.path.abspath(__file__ + '/../../..'))
+from basicts.metrics import masked_mae, masked_mse
+from basicts.data import TimeSeriesForecastingDataset
+from basicts.runners import SimpleTimeSeriesForecastingRunner
+from basicts.scaler import ZScoreScaler
+from basicts.utils import get_regular_settings
+
+from .arch import iTransformer
+
+############################## Hot Parameters ##############################
+# Dataset & Metrics configuration
+DATA_NAME = 'Electricity' # Dataset name
+regular_settings = get_regular_settings(DATA_NAME)
+INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence
+OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence
+TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios
+NORM_EACH_CHANNEL =
regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = iTransformer +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 3, # attn factor + "p_hidden_dims": [128, 128], + "p_hidden_layers": 2, + "d_model": 512, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "n_heads": 8, + "e_layers": 3, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 512, + "distil": True, + "sigma" : 0.2, + "dropout": 0.1, + "freq": 'h', + "use_norm" : False, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0005, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 
'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/iTransformer/arch/Embed.py b/baselines/iTransformer/arch/Embed.py new file mode 100644 index 0000000..33fae23 --- /dev/null +++ b/baselines/iTransformer/arch/Embed.py @@ -0,0 +1,143 @@ +import torch +import torch.nn as nn +import math + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. + pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + minute_x = self.minute_embed(x[:, :, 4]) if hasattr( + self, 'minute_embed') else 0. 
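+        # Editor's note on the x_mark column layout assumed here (Informer-style
+        # time features): 0 = month, 1 = day of month, 2 = weekday, 3 = hour,
+        # 4 = minute bucket (minute_size = 4 suggests minutes grouped into
+        # 15-minute buckets); the minute embedding only exists when freq == 't'.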
+ hour_x = self.hour_embed(x[:, :, 3]) + weekday_x = self.weekday_embed(x[:, :, 2]) + day_x = self.day_embed(x[:, :, 1]) + month_x = self.month_embed(x[:, :, 0]) + + return hour_x + weekday_x + day_x + month_x + minute_x + + +class TimeFeatureEmbedding(nn.Module): + def __init__(self, d_model, embed_type='timeF', freq='h'): + super(TimeFeatureEmbedding, self).__init__() + + freq_map = {'h': 4, 't': 5, 's': 6, + 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3} + d_inp = freq_map[freq] + self.embed = nn.Linear(d_inp, d_model, bias=False) + + def forward(self, x): + return self.embed(x) + + +class DataEmbedding(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + if x_mark is None: + x = self.value_embedding(x) + self.position_embedding(x) + else: + x = self.value_embedding( + x) + self.temporal_embedding(x_mark) + self.position_embedding(x) + return self.dropout(x) + + +class DataEmbedding_inverted(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding_inverted, self).__init__() + self.value_embedding = nn.Linear(c_in, d_model) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + x = x.permute(0, 2, 1) + # x: [Batch Variate Time] + if x_mark is None: + x = self.value_embedding(x) + else: + # the potential to take covariates (e.g. 
timestamps) as tokens + x = self.value_embedding(torch.cat([x, x_mark.permute(0, 2, 1)], 1)) + # x: [Batch Variate d_model] + return self.dropout(x) + diff --git a/baselines/iTransformer/arch/SelfAttention_Family.py b/baselines/iTransformer/arch/SelfAttention_Family.py new file mode 100644 index 0000000..afe927b --- /dev/null +++ b/baselines/iTransformer/arch/SelfAttention_Family.py @@ -0,0 +1,302 @@ +import torch +import torch.nn as nn +import numpy as np +from math import sqrt +from .masking import TriangularCausalMask, ProbMask +from einops import rearrange + + +# Code implementation from https://github.com/thuml/Flowformer +class FlowAttention(nn.Module): + def __init__(self, attention_dropout=0.1): + super(FlowAttention, self).__init__() + self.dropout = nn.Dropout(attention_dropout) + + def kernel_method(self, x): + return torch.sigmoid(x) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + queries = queries.transpose(1, 2) + keys = keys.transpose(1, 2) + values = values.transpose(1, 2) + # kernel + queries = self.kernel_method(queries) + keys = self.kernel_method(keys) + # incoming and outgoing + normalizer_row = 1.0 / (torch.einsum("nhld,nhd->nhl", queries + 1e-6, keys.sum(dim=2) + 1e-6)) + normalizer_col = 1.0 / (torch.einsum("nhsd,nhd->nhs", keys + 1e-6, queries.sum(dim=2) + 1e-6)) + # reweighting + normalizer_row_refine = ( + torch.einsum("nhld,nhd->nhl", queries + 1e-6, (keys * normalizer_col[:, :, :, None]).sum(dim=2) + 1e-6)) + normalizer_col_refine = ( + torch.einsum("nhsd,nhd->nhs", keys + 1e-6, (queries * normalizer_row[:, :, :, None]).sum(dim=2) + 1e-6)) + # competition and allocation + normalizer_row_refine = torch.sigmoid( + normalizer_row_refine * (float(queries.shape[2]) / float(keys.shape[2]))) + normalizer_col_refine = torch.softmax(normalizer_col_refine, dim=-1) * keys.shape[2] # B h L vis + # multiply + kv = keys.transpose(-2, -1) @ (values * normalizer_col_refine[:, :, :, None]) + x = (((queries @ kv) * normalizer_row[:, :, :, None]) * normalizer_row_refine[:, :, :, None]).transpose(1, + 2).contiguous() + return x, None + + +# Code implementation from https://github.com/shreyansh26/FlashAttention-PyTorch +class FlashAttention(nn.Module): + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(FlashAttention, self).__init__() + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def flash_attention_forward(self, Q, K, V, mask=None): + BLOCK_SIZE = 32 + NEG_INF = -1e10 # -infinity + EPSILON = 1e-10 + # mask = torch.randint(0, 2, (128, 8)).to(device='cuda') + O = torch.zeros_like(Q, requires_grad=True) + l = torch.zeros(Q.shape[:-1])[..., None] + m = torch.ones(Q.shape[:-1])[..., None] * NEG_INF + + O = O.to(device='cuda') + l = l.to(device='cuda') + m = m.to(device='cuda') + + Q_BLOCK_SIZE = min(BLOCK_SIZE, Q.shape[-1]) + KV_BLOCK_SIZE = BLOCK_SIZE + + Q_BLOCKS = torch.split(Q, Q_BLOCK_SIZE, dim=2) + K_BLOCKS = torch.split(K, KV_BLOCK_SIZE, dim=2) + V_BLOCKS = torch.split(V, KV_BLOCK_SIZE, dim=2) + if mask is not None: + mask_BLOCKS = list(torch.split(mask, KV_BLOCK_SIZE, dim=1)) + + Tr = len(Q_BLOCKS) + Tc = len(K_BLOCKS) + + O_BLOCKS = list(torch.split(O, Q_BLOCK_SIZE, dim=2)) + l_BLOCKS = list(torch.split(l, Q_BLOCK_SIZE, dim=2)) + m_BLOCKS = list(torch.split(m, Q_BLOCK_SIZE, dim=2)) + + for j in range(Tc): + Kj = K_BLOCKS[j] + Vj = V_BLOCKS[j] + if mask is not None: + maskj = 
mask_BLOCKS[j] + + for i in range(Tr): + Qi = Q_BLOCKS[i] + Oi = O_BLOCKS[i] + li = l_BLOCKS[i] + mi = m_BLOCKS[i] + + scale = 1 / np.sqrt(Q.shape[-1]) + Qi_scaled = Qi * scale + + S_ij = torch.einsum('... i d, ... j d -> ... i j', Qi_scaled, Kj) + if mask is not None: + # Masking + maskj_temp = rearrange(maskj, 'b j -> b 1 1 j') + S_ij = torch.where(maskj_temp > 0, S_ij, NEG_INF) + + m_block_ij, _ = torch.max(S_ij, dim=-1, keepdims=True) + P_ij = torch.exp(S_ij - m_block_ij) + if mask is not None: + # Masking + P_ij = torch.where(maskj_temp > 0, P_ij, 0.) + + l_block_ij = torch.sum(P_ij, dim=-1, keepdims=True) + EPSILON + + P_ij_Vj = torch.einsum('... i j, ... j d -> ... i d', P_ij, Vj) + + mi_new = torch.maximum(m_block_ij, mi) + li_new = torch.exp(mi - mi_new) * li + torch.exp(m_block_ij - mi_new) * l_block_ij + + O_BLOCKS[i] = (li / li_new) * torch.exp(mi - mi_new) * Oi + ( + torch.exp(m_block_ij - mi_new) / li_new) * P_ij_Vj + l_BLOCKS[i] = li_new + m_BLOCKS[i] = mi_new + + O = torch.cat(O_BLOCKS, dim=2) + l = torch.cat(l_BLOCKS, dim=2) + m = torch.cat(m_BLOCKS, dim=2) + return O, l, m + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + res = \ + self.flash_attention_forward(queries.permute(0, 2, 1, 3), keys.permute(0, 2, 1, 3), values.permute(0, 2, 1, 3), + attn_mask)[0] + return res.permute(0, 2, 1, 3).contiguous(), None + + +class FullAttention(nn.Module): + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(FullAttention, self).__init__() + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, H, E = queries.shape + _, S, _, D = values.shape + scale = self.scale or 1. 
/ sqrt(E) + + scores = torch.einsum("blhe,bshe->bhls", queries, keys) + + if self.mask_flag: + if attn_mask is None: + attn_mask = TriangularCausalMask(B, L, device=queries.device) + + scores.masked_fill_(attn_mask.mask, -np.inf) + + A = self.dropout(torch.softmax(scale * scores, dim=-1)) + V = torch.einsum("bhls,bshd->blhd", A, values) + + if self.output_attention: + return (V.contiguous(), A) + else: + return (V.contiguous(), None) + + +# Code implementation from https://github.com/zhouhaoyi/Informer2020 +class ProbAttention(nn.Module): + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(ProbAttention, self).__init__() + self.factor = factor + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def _prob_QK(self, Q, K, sample_k, n_top): # n_top: c*ln(L_q) + # Q [B, H, L, D] + B, H, L_K, E = K.shape + _, _, L_Q, _ = Q.shape + + # calculate the sampled Q_K + K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E) + # real U = U_part(factor*ln(L_k))*L_q + index_sample = torch.randint(L_K, (L_Q, sample_k)) + K_sample = K_expand[:, :, torch.arange( + L_Q).unsqueeze(1), index_sample, :] + Q_K_sample = torch.matmul( + Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze() + + # find the Top_k query with sparisty measurement + M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K) + M_top = M.topk(n_top, sorted=False)[1] + + # use the reduced Q to calculate Q_K + Q_reduce = Q[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + M_top, :] # factor*ln(L_q) + Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1)) # factor*ln(L_q)*L_k + + return Q_K, M_top + + def _get_initial_context(self, V, L_Q): + B, H, L_V, D = V.shape + if not self.mask_flag: + # V_sum = V.sum(dim=-2) + V_sum = V.mean(dim=-2) + contex = V_sum.unsqueeze(-2).expand(B, H, + L_Q, V_sum.shape[-1]).clone() + else: # use mask + # requires that L_Q == L_V, i.e. for self-attention only + assert (L_Q == L_V) + contex = V.cumsum(dim=-2) + return contex + + def _update_context(self, context_in, V, scores, index, L_Q, attn_mask): + B, H, L_V, D = V.shape + + if self.mask_flag: + attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device) + scores.masked_fill_(attn_mask.mask, -np.inf) + + attn = torch.softmax(scores, dim=-1) # nn.Softmax(dim=-1)(scores) + + context_in[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + index, :] = torch.matmul(attn, V).type_as(context_in) + if self.output_attention: + attns = (torch.ones([B, H, L_V, L_V]) / + L_V).type_as(attn).to(attn.device) + attns[torch.arange(B)[:, None, None], torch.arange(H)[ + None, :, None], index, :] = attn + return (context_in, attns) + else: + return (context_in, None) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L_Q, H, D = queries.shape + _, L_K, _, _ = keys.shape + + queries = queries.transpose(2, 1) + keys = keys.transpose(2, 1) + values = values.transpose(2, 1) + + U_part = self.factor * \ + np.ceil(np.log(L_K)).astype('int').item() # c*ln(L_k) + u = self.factor * \ + np.ceil(np.log(L_Q)).astype('int').item() # c*ln(L_q) + + U_part = U_part if U_part < L_K else L_K + u = u if u < L_Q else L_Q + + scores_top, index = self._prob_QK( + queries, keys, sample_k=U_part, n_top=u) + + # add scale factor + scale = self.scale or 1. 
/ sqrt(D) + if scale is not None: + scores_top = scores_top * scale + # get the context + context = self._get_initial_context(values, L_Q) + # update the context with selected top_k queries + context, attn = self._update_context( + context, values, scores_top, index, L_Q, attn_mask) + + return context.contiguous(), attn + + +class AttentionLayer(nn.Module): + def __init__(self, attention, d_model, n_heads, d_keys=None, + d_values=None): + super(AttentionLayer, self).__init__() + + d_keys = d_keys or (d_model // n_heads) + d_values = d_values or (d_model // n_heads) + + self.inner_attention = attention + self.query_projection = nn.Linear(d_model, d_keys * n_heads) + self.key_projection = nn.Linear(d_model, d_keys * n_heads) + self.value_projection = nn.Linear(d_model, d_values * n_heads) + self.out_projection = nn.Linear(d_values * n_heads, d_model) + self.n_heads = n_heads + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, _ = queries.shape + _, S, _ = keys.shape + H = self.n_heads + + queries = self.query_projection(queries).view(B, L, H, -1) + keys = self.key_projection(keys).view(B, S, H, -1) + values = self.value_projection(values).view(B, S, H, -1) + + out, attn = self.inner_attention( + queries, + keys, + values, + attn_mask, + tau=tau, + delta=delta + ) + out = out.view(B, L, -1) + + return self.out_projection(out), attn + + + diff --git a/baselines/iTransformer/arch/Transformer_EncDec.py b/baselines/iTransformer/arch/Transformer_EncDec.py new file mode 100644 index 0000000..c48ddc3 --- /dev/null +++ b/baselines/iTransformer/arch/Transformer_EncDec.py @@ -0,0 +1,134 @@ +import torch.nn as nn +import torch.nn.functional as F + + +class ConvLayer(nn.Module): + def __init__(self, c_in): + super(ConvLayer, self).__init__() + self.downConv = nn.Conv1d(in_channels=c_in, + out_channels=c_in, + kernel_size=3, + padding=2, + padding_mode='circular') + self.norm = nn.BatchNorm1d(c_in) + self.activation = nn.ELU() + self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1) + + def forward(self, x): + x = self.downConv(x.permute(0, 2, 1)) + x = self.norm(x) + x = self.activation(x) + x = self.maxPool(x) + x = x.transpose(1, 2) + return x + + +class EncoderLayer(nn.Module): + def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.attention = attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, attn_mask=None, tau=None, delta=None): + new_x, attn = self.attention( + x, x, x, + attn_mask=attn_mask, + tau=tau, delta=delta + ) + x = x + self.dropout(new_x) + + y = x = self.norm1(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm2(x + y), attn + + +class Encoder(nn.Module): + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.norm = norm_layer + + def forward(self, x, attn_mask=None, tau=None, delta=None): + # x [B, L, D] + attns = [] + if self.conv_layers is not None: + 
for i, (attn_layer, conv_layer) in enumerate(zip(self.attn_layers, self.conv_layers)): + delta = delta if i == 0 else None + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + x = conv_layer(x) + attns.append(attn) + x, attn = self.attn_layers[-1](x, tau=tau, delta=None) + attns.append(attn) + else: + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + + return x, attns + + +class DecoderLayer(nn.Module): + def __init__(self, self_attention, cross_attention, d_model, d_ff=None, + dropout=0.1, activation="relu"): + super(DecoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + x = x + self.dropout(self.self_attention( + x, x, x, + attn_mask=x_mask, + tau=tau, delta=None + )[0]) + x = self.norm1(x) + + x = x + self.dropout(self.cross_attention( + x, cross, cross, + attn_mask=cross_mask, + tau=tau, delta=delta + )[0]) + + y = x = self.norm2(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm3(x + y) + + +class Decoder(nn.Module): + def __init__(self, layers, norm_layer=None, projection=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + for layer in self.layers: + x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask, tau=tau, delta=delta) + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) + return x diff --git a/baselines/iTransformer/arch/__init__.py b/baselines/iTransformer/arch/__init__.py new file mode 100644 index 0000000..41dd6e0 --- /dev/null +++ b/baselines/iTransformer/arch/__init__.py @@ -0,0 +1 @@ +from .itransformer_arch import iTransformer \ No newline at end of file diff --git a/baselines/iTransformer/arch/itransformer_arch.py b/baselines/iTransformer/arch/itransformer_arch.py new file mode 100644 index 0000000..a499c59 --- /dev/null +++ b/baselines/iTransformer/arch/itransformer_arch.py @@ -0,0 +1,108 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from .Transformer_EncDec import Encoder, EncoderLayer +from .SelfAttention_Family import FullAttention, AttentionLayer +from .Embed import DataEmbedding_inverted +import numpy as np +from basicts.utils import data_transformation_4_xformer + +class iTransformer(nn.Module): + """ + Paper link: https://arxiv.org/abs/2310.06625 + """ + + def __init__(self, **model_args): + super(iTransformer, self).__init__() + self.pred_len = model_args['pred_len'] + self.seq_len = model_args['seq_len'] + self.output_attention = model_args['output_attention'] + self.enc_in = model_args['enc_in'] + self.dec_in = model_args['dec_in'] + self.c_out = model_args['c_out'] + self.factor = model_args["factor"] + self.d_model = 
+        self.n_heads = model_args['n_heads']
+        self.d_ff = model_args['d_ff']
+        self.embed = model_args['embed']
+        self.freq = model_args["freq"]
+        self.dropout = model_args["dropout"]
+        self.activation = model_args['activation']
+        self.e_layers = model_args['e_layers']
+        self.d_layers = model_args['d_layers']
+
+        self.use_norm = model_args['use_norm']
+        # Embedding
+        self.enc_embedding = DataEmbedding_inverted(self.seq_len, self.d_model, self.embed, self.freq,
+                                                    self.dropout)
+
+        # Encoder-only architecture
+        self.encoder = Encoder(
+            [
+                EncoderLayer(
+                    AttentionLayer(
+                        FullAttention(False, self.factor, attention_dropout=self.dropout,
+                                      output_attention=self.output_attention), self.d_model, self.n_heads),
+                    self.d_model,
+                    self.d_ff,
+                    dropout=self.dropout,
+                    activation=self.activation
+                ) for l in range(self.e_layers)
+            ],
+            norm_layer=torch.nn.LayerNorm(self.d_model)
+        )
+        self.projector = nn.Linear(self.d_model, self.pred_len, bias=True)
+
+    def forward_xformer(self, x_enc: torch.Tensor, x_mark_enc: torch.Tensor, x_dec: torch.Tensor,
+                        x_mark_dec: torch.Tensor,
+                        enc_self_mask: torch.Tensor = None, dec_self_mask: torch.Tensor = None,
+                        dec_enc_mask: torch.Tensor = None) -> torch.Tensor:
+
+        if self.use_norm:
+            # Normalization from the Non-stationary Transformer
+            means = x_enc.mean(1, keepdim=True).detach()
+            x_enc = x_enc - means
+            stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)
+            x_enc /= stdev
+
+        _, _, N = x_enc.shape  # B L N
+        # B: batch_size;    E: d_model;
+        # L: seq_len;       S: pred_len;
+        # N: number of variates (tokens); can also include covariates
+
+        # Embedding
+        # B L N -> B N E                (B L N -> B L E in the vanilla Transformer)
+        enc_out = self.enc_embedding(x_enc, x_mark_enc)  # covariates (e.g. timestamps) can also be embedded as tokens
+
+        # B N E -> B N E                (B L E -> B L E in the vanilla Transformer)
+        # the dimensions of the embedded time series have been inverted, so the native attention,
+        # layernorm, and FFN modules now operate along the variate dimension
+        enc_out, attns = self.encoder(enc_out, attn_mask=None)
+
+        # B N E -> B N S -> B S N
+        dec_out = self.projector(enc_out).permute(0, 2, 1)[:, :, :N]  # filter out the covariates
+
+        if self.use_norm:
+            # De-normalization from the Non-stationary Transformer
+            dec_out = dec_out * (stdev[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
+            dec_out = dec_out + (means[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
+
+        return dec_out
+
+    def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool,
+                **kwargs) -> torch.Tensor:
+        """Feed forward of iTransformer.
+
+        Args:
+            history_data (Tensor): Input data with shape: [B, L1, N, C]
+            future_data (Tensor): Future data with shape: [B, L2, N, C]
+
+        Returns:
+            torch.Tensor: outputs with shape [B, L2, N, 1]
+        """
+
+        x_enc, x_mark_enc, x_dec, x_mark_dec = data_transformation_4_xformer(history_data=history_data,
+                                                                             future_data=future_data,
+                                                                             start_token_len=0)
+        prediction = self.forward_xformer(x_enc=x_enc, x_mark_enc=x_mark_enc, x_dec=x_dec, x_mark_dec=x_mark_dec)
+        return prediction.unsqueeze(-1)
\ No newline at end of file
diff --git a/baselines/iTransformer/arch/masking.py b/baselines/iTransformer/arch/masking.py
new file mode 100644
index 0000000..a19cbf6
--- /dev/null
+++ b/baselines/iTransformer/arch/masking.py
@@ -0,0 +1,26 @@
+import torch
+
+
+class TriangularCausalMask():
+    def __init__(self, B, L, device="cpu"):
+        mask_shape = [B, 1, L, L]
+        with torch.no_grad():
+            self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)
+
+    @property
+    def mask(self):
+        return self._mask
+
+
+class ProbMask():
+    def __init__(self, B, H, L, index, scores, device="cpu"):
+        _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
+        _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
+        indicator = _mask_ex[torch.arange(B)[:, None, None],
+                             torch.arange(H)[None, :, None],
+                             index, :].to(device)
+        self._mask = indicator.view(scores.shape).to(device)
+
+    @property
+    def mask(self):
+        return self._mask
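
A few standalone sketches follow for reviewers; they are not part of the patch. First, the inverted layout used in itransformer_arch.py: DataEmbedding_inverted turns each variate's entire length-L history into a single token, so attention in the encoder mixes information across variates rather than across time steps, and the final projector maps each token onto the forecast horizon. A minimal shape-flow sketch, where the sizes and the plain nn.Linear stand-ins are illustrative assumptions, not the patch's exact modules:

import torch
import torch.nn as nn

B, L, S, N, E = 2, 96, 24, 7, 64                # batch, seq_len, pred_len, variates, d_model (illustrative)
x_enc = torch.randn(B, L, N)

# Inverted embedding: B L N -> B N E, one token per variate.
embed = nn.Linear(L, E)                         # rough stand-in for DataEmbedding_inverted without covariates
tokens = embed(x_enc.permute(0, 2, 1))          # [B, N, E]

# The encoder preserves [B, N, E]; project each variate token onto the horizon.
projector = nn.Linear(E, S)
dec_out = projector(tokens).permute(0, 2, 1)    # B N E -> B N S -> B S N
assert dec_out.shape == (B, S, N)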
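
Second, the use_norm branch in forward_xformer is the per-variate instance normalization from the Non-stationary Transformer: center and scale each series over the input window, encode, then invert the transform on the predictions. A round-trip sketch mirroring the patch's exact tensor ops (sizes are illustrative assumptions):

import torch

B, L, S, N = 2, 96, 24, 7                       # batch, seq_len, pred_len, variates (illustrative)
x_enc = torch.randn(B, L, N) * 5 + 10           # deliberately non-zero mean, non-unit variance

means = x_enc.mean(1, keepdim=True).detach()    # [B, 1, N], per-variate temporal mean
x_norm = x_enc - means
stdev = torch.sqrt(torch.var(x_norm, dim=1, keepdim=True, unbiased=False) + 1e-5)
x_norm = x_norm / stdev                         # what the encoder actually sees

dec_out = torch.randn(B, S, N)                  # stand-in for the model's normalized prediction
dec_out = dec_out * stdev[:, 0, :].unsqueeze(1).repeat(1, S, 1)
dec_out = dec_out + means[:, 0, :].unsqueeze(1).repeat(1, S, 1)
# dec_out is now back on the original scale of x_enc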
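
Finally, a quick smoke test for masking.py: TriangularCausalMask builds a [B, 1, L, L] boolean mask that broadcasts over the head dimension, with True marking future positions; attention implementations typically consume such a mask by filling the flagged scores with -inf before the softmax. The sizes below are arbitrary:

import torch
from baselines.iTransformer.arch.masking import TriangularCausalMask  # the class defined above

B, H, L = 2, 4, 8                               # batch, heads, sequence length (arbitrary)
mask = TriangularCausalMask(B, L)               # mask.mask: [B, 1, L, L], True strictly above the diagonal
scores = torch.randn(B, H, L, L)                # dummy attention scores
scores = scores.masked_fill(mask.mask, float('-inf'))
attn = torch.softmax(scores, dim=-1)            # each query attends only to itself and earlier keys
assert torch.allclose(attn.sum(-1), torch.ones(B, H, L))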