From 0c60982ec46b50fe8c2b17bd778effa308a55ada Mon Sep 17 00:00:00 2001 From: blisky-li <2621142272@qq.com> Date: Thu, 26 Sep 2024 20:47:49 +0800 Subject: [PATCH] Add more baselines (#148) --- baselines/ETSformer/Electricity.py | 154 +++++++ baselines/ETSformer/arch/__init__.py | 3 + baselines/ETSformer/arch/decoder.py | 84 ++++ baselines/ETSformer/arch/encoder.py | 227 ++++++++++ baselines/ETSformer/arch/etsformer_arch.py | 116 +++++ .../ETSformer/arch/exponential_smoothing.py | 68 +++ baselines/ETSformer/arch/modules.py | 32 ++ baselines/FiLM/Electricity.py | 155 +++++++ baselines/FiLM/arch/__init__.py | 1 + baselines/FiLM/arch/film_arch.py | 154 +++++++ baselines/Koopa/Electricity.py | 152 +++++++ baselines/Koopa/arch/__init__.py | 2 + baselines/Koopa/arch/koopa_arch.py | 341 ++++++++++++++ baselines/LightTS/Electricity.py | 157 +++++++ baselines/LightTS/arch/__init__.py | 1 + baselines/LightTS/arch/lightts_arch.py | 135 ++++++ baselines/MTSMixer/Electricity.py | 164 +++++++ baselines/MTSMixer/arch/Invertible.py | 105 +++++ baselines/MTSMixer/arch/Projection.py | 25 ++ baselines/MTSMixer/arch/__init__.py | 1 + baselines/MTSMixer/arch/decomposition.py | 58 +++ baselines/MTSMixer/arch/mtsmixer_arch.py | 128 ++++++ .../Nonstationary_Transformer/Electricity.py | 157 +++++++ .../Nonstationary_Transformer/arch/Embed.py | 132 ++++++ .../arch/SelfAttention_Family.py | 172 +++++++ .../arch/Transformer_EncDec.py | 143 ++++++ .../arch/__init__.py | 1 + .../Nonstationary_Transformer/arch/masking.py | 26 ++ .../arch/nstransformer_arch.py | 161 +++++++ baselines/SegRNN/Electricity.py | 152 +++++++ baselines/SegRNN/arch/Autoformer_EncDec.py | 203 +++++++++ baselines/SegRNN/arch/__init__.py | 1 + baselines/SegRNN/arch/segrnn_arch.py | 83 ++++ baselines/SparseTSF/Electricity.py | 158 +++++++ baselines/SparseTSF/arch/Embed.py | 234 ++++++++++ baselines/SparseTSF/arch/__init__.py | 1 + baselines/SparseTSF/arch/sparsetsf_arch.py | 46 ++ baselines/TiDE/Electricity.py | 153 +++++++ baselines/TiDE/arch/__init__.py | 1 + baselines/TiDE/arch/tide_arch.py | 118 +++++ baselines/TimeMixer/Electricity.py | 162 +++++++ baselines/TimeMixer/arch/Autoformer_EncDec.py | 203 +++++++++ baselines/TimeMixer/arch/Embed.py | 234 ++++++++++ baselines/TimeMixer/arch/StandardNorm.py | 68 +++ baselines/TimeMixer/arch/__init__.py | 1 + baselines/TimeMixer/arch/timemixer_arch.py | 419 ++++++++++++++++++ baselines/UMixer/Electricity.py | 159 +++++++ baselines/UMixer/arch/Embed.py | 230 ++++++++++ baselines/UMixer/arch/RevIN.py | 103 +++++ baselines/UMixer/arch/__init__.py | 1 + baselines/UMixer/arch/umixer_arch.py | 234 ++++++++++ baselines/iTransformer/Electricity.py | 158 +++++++ baselines/iTransformer/arch/Embed.py | 143 ++++++ .../iTransformer/arch/SelfAttention_Family.py | 302 +++++++++++++ .../iTransformer/arch/Transformer_EncDec.py | 134 ++++++ baselines/iTransformer/arch/__init__.py | 1 + .../iTransformer/arch/itransformer_arch.py | 108 +++++ baselines/iTransformer/arch/masking.py | 26 ++ 58 files changed, 6891 insertions(+) create mode 100644 baselines/ETSformer/Electricity.py create mode 100644 baselines/ETSformer/arch/__init__.py create mode 100644 baselines/ETSformer/arch/decoder.py create mode 100644 baselines/ETSformer/arch/encoder.py create mode 100644 baselines/ETSformer/arch/etsformer_arch.py create mode 100644 baselines/ETSformer/arch/exponential_smoothing.py create mode 100644 baselines/ETSformer/arch/modules.py create mode 100644 baselines/FiLM/Electricity.py create mode 100644 baselines/FiLM/arch/__init__.py 
create mode 100644 baselines/FiLM/arch/film_arch.py create mode 100644 baselines/Koopa/Electricity.py create mode 100644 baselines/Koopa/arch/__init__.py create mode 100644 baselines/Koopa/arch/koopa_arch.py create mode 100644 baselines/LightTS/Electricity.py create mode 100644 baselines/LightTS/arch/__init__.py create mode 100644 baselines/LightTS/arch/lightts_arch.py create mode 100644 baselines/MTSMixer/Electricity.py create mode 100644 baselines/MTSMixer/arch/Invertible.py create mode 100644 baselines/MTSMixer/arch/Projection.py create mode 100644 baselines/MTSMixer/arch/__init__.py create mode 100644 baselines/MTSMixer/arch/decomposition.py create mode 100644 baselines/MTSMixer/arch/mtsmixer_arch.py create mode 100644 baselines/Nonstationary_Transformer/Electricity.py create mode 100644 baselines/Nonstationary_Transformer/arch/Embed.py create mode 100644 baselines/Nonstationary_Transformer/arch/SelfAttention_Family.py create mode 100644 baselines/Nonstationary_Transformer/arch/Transformer_EncDec.py create mode 100644 baselines/Nonstationary_Transformer/arch/__init__.py create mode 100644 baselines/Nonstationary_Transformer/arch/masking.py create mode 100644 baselines/Nonstationary_Transformer/arch/nstransformer_arch.py create mode 100644 baselines/SegRNN/Electricity.py create mode 100644 baselines/SegRNN/arch/Autoformer_EncDec.py create mode 100644 baselines/SegRNN/arch/__init__.py create mode 100644 baselines/SegRNN/arch/segrnn_arch.py create mode 100644 baselines/SparseTSF/Electricity.py create mode 100644 baselines/SparseTSF/arch/Embed.py create mode 100644 baselines/SparseTSF/arch/__init__.py create mode 100644 baselines/SparseTSF/arch/sparsetsf_arch.py create mode 100644 baselines/TiDE/Electricity.py create mode 100644 baselines/TiDE/arch/__init__.py create mode 100644 baselines/TiDE/arch/tide_arch.py create mode 100644 baselines/TimeMixer/Electricity.py create mode 100644 baselines/TimeMixer/arch/Autoformer_EncDec.py create mode 100644 baselines/TimeMixer/arch/Embed.py create mode 100644 baselines/TimeMixer/arch/StandardNorm.py create mode 100644 baselines/TimeMixer/arch/__init__.py create mode 100644 baselines/TimeMixer/arch/timemixer_arch.py create mode 100644 baselines/UMixer/Electricity.py create mode 100644 baselines/UMixer/arch/Embed.py create mode 100644 baselines/UMixer/arch/RevIN.py create mode 100644 baselines/UMixer/arch/__init__.py create mode 100644 baselines/UMixer/arch/umixer_arch.py create mode 100644 baselines/iTransformer/Electricity.py create mode 100644 baselines/iTransformer/arch/Embed.py create mode 100644 baselines/iTransformer/arch/SelfAttention_Family.py create mode 100644 baselines/iTransformer/arch/Transformer_EncDec.py create mode 100644 baselines/iTransformer/arch/__init__.py create mode 100644 baselines/iTransformer/arch/itransformer_arch.py create mode 100644 baselines/iTransformer/arch/masking.py diff --git a/baselines/ETSformer/Electricity.py b/baselines/ETSformer/Electricity.py new file mode 100644 index 0000000..304b240 --- /dev/null +++ b/baselines/ETSformer/Electricity.py @@ -0,0 +1,154 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import ETSformer + +############################## Hot Parameters 
############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = ETSformer +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 3, # attn factor + "d_model": 512, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 2, # num of decoder layers + "d_ff": 2048, + "K": 3, + "sigma" : 0.2, + "dropout": 0.2, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "sigmoid", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings 
+CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/ETSformer/arch/__init__.py b/baselines/ETSformer/arch/__init__.py new file mode 100644 index 0000000..e76c95d --- /dev/null +++ b/baselines/ETSformer/arch/__init__.py @@ -0,0 +1,3 @@ +from .etsformer_arch import ETSformer + +__all__ = ["ETSformer"] diff --git a/baselines/ETSformer/arch/decoder.py b/baselines/ETSformer/arch/decoder.py new file mode 100644 index 0000000..61496da --- /dev/null +++ b/baselines/ETSformer/arch/decoder.py @@ -0,0 +1,84 @@ +import torch +import torch.nn as nn +from einops import rearrange, reduce, repeat + + +class DampingLayer(nn.Module): + + def __init__(self, pred_len, nhead, dropout=0.1, output_attention=False): + super().__init__() + self.pred_len = pred_len + self.nhead = nhead + self.output_attention = output_attention + self._damping_factor = nn.Parameter(torch.randn(1, nhead)) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + x = repeat(x, 'b 1 d -> b t d', t=self.pred_len) + b, t, d = x.shape + + powers = torch.arange(self.pred_len).to(self._damping_factor.device) + 1 + powers = powers.view(self.pred_len, 1) + damping_factors = self.damping_factor ** powers + damping_factors = damping_factors.cumsum(dim=0) + x = x.view(b, t, self.nhead, -1) + x = self.dropout(x) * damping_factors.unsqueeze(-1) + x = x.view(b, t, d) + if self.output_attention: + return x, damping_factors + return x, None + + @property + def damping_factor(self): + return torch.sigmoid(self._damping_factor) + + +class DecoderLayer(nn.Module): + + def __init__(self, d_model, nhead, c_out, pred_len, dropout=0.1, output_attention=False): + super().__init__() + self.d_model = d_model + self.nhead = nhead + self.c_out = c_out + self.pred_len = pred_len + self.output_attention = output_attention + + self.growth_damping = DampingLayer(pred_len, nhead, dropout=dropout, output_attention=output_attention) + self.dropout1 = nn.Dropout(dropout) + + def forward(self, growth, season): + growth_horizon, growth_damping = self.growth_damping(growth[:, -1:]) + growth_horizon = self.dropout1(growth_horizon) + + seasonal_horizon = season[:, -self.pred_len:] + + if self.output_attention: + return growth_horizon, seasonal_horizon, growth_damping + return growth_horizon, seasonal_horizon, None + + +class Decoder(nn.Module): + + def __init__(self, layers): + super().__init__() + self.d_model = layers[0].d_model + self.c_out = layers[0].c_out + self.pred_len = 
layers[0].pred_len + self.nhead = layers[0].nhead + + self.layers = nn.ModuleList(layers) + self.pred = nn.Linear(self.d_model, self.c_out) + + def forward(self, growths, seasons): + growth_repr = [] + season_repr = [] + growth_dampings = [] + + for idx, layer in enumerate(self.layers): + growth_horizon, season_horizon, growth_damping = layer(growths[idx], seasons[idx]) + growth_repr.append(growth_horizon) + season_repr.append(season_horizon) + growth_dampings.append(growth_damping) + growth_repr = sum(growth_repr) + season_repr = sum(season_repr) + return self.pred(growth_repr), self.pred(season_repr), growth_dampings diff --git a/baselines/ETSformer/arch/encoder.py b/baselines/ETSformer/arch/encoder.py new file mode 100644 index 0000000..9c30eb7 --- /dev/null +++ b/baselines/ETSformer/arch/encoder.py @@ -0,0 +1,227 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.fft as fft + +import numpy as np +from einops import rearrange, reduce, repeat +import math, random + +from .modules import Feedforward +from .exponential_smoothing import ExponentialSmoothing + + +class GrowthLayer(nn.Module): + + def __init__(self, d_model, nhead, d_head=None, dropout=0.1, output_attention=False): + super().__init__() + self.d_head = d_head or (d_model // nhead) + self.d_model = d_model + self.nhead = nhead + self.output_attention = output_attention + + self.z0 = nn.Parameter(torch.randn(self.nhead, self.d_head)) + self.in_proj = nn.Linear(self.d_model, self.d_head * self.nhead) + self.es = ExponentialSmoothing(self.d_head, self.nhead, dropout=dropout) + self.out_proj = nn.Linear(self.d_head * self.nhead, self.d_model) + + assert self.d_head * self.nhead == self.d_model, "d_model must be divisible by nhead" + + def forward(self, inputs): + """ + :param inputs: shape: (batch, seq_len, dim) + :return: shape: (batch, seq_len, dim) + """ + b, t, d = inputs.shape + values = self.in_proj(inputs).view(b, t, self.nhead, -1) + values = torch.cat([repeat(self.z0, 'h d -> b 1 h d', b=b), values], dim=1) + values = values[:, 1:] - values[:, :-1] + out = self.es(values) + out = torch.cat([repeat(self.es.v0, '1 1 h d -> b 1 h d', b=b), out], dim=1) + out = rearrange(out, 'b t h d -> b t (h d)') + out = self.out_proj(out) + + if self.output_attention: + return out, self.es.get_exponential_weight(t)[1] + return out, None + + +class FourierLayer(nn.Module): + + def __init__(self, d_model, pred_len, k=None, low_freq=1, output_attention=False): + super().__init__() + self.d_model = d_model + self.pred_len = pred_len + self.k = k + self.low_freq = low_freq + self.output_attention = output_attention + + def forward(self, x): + """x: (b, t, d)""" + + if self.output_attention: + return self.dft_forward(x) + + b, t, d = x.shape + x_freq = fft.rfft(x, dim=1) + + if t % 2 == 0: + x_freq = x_freq[:, self.low_freq:-1] + f = fft.rfftfreq(t)[self.low_freq:-1] + else: + x_freq = x_freq[:, self.low_freq:] + f = fft.rfftfreq(t)[self.low_freq:] + + x_freq, index_tuple = self.topk_freq(x_freq) + + index_tuple = tuple(t.to(x_freq.device) for t in index_tuple) + + + f = repeat(f, 'f -> b f d', b=x_freq.size(0), d=x_freq.size(2)).to(x_freq.device) + f = rearrange(f[index_tuple], 'b f d -> b f () d').to(x_freq.device) + + return self.extrapolate(x_freq, f, t), None + + def extrapolate(self, x_freq, f, t): + x_freq = torch.cat([x_freq, x_freq.conj()], dim=1) + f = torch.cat([f, -f], dim=1) + t_val = rearrange(torch.arange(t + self.pred_len, dtype=torch.float), + 't -> () () t ()').to(x_freq.device) + + amp = 
rearrange(x_freq.abs() / t, 'b f d -> b f () d') + phase = rearrange(x_freq.angle(), 'b f d -> b f () d') + + x_time = amp * torch.cos(2 * math.pi * f * t_val + phase) + + return reduce(x_time, 'b f t d -> b t d', 'sum') + + def topk_freq(self, x_freq): + values, indices = torch.topk(x_freq.abs(), self.k, dim=1, largest=True, sorted=True) + mesh_a, mesh_b = torch.meshgrid(torch.arange(x_freq.size(0)), torch.arange(x_freq.size(2))) + index_tuple = (mesh_a.unsqueeze(1), indices, mesh_b.unsqueeze(1)) + x_freq = x_freq[index_tuple] + + return x_freq, index_tuple + + def dft_forward(self, x): + T = x.size(1) + + dft_mat = fft.fft(torch.eye(T)) + i, j = torch.meshgrid(torch.arange(self.pred_len + T), torch.arange(T)) + omega = np.exp(2 * math.pi * 1j / T) + idft_mat = (np.power(omega, i * j) / T).cfloat() + + x_freq = torch.einsum('ft,btd->bfd', [dft_mat, x.cfloat()]) + + if T % 2 == 0: + x_freq = x_freq[:, self.low_freq:T // 2] + else: + x_freq = x_freq[:, self.low_freq:T // 2 + 1] + + _, indices = torch.topk(x_freq.abs(), self.k, dim=1, largest=True, sorted=True) + indices = indices + self.low_freq + indices = torch.cat([indices, -indices], dim=1) + + dft_mat = repeat(dft_mat, 'f t -> b f t d', b=x.shape[0], d=x.shape[-1]) + idft_mat = repeat(idft_mat, 't f -> b t f d', b=x.shape[0], d=x.shape[-1]) + + mesh_a, mesh_b = torch.meshgrid(torch.arange(x.size(0)), torch.arange(x.size(2))) + + dft_mask = torch.zeros_like(dft_mat) + dft_mask[mesh_a, indices, :, mesh_b] = 1 + dft_mat = dft_mat * dft_mask + + idft_mask = torch.zeros_like(idft_mat) + idft_mask[mesh_a, :, indices, mesh_b] = 1 + idft_mat = idft_mat * idft_mask + + attn = torch.einsum('bofd,bftd->botd', [idft_mat, dft_mat]).real + return torch.einsum('botd,btd->bod', [attn, x]), rearrange(attn, 'b o t d -> b d o t') + + +class LevelLayer(nn.Module): + + def __init__(self, d_model, c_out, dropout=0.1): + super().__init__() + self.d_model = d_model + self.c_out = c_out + + self.es = ExponentialSmoothing(1, self.c_out, dropout=dropout, aux=True) + self.growth_pred = nn.Linear(self.d_model, self.c_out) + self.season_pred = nn.Linear(self.d_model, self.c_out) + + def forward(self, level, growth, season): + b, t, _ = level.shape + growth = self.growth_pred(growth).view(b, t, self.c_out, 1) + season = self.season_pred(season).view(b, t, self.c_out, 1) + growth = growth.view(b, t, self.c_out, 1) + season = season.view(b, t, self.c_out, 1) + level = level.view(b, t, self.c_out, 1) + out = self.es(level - season, aux_values=growth) + out = rearrange(out, 'b t h d -> b t (h d)') + return out + +class EncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, c_out, seq_len, pred_len, k, dim_feedforward=None, dropout=0.1, + activation='sigmoid', layer_norm_eps=1e-5, output_attention=False): + super().__init__() + self.d_model = d_model + self.nhead = nhead + self.c_out = c_out + self.seq_len = seq_len + self.pred_len = pred_len + dim_feedforward = dim_feedforward or 4 * d_model + self.dim_feedforward = dim_feedforward + + self.growth_layer = GrowthLayer(d_model, nhead, dropout=dropout, output_attention=output_attention) + self.seasonal_layer = FourierLayer(d_model, pred_len, k=k, output_attention=output_attention) + self.level_layer = LevelLayer(d_model, c_out, dropout=dropout) + + # Implementation of Feedforward model + self.ff = Feedforward(d_model, dim_feedforward, dropout=dropout, activation=activation) + self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps) + self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps) + + self.dropout1 = 
nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + def forward(self, res, level, attn_mask=None): + season, season_attn = self._season_block(res) + res = res - season[:, :-self.pred_len] + growth, growth_attn = self._growth_block(res) + res = self.norm1(res - growth[:, 1:]) + res = self.norm2(res + self.ff(res)) + + level = self.level_layer(level, growth[:, :-1], season[:, :-self.pred_len]) + + return res, level, growth, season, season_attn, growth_attn + + def _growth_block(self, x): + x, growth_attn = self.growth_layer(x) + return self.dropout1(x), growth_attn + + def _season_block(self, x): + x, season_attn = self.seasonal_layer(x) + return self.dropout2(x), season_attn + + +class Encoder(nn.Module): + + def __init__(self, layers): + super().__init__() + self.layers = nn.ModuleList(layers) + + def forward(self, res, level, attn_mask=None): + growths = [] + seasons = [] + season_attns = [] + growth_attns = [] + for layer in self.layers: + res, level, growth, season, season_attn, growth_attn = layer(res, level, attn_mask=None) + growths.append(growth) + seasons.append(season) + season_attns.append(season_attn) + growth_attns.append(growth_attn) + + return level, growths, seasons, season_attns, growth_attns diff --git a/baselines/ETSformer/arch/etsformer_arch.py b/baselines/ETSformer/arch/etsformer_arch.py new file mode 100644 index 0000000..217efbb --- /dev/null +++ b/baselines/ETSformer/arch/etsformer_arch.py @@ -0,0 +1,116 @@ +import torch +import torch.nn as nn +from einops import reduce + +from .modules import ETSEmbedding +from .encoder import EncoderLayer, Encoder +from .decoder import DecoderLayer, Decoder + + +class Transform: + def __init__(self, sigma): + self.sigma = sigma + + @torch.no_grad() + def transform(self, x): + return self.jitter(self.shift(self.scale(x))) + + def jitter(self, x): + return x + (torch.randn(x.shape).to(x.device) * self.sigma) + + def scale(self, x): + return x * (torch.randn(x.size(-1)).to(x.device) * self.sigma + 1) + + def shift(self, x): + return x + (torch.randn(x.size(-1)).to(x.device) * self.sigma) + + +class ETSformer(nn.Module): + + def __init__(self, **model_args): + super().__init__() + + + self.seq_len = model_args['seq_len'] + self.pred_len = model_args['pred_len'] + self.e_layers = model_args['e_layers'] + self.d_layers = model_args['d_layers'] + self.enc_in = model_args['enc_in'] + self.d_model = model_args['d_model'] + self.dropout = model_args['dropout'] + self.n_head = model_args['n_heads'] + self.c_out = model_args['c_out'] + self.K = model_args['K'] + self.d_ff = model_args['d_ff'] + self.sigma = model_args['sigma'] + self.activation = model_args['activation'] + self.output_attention = model_args['output_attention'] + + assert self.e_layers == self.d_layers, "Encoder and decoder layers must be equal" + + # Embedding + self.enc_embedding = ETSEmbedding(self.enc_in, self.d_model, dropout=self.dropout) + + # Encoder + self.encoder = Encoder( + [ + EncoderLayer( + self.d_model, self.n_head, self.c_out, self.seq_len, self.pred_len, self.K, + dim_feedforward=self.d_ff, + dropout=self.dropout, + activation=self.activation, + output_attention=self.output_attention, + ) for _ in range(self.e_layers) + ] + ) + + # Decoder + self.decoder = Decoder( + [ + DecoderLayer( + self.d_model, self.n_head, self.c_out, self.pred_len, + dropout=self.dropout, + output_attention=self.output_attention, + ) for _ in range(self.d_layers) + ], + ) + + self.transform = Transform(sigma=self.sigma) + + def forward(self, history_data: torch.Tensor, 
future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + enc_self_mask=None, + decomposed=False, attention=False): + """ + Args: + history_data (Tensor): Input data with shape: [B, L1, N, C] + future_data (Tensor): Future data with shape: [B, L2, N, C] + + Returns: + torch.Tensor: outputs with shape [B, L2, N, 1] + """ + x_enc = history_data[:,:,:,0] + with torch.no_grad(): + if self.training: + x_enc = self.transform.transform(x_enc) + res = self.enc_embedding(x_enc) + level, growths, seasons, season_attns, growth_attns = self.encoder(res, x_enc, attn_mask=enc_self_mask) + + growth, season, growth_dampings = self.decoder(growths, seasons) + + if decomposed: + return level[:, -1:], growth, season + + preds = level[:, -1:] + growth + season + + if attention: + decoder_growth_attns = [] + for growth_attn, growth_damping in zip(growth_attns, growth_dampings): + decoder_growth_attns.append(torch.einsum('bth,oh->bhot', [growth_attn.squeeze(-1), growth_damping])) + + season_attns = torch.stack(season_attns, dim=0)[:, :, -self.pred_len:] + season_attns = reduce(season_attns, 'l b d o t -> b o t', reduction='mean') + decoder_growth_attns = torch.stack(decoder_growth_attns, dim=0)[:, :, -self.pred_len:] + decoder_growth_attns = reduce(decoder_growth_attns, 'l b d o t -> b o t', reduction='mean') + return preds, season_attns, decoder_growth_attns + preds = preds.unsqueeze(-1) + return preds diff --git a/baselines/ETSformer/arch/exponential_smoothing.py b/baselines/ETSformer/arch/exponential_smoothing.py new file mode 100644 index 0000000..96c167d --- /dev/null +++ b/baselines/ETSformer/arch/exponential_smoothing.py @@ -0,0 +1,68 @@ +import math + +import torch +import torch.nn as nn +import torch.fft as fft + +from einops import rearrange, reduce, repeat +from scipy.fftpack import next_fast_len + + +def conv1d_fft(f, g, dim=-1): + N = f.size(dim) + M = g.size(dim) + + fast_len = next_fast_len(N + M - 1) + + F_f = fft.rfft(f, fast_len, dim=dim) + F_g = fft.rfft(g, fast_len, dim=dim) + + F_fg = F_f * F_g.conj() + out = fft.irfft(F_fg, fast_len, dim=dim) + out = out.roll((-1,), dims=(dim,)) + idx = torch.as_tensor(range(fast_len - N, fast_len)).to(out.device) + out = out.index_select(dim, idx) + + return out + + +class ExponentialSmoothing(nn.Module): + + def __init__(self, dim, nhead, dropout=0.1, aux=False): + super().__init__() + self._smoothing_weight = nn.Parameter(torch.randn(nhead, 1)) + self.v0 = nn.Parameter(torch.randn(1, 1, nhead, dim)) + self.dropout = nn.Dropout(dropout) + if aux: + self.aux_dropout = nn.Dropout(dropout) + + def forward(self, values, aux_values=None): + b, t, h, d = values.shape + + init_weight, weight = self.get_exponential_weight(t) + output = conv1d_fft(self.dropout(values), weight, dim=1) + output = init_weight * self.v0 + output + + if aux_values is not None: + aux_weight = weight / (1 - self.weight) * self.weight + aux_output = conv1d_fft(self.aux_dropout(aux_values), aux_weight) + output = output + aux_output + + return output + + def get_exponential_weight(self, T): + # Generate array [0, 1, ..., T-1] + powers = torch.arange(T, dtype=torch.float, device=self.weight.device) + + # (1 - \alpha) * \alpha^t, for all t = T-1, T-2, ..., 0] + weight = (1 - self.weight) * (self.weight ** torch.flip(powers, dims=(0,))) + + # \alpha^t for all t = 1, 2, ..., T + init_weight = self.weight ** (powers + 1) + + return rearrange(init_weight, 'h t -> 1 t h 1'), \ + rearrange(weight, 'h t -> 1 t h 1') + + @property + def weight(self): + return 
torch.sigmoid(self._smoothing_weight) diff --git a/baselines/ETSformer/arch/modules.py b/baselines/ETSformer/arch/modules.py new file mode 100644 index 0000000..fd2572c --- /dev/null +++ b/baselines/ETSformer/arch/modules.py @@ -0,0 +1,32 @@ +import torch.nn as nn +import torch.nn.functional as F + + +class ETSEmbedding(nn.Module): + def __init__(self, c_in, d_model, dropout=0.1): + super().__init__() + self.conv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=2, bias=False) + self.dropout = nn.Dropout(p=dropout) + nn.init.kaiming_normal_(self.conv.weight) + + def forward(self, x,): + + x = self.conv(x.permute(0,2,1))[..., :-2] + + return self.dropout(x.transpose(1,2)) + + +class Feedforward(nn.Module): + def __init__(self, d_model, dim_feedforward, dropout=0.1, activation='sigmoid'): + # Implementation of Feedforward model + super().__init__() + self.linear1 = nn.Linear(d_model, dim_feedforward, bias=False) + self.dropout1 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model, bias=False) + self.dropout2 = nn.Dropout(dropout) + self.activation = getattr(F, activation) + + def forward(self, x): + x = self.linear2(self.dropout1(self.activation(self.linear1(x)))) + return self.dropout2(x) diff --git a/baselines/FiLM/Electricity.py b/baselines/FiLM/Electricity.py new file mode 100644 index 0000000..1ff99a9 --- /dev/null +++ b/baselines/FiLM/Electricity.py @@ -0,0 +1,155 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import FiLM + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = FiLM +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 1, # attn factor + "d_model": 512, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. 
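+    # multiscale and window_size below configure the HiPPO-LegT projections and SpectralConv1d blocks (see arch/film_arch.py)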
+ "n_heads": 8, + "e_layers": 2, # num of encoder layers + "ratio": 0.5, + "multiscale" : [1, 2, 4], + "window_size" : [256], + "dropout": 0.05, + "freq": 'h', + "use_norm" : False, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 8 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 8 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 8 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use 
GPU for evaluation. Default: True diff --git a/baselines/FiLM/arch/__init__.py b/baselines/FiLM/arch/__init__.py new file mode 100644 index 0000000..e297243 --- /dev/null +++ b/baselines/FiLM/arch/__init__.py @@ -0,0 +1 @@ +from .film_arch import FiLM \ No newline at end of file diff --git a/baselines/FiLM/arch/film_arch.py b/baselines/FiLM/arch/film_arch.py new file mode 100644 index 0000000..efdfecb --- /dev/null +++ b/baselines/FiLM/arch/film_arch.py @@ -0,0 +1,154 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from scipy import signal +from scipy import special as ss + + + +def transition(N): + Q = np.arange(N, dtype=np.float64) + R = (2 * Q + 1)[:, None] # / theta + j, i = np.meshgrid(Q, Q) + A = np.where(i < j, -1, (-1.) ** (i - j + 1)) * R + B = (-1.) ** Q[:, None] * R + return A, B + + +class HiPPO_LegT(nn.Module): + def __init__(self, N, dt=1.0, discretization='bilinear'): + """ + N: the order of the HiPPO projection + dt: discretization step size - should be roughly inverse to the length of the sequence + """ + super(HiPPO_LegT, self).__init__() + self.N = N + A, B = transition(N) + C = np.ones((1, N)) + D = np.zeros((1,)) + A, B, _, _, _ = signal.cont2discrete((A, B, C, D), dt=dt, method=discretization) + + B = B.squeeze(-1) + + self.register_buffer('A', torch.Tensor(A)) + self.register_buffer('B', torch.Tensor(B)) + vals = np.arange(0.0, 1.0, dt) + self.register_buffer('eval_matrix', torch.Tensor( + ss.eval_legendre(np.arange(N)[:, None], 1 - 2 * vals).T)) + + def forward(self, inputs): + """ + inputs : (length, ...) + output : (length, ..., N) where N is the order of the HiPPO projection + """ + c = torch.zeros(inputs.shape[:-1] + tuple([self.N])).to(inputs.device) + cs = [] + for f in inputs.permute([-1, 0, 1]): + f = f.unsqueeze(-1) + new = f @ self.B.unsqueeze(0) + new = new.to(inputs.device) + c = F.linear(c, self.A.to(inputs.device)) + new + cs.append(c) + return torch.stack(cs, dim=0) + + def reconstruct(self, c): + return (self.eval_matrix @ c.unsqueeze(-1)).squeeze(-1) + + +class SpectralConv1d(nn.Module): + def __init__(self, in_channels, out_channels, seq_len, ratio=0.5): + """ + 1D Fourier layer. It does FFT, linear transform, and Inverse FFT. 
+ """ + super(SpectralConv1d, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.ratio = ratio + self.modes = min(32, seq_len // 2) + self.index = list(range(0, self.modes)) + + self.scale = (1 / (in_channels * out_channels)) + self.weights_real = nn.Parameter( + self.scale * torch.rand(in_channels, out_channels, len(self.index), dtype=torch.float)) + self.weights_imag = nn.Parameter( + self.scale * torch.rand(in_channels, out_channels, len(self.index), dtype=torch.float)) + + def compl_mul1d(self, order, x, weights_real, weights_imag): + return torch.complex(torch.einsum(order, x.real, weights_real) - torch.einsum(order, x.imag, weights_imag), + torch.einsum(order, x.real, weights_imag) + torch.einsum(order, x.imag, weights_real)) + + def forward(self, x): + B, H, E, N = x.shape + x_ft = torch.fft.rfft(x) + out_ft = torch.zeros(B, H, self.out_channels, x.size(-1) // 2 + 1, device=x.device, dtype=torch.cfloat) + a = x_ft[:, :, :, :self.modes] + out_ft[:, :, :, :self.modes] = self.compl_mul1d("bjix,iox->bjox", a, self.weights_real, self.weights_imag) + x = torch.fft.irfft(out_ft, n=x.size(-1)) + return x + + +class FiLM(nn.Module): + """ + Paper link: https://arxiv.org/abs/2205.08897 + """ + def __init__(self, **model_args): + super(FiLM, self).__init__() + + self.seq_len = model_args['seq_len'] + self.pred_len = self.seq_len if model_args['pred_len'] == 0 else model_args['pred_len'] + + self.output_attention = model_args['output_attention'] + self.layers = model_args['e_layers'] + self.enc_in = model_args['enc_in'] + self.e_layers = model_args['e_layers'] + # b, s, f means b, f + self.affine_weight = nn.Parameter(torch.ones(1, 1, model_args['enc_in'])) + self.affine_bias = nn.Parameter(torch.zeros(1, 1, model_args['enc_in'])) + + self.multiscale = model_args['multiscale'] # 1 2 4 + self.window_size = model_args['window_size'] # 256 + self.ratio = model_args['ratio'] + self.legts = nn.ModuleList( + [HiPPO_LegT(N=n, dt=1. 
/ self.pred_len / i) for n in self.window_size for i in self.multiscale]) + self.spec_conv_1 = nn.ModuleList([SpectralConv1d(in_channels=n, out_channels=n, + seq_len=min(self.pred_len, self.seq_len), + ratio=self.ratio) for n in + self.window_size for _ in range(len(self.multiscale))]) + self.mlp = nn.Linear(len(self.multiscale) * len(self.window_size), 1) + + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs): + # Normalization from Non-stationary Transformer + x_enc = history_data[:, :, :, 0] + means = x_enc.mean(1, keepdim=True).detach() + x_enc = x_enc - means + stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5).detach() + x_enc /= stdev + + x_enc = x_enc * self.affine_weight + self.affine_bias + x_decs = [] + jump_dist = 0 + for i in range(0, len(self.multiscale) * len(self.window_size)): + x_in_len = self.multiscale[i % len(self.multiscale)] * self.pred_len + x_in = x_enc[:, -x_in_len:] + legt = self.legts[i] + x_in_c = legt(x_in.transpose(1, 2)).permute([1, 2, 3, 0])[:, :, :, jump_dist:] + out1 = self.spec_conv_1[i](x_in_c) + if self.seq_len >= self.pred_len: + x_dec_c = out1.transpose(2, 3)[:, :, self.pred_len - 1 - jump_dist, :] + else: + x_dec_c = out1.transpose(2, 3)[:, :, -1, :] + x_dec = x_dec_c @ legt.eval_matrix[-self.pred_len:, :].T + x_decs.append(x_dec) + x_dec = torch.stack(x_decs, dim=-1) + x_dec = self.mlp(x_dec).squeeze(-1).permute(0, 2, 1) + + # De-Normalization from Non-stationary Transformer + x_dec = x_dec - self.affine_bias + x_dec = x_dec / (self.affine_weight + 1e-10) + x_dec = x_dec * stdev + x_dec = x_dec + means + return x_dec.unsqueeze(-1) + diff --git a/baselines/Koopa/Electricity.py b/baselines/Koopa/Electricity.py new file mode 100644 index 0000000..b569f78 --- /dev/null +++ b/baselines/Koopa/Electricity.py @@ -0,0 +1,152 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import Koopa + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = Koopa +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "seg_len": 168, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "dynamic_dim": 512,#128 + "hidden_dim": 512, #256 # window size of moving average. This is a CRUCIAL hyper-parameter. 
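+    # dynamic_dim is the Koopman embedding size; hidden_dim/hidden_layers size the MLP encoder/decoder (see arch/koopa_arch.py)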
+ "hidden_layers": 3, + "num_blocks": 1, # num of encoder layers + "alpha" : 0.2, + "dropout": 0.05, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "multistep" : False, + "activation": "sigmoid", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. 
Default: True
diff --git a/baselines/Koopa/arch/__init__.py b/baselines/Koopa/arch/__init__.py
new file mode 100644
index 0000000..96273e7
--- /dev/null
+++ b/baselines/Koopa/arch/__init__.py
@@ -0,0 +1,2 @@
+from .koopa_arch import Koopa
+
diff --git a/baselines/Koopa/arch/koopa_arch.py b/baselines/Koopa/arch/koopa_arch.py
new file mode 100644
index 0000000..cdf7752
--- /dev/null
+++ b/baselines/Koopa/arch/koopa_arch.py
@@ -0,0 +1,341 @@
+import math
+import torch
+import torch.nn as nn
+
+
+class FourierFilter(nn.Module):
+    """
+    Fourier filter: decomposes the input into a time-variant and a time-invariant term
+    """
+    def __init__(self, mask_spectrum):
+        super(FourierFilter, self).__init__()
+        self.mask_spectrum = mask_spectrum
+
+    def forward(self, x):
+        xf = torch.fft.rfft(x, dim=1)
+        mask = torch.ones_like(xf)
+        mask[:, self.mask_spectrum, :] = 0
+        x_var = torch.fft.irfft(xf*mask, dim=1)
+        x_inv = x - x_var
+
+        return x_var, x_inv
+
+
+class MLP(nn.Module):
+    '''
+    Multilayer perceptron to encode/decode high-dimensional representations of sequential data
+    '''
+    def __init__(self,
+                 f_in,
+                 f_out,
+                 hidden_dim=128,
+                 hidden_layers=2,
+                 dropout=0.05,
+                 activation='tanh'):
+        super(MLP, self).__init__()
+        self.f_in = f_in
+        self.f_out = f_out
+        self.hidden_dim = hidden_dim
+        self.hidden_layers = hidden_layers
+        self.dropout = dropout
+        if activation == 'relu':
+            self.activation = nn.ReLU()
+        elif activation == 'tanh':
+            self.activation = nn.Tanh()
+        else:
+            raise NotImplementedError
+
+        layers = [nn.Linear(self.f_in, self.hidden_dim),
+                  self.activation, nn.Dropout(self.dropout)]
+        for i in range(self.hidden_layers-2):
+            layers += [nn.Linear(self.hidden_dim, self.hidden_dim),
+                       self.activation, nn.Dropout(dropout)]
+
+        layers += [nn.Linear(hidden_dim, f_out)]
+        self.layers = nn.Sequential(*layers)
+
+    def forward(self, x):
+        # x: B x S x f_in
+        # y: B x S x f_out
+        y = self.layers(x)
+        return y
+
+
+class KPLayer(nn.Module):
+    """
+    Finds the one-step transition of a linear system by DMD, applied iteratively for multi-step prediction
+    """
+    def __init__(self):
+        super(KPLayer, self).__init__()
+
+        self.K = None  # B E E
+
+    def one_step_forward(self, z, return_rec=False, return_K=False):
+        B, input_len, E = z.shape
+        assert input_len > 1, 'snapshots number should be larger than 1'
+        x, y = z[:, :-1], z[:, 1:]
+
+        # solve linear system
+        self.K = torch.linalg.lstsq(x, y).solution  # B E E
+        if torch.isnan(self.K).any():
+            print('Encounter K with nan, replace K by identity matrix')
+            self.K = torch.eye(self.K.shape[1]).to(self.K.device).unsqueeze(0).repeat(B, 1, 1)
+
+        z_pred = torch.bmm(z[:, -1:], self.K)
+        if return_rec:
+            z_rec = torch.cat((z[:, :1], torch.bmm(x, self.K)), dim=1)
+            return z_rec, z_pred
+
+        return z_pred
+
+    def forward(self, z, pred_len=1):
+        assert pred_len >= 1, 'prediction length should not be less than 1'
+        z_rec, z_pred = self.one_step_forward(z, return_rec=True)
+        z_preds = [z_pred]
+        for i in range(1, pred_len):
+            z_pred = torch.bmm(z_pred, self.K)
+            z_preds.append(z_pred)
+        z_preds = torch.cat(z_preds, dim=1)
+        return z_rec, z_preds
+
+
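Reviewer note: KPLayer above fits a per-sample Koopman operator K by batched least squares (DMD): given snapshots z, it solves the least-squares problem x @ K = y with x = z[:, :-1] and y = z[:, 1:], then advances the last snapshot with K. A minimal, self-contained sketch of that step; the shapes and the synthetic system are illustrative, not part of the PR:

import torch

B, T, E = 2, 8, 4                                # batch, snapshots, embedding dim (illustrative)
K_true = torch.eye(E) + 0.1 * torch.randn(E, E)  # synthetic linear dynamics z_{t+1} = z_t @ K_true
snaps = [torch.randn(B, E)]
for _ in range(T - 1):
    snaps.append(snaps[-1] @ K_true)
z = torch.stack(snaps, dim=1)                    # B x T x E

x, y = z[:, :-1], z[:, 1:]                       # consecutive snapshot pairs
K = torch.linalg.lstsq(x, y).solution            # B x E x E, one operator per sample
z_next = torch.bmm(z[:, -1:], K)                 # one-step forecast, B x 1 x E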
+class KPLayerApprox(nn.Module):
+    """
+    Finds the Koopman transition of a linear system by DMD, with a multistep K approximation
+    """
+    def __init__(self):
+        super(KPLayerApprox, self).__init__()
+
+        self.K = None       # B E E
+        self.K_step = None  # B E E
+
+    def forward(self, z, pred_len=1):
+        # z:      B L E, koopman invariance space representation
+        # z_rec:  B L E, reconstructed representation
+        # z_pred: B S E, forecasting representation
+        B, input_len, E = z.shape
+        assert input_len > 1, 'snapshots number should be larger than 1'
+        x, y = z[:, :-1], z[:, 1:]
+
+        # solve linear system
+        self.K = torch.linalg.lstsq(x, y).solution  # B E E
+
+        if torch.isnan(self.K).any():
+            print('Encounter K with nan, replace K by identity matrix')
+            self.K = torch.eye(self.K.shape[1]).to(self.K.device).unsqueeze(0).repeat(B, 1, 1)
+
+        z_rec = torch.cat((z[:, :1], torch.bmm(x, self.K)), dim=1)  # B L E
+
+        if pred_len <= input_len:
+            self.K_step = torch.linalg.matrix_power(self.K, pred_len)
+            if torch.isnan(self.K_step).any():
+                print('Encounter multistep K with nan, replace it by identity matrix')
+                self.K_step = torch.eye(self.K_step.shape[1]).to(self.K_step.device).unsqueeze(0).repeat(B, 1, 1)
+            z_pred = torch.bmm(z[:, -pred_len:, :], self.K_step)
+        else:
+            self.K_step = torch.linalg.matrix_power(self.K, input_len)
+            if torch.isnan(self.K_step).any():
+                print('Encounter multistep K with nan, replace it by identity matrix')
+                self.K_step = torch.eye(self.K_step.shape[1]).to(self.K_step.device).unsqueeze(0).repeat(B, 1, 1)
+            temp_z_pred, all_pred = z, []
+            for _ in range(math.ceil(pred_len / input_len)):
+                temp_z_pred = torch.bmm(temp_z_pred, self.K_step)
+                all_pred.append(temp_z_pred)
+            z_pred = torch.cat(all_pred, dim=1)[:, :pred_len, :]
+
+        return z_rec, z_pred
+
+
+class TimeVarKP(nn.Module):
+    """
+    Koopman predictor with DMD (analytical solution of the Koopman operator)
+    Utilizes local variations within individual sliding windows to predict the future of the time-variant term
+    """
+    def __init__(self,
+                 enc_in=8,
+                 input_len=96,
+                 pred_len=96,
+                 seg_len=24,
+                 dynamic_dim=128,
+                 encoder=None,
+                 decoder=None,
+                 multistep=False,
+                 ):
+        super(TimeVarKP, self).__init__()
+        self.input_len = input_len
+        self.pred_len = pred_len
+        self.enc_in = enc_in
+        self.seg_len = seg_len
+        self.dynamic_dim = dynamic_dim
+        self.multistep = multistep
+        self.encoder, self.decoder = encoder, decoder
+        self.freq = math.ceil(self.input_len / self.seg_len)  # segment number of input
+        self.step = math.ceil(self.pred_len / self.seg_len)   # segment number of output
+        self.padding_len = self.seg_len * self.freq - self.input_len
+        # Approximate multistep K by KPLayerApprox when pred_len is large
+        self.dynamics = KPLayerApprox() if self.multistep else KPLayer()
+
+    def forward(self, x):
+        # x: B L C
+        B, L, C = x.shape
+
+        res = torch.cat((x[:, L-self.padding_len:, :], x), dim=1)
+
+        res = res.chunk(self.freq, dim=1)                        # F x B P C, P means seg_len
+        res = torch.stack(res, dim=1).reshape(B, self.freq, -1)  # B F PC
+
+        res = self.encoder(res)                                  # B F H
+        x_rec, x_pred = self.dynamics(res, self.step)            # B F H, B S H
+
+        x_rec = self.decoder(x_rec)                              # B F PC
+        x_rec = x_rec.reshape(B, self.freq, self.seg_len, self.enc_in)
+        x_rec = x_rec.reshape(B, -1, self.enc_in)[:, :self.input_len, :]  # B L C
+
+        x_pred = self.decoder(x_pred)                            # B S PC
+        x_pred = x_pred.reshape(B, self.step, self.seg_len, self.enc_in)
+        x_pred = x_pred.reshape(B, -1, self.enc_in)[:, :self.pred_len, :]  # B S C
+
+        return x_rec, x_pred
+
+
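Reviewer note: TimeVarKP's forward left-pads the lookback window with its own tail so its length becomes a multiple of seg_len, then chunks it into freq segments and flattens each one before encoding. A minimal sketch of that reshape; the values are illustrative:

import math
import torch

B, L, C, seg_len = 2, 96, 3, 24                           # illustrative shapes
x = torch.randn(B, L, C)

freq = math.ceil(L / seg_len)                             # number of segments
padding_len = seg_len * freq - L                          # 0 here, since 96 is a multiple of 24
res = torch.cat((x[:, L - padding_len:, :], x), dim=1)    # left-pad by reusing the tail
res = torch.stack(res.chunk(freq, dim=1), dim=1)          # B x freq x seg_len x C
res = res.reshape(B, freq, seg_len * C)                   # one flat vector per segment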
+class TimeInvKP(nn.Module):
+    """
+    Koopman predictor with a learnable Koopman operator
+    Utilizes lookback and forecast window snapshots to predict the future of the time-invariant term
+    """
+    def __init__(self,
+                 input_len=96,
+                 pred_len=96,
+                 dynamic_dim=128,
+                 encoder=None,
+                 decoder=None):
+        super(TimeInvKP, self).__init__()
+        self.dynamic_dim = dynamic_dim
+        self.input_len = input_len
+        self.pred_len = pred_len
+        self.encoder = encoder
+        self.decoder = decoder
+
+        K_init = torch.randn(self.dynamic_dim, self.dynamic_dim)
+        U, _, V = torch.svd(K_init)  # stable initialization
+        self.K = nn.Linear(self.dynamic_dim, self.dynamic_dim, bias=False)
+        self.K.weight.data = torch.mm(U, V.t())
+
+    def forward(self, x):
+        # x: B L C
+        res = x.transpose(1, 2)    # B C L
+        res = self.encoder(res)    # B C H
+        res = self.K(res)          # B C H
+        res = self.decoder(res)    # B C S
+        res = res.transpose(1, 2)  # B S C
+
+        return res
+
+
+class Koopa(nn.Module):
+    '''
+    Koopman Forecasting Model
+    '''
+    def __init__(self, **model_args):
+        super(Koopa, self).__init__()
+        self.mask_spectrum = None         # initialized to None; computed from training data
+        self.alpha = model_args['alpha']  # assumes the config provides an 'alpha' entry
+        self.enc_in = model_args['enc_in']
+        self.input_len = model_args['seq_len']
+        self.pred_len = model_args['pred_len']
+        self.seg_len = model_args['seg_len']
+        self.num_blocks = model_args['num_blocks']
+        self.dynamic_dim = model_args['dynamic_dim']
+        self.hidden_dim = model_args['hidden_dim']
+        self.hidden_layers = model_args['hidden_layers']
+        self.multistep = model_args['multistep']
+        self.amps = 0.0
+        self.disentanglement = FourierFilter(self.mask_spectrum)
+
+        # shared encoder/decoder to make koopman embedding consistent
+        self.time_inv_encoder = MLP(f_in=self.input_len, f_out=self.dynamic_dim, activation='relu',
+                                    hidden_dim=self.hidden_dim, hidden_layers=self.hidden_layers)
+        self.time_inv_decoder = MLP(f_in=self.dynamic_dim, f_out=self.pred_len, activation='relu',
+                                    hidden_dim=self.hidden_dim, hidden_layers=self.hidden_layers)
+        self.time_inv_kps = self.time_var_kps = nn.ModuleList([
+                                TimeInvKP(input_len=self.input_len,
+                                          pred_len=self.pred_len,
+                                          dynamic_dim=self.dynamic_dim,
+                                          encoder=self.time_inv_encoder,
+                                          decoder=self.time_inv_decoder)
+                                for _ in range(self.num_blocks)])
+
+        # shared encoder/decoder to make koopman embedding consistent
+        self.time_var_encoder = MLP(f_in=self.seg_len*self.enc_in, f_out=self.dynamic_dim, activation='tanh',
+                                    hidden_dim=self.hidden_dim, hidden_layers=self.hidden_layers)
+        self.time_var_decoder = MLP(f_in=self.dynamic_dim, f_out=self.seg_len*self.enc_in, activation='tanh',
+                                    hidden_dim=self.hidden_dim, hidden_layers=self.hidden_layers)
+        self.time_var_kps = nn.ModuleList([
+                                TimeVarKP(enc_in=self.enc_in,
+                                          input_len=self.input_len,
+                                          pred_len=self.pred_len,
+                                          seg_len=self.seg_len,
+                                          dynamic_dim=self.dynamic_dim,
+                                          encoder=self.time_var_encoder,
+                                          decoder=self.time_var_decoder,
+                                          multistep=self.multistep)
+                                for _ in range(self.num_blocks)])
+
+    def _get_mask_spectrum(self, train_loader):
+        """
+        get shared frequency spectra
+        """
+
+        for data in train_loader:
+            lookback_window = data
+            self.amps += abs(torch.fft.rfft(lookback_window, dim=1)).mean(dim=0).mean(dim=1)
+        mask_spectrum = self.amps.topk(int(self.amps.shape[0]*self.alpha)).indices
+
+        return mask_spectrum
+
+    def train_model(self, train_loader):
+        # compute the mask_spectrum during the training stage
+        self.mask_spectrum = self._get_mask_spectrum(train_loader)
+        self.disentanglement = FourierFilter(self.mask_spectrum)
+        # continue with the training procedure...
+
+    def test_model(self):
+        if self.mask_spectrum is None:
+            raise ValueError("Model has not been trained yet.")
+        # at test time, directly reuse the mask_spectrum computed during training
+        # continue with the testing procedure...
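Reviewer note: _get_mask_spectrum keeps a running sum in self.amps across calls and selects the alpha fraction of frequency bins with the largest average amplitude; FourierFilter then zeroes those bins, so x_var is the remainder and x_inv = x - x_var carries the dominant frequencies. A minimal sketch of one such split; alpha = 0.2 and the shapes are illustrative:

import torch

x = torch.randn(2, 96, 3)                        # B x L x C lookback windows (illustrative)
amps = torch.fft.rfft(x, dim=1).abs().mean(dim=0).mean(dim=1)  # amplitude per frequency bin
mask_spectrum = amps.topk(int(amps.shape[0] * 0.2)).indices    # alpha = 0.2

xf = torch.fft.rfft(x, dim=1)
mask = torch.ones_like(xf)
mask[:, mask_spectrum, :] = 0                    # zero out the dominant frequencies
x_var = torch.fft.irfft(xf * mask, dim=1)        # time-variant term
x_inv = x - x_var                                # time-invariant term (dominant frequencies)
print(torch.allclose(x_var + x_inv, x))          # True by construction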
+
+    def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, **kwargs):
+        # x_enc: B L C
+        x_enc = history_data[:, :, :, 0]
+
+        # Series Stationarization adopted from NSformer
+        mean_enc = x_enc.mean(1, keepdim=True).detach()  # B x 1 x E
+        x_enc = x_enc - mean_enc
+        std_enc = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5).detach()
+        x_enc = x_enc / std_enc
+
+        # compute the mask_spectrum from the current batch during training
+        if train:
+            self.mask_spectrum = self._get_mask_spectrum([history_data[:, :, :, 0]])
+            self.disentanglement = FourierFilter(self.mask_spectrum)
+
+        # Koopman Forecasting
+        residual, forecast = x_enc, None
+        for i in range(self.num_blocks):
+            time_var_input, time_inv_input = self.disentanglement(residual)
+            time_inv_output = self.time_inv_kps[i](time_inv_input)
+            time_var_backcast, time_var_output = self.time_var_kps[i](time_var_input)
+            residual = residual - time_var_backcast
+            if forecast is None:
+                forecast = (time_inv_output + time_var_output)
+            else:
+                forecast += (time_inv_output + time_var_output)
+
+        # Series De-stationarization adopted from NSformer
+        res = forecast * std_enc + mean_enc
+        res = res.unsqueeze(-1)
+        return res
+
diff --git a/baselines/LightTS/Electricity.py b/baselines/LightTS/Electricity.py
new file mode 100644
index 0000000..91387f4
--- /dev/null
+++ b/baselines/LightTS/Electricity.py
@@ -0,0 +1,157 @@
+import os
+import sys
+from easydict import EasyDict
+sys.path.append(os.path.abspath(__file__ + '/../../..'))
+from basicts.metrics import masked_mae, masked_mse
+from basicts.data import TimeSeriesForecastingDataset
+from basicts.runners import SimpleTimeSeriesForecastingRunner
+from basicts.scaler import ZScoreScaler
+from basicts.utils import get_regular_settings
+
+from .arch import LightTS
+
+############################## Hot Parameters ##############################
+# Dataset & Metrics configuration
+DATA_NAME = 'Electricity'  # Dataset name
+regular_settings = get_regular_settings(DATA_NAME)
+INPUT_LEN = regular_settings['INPUT_LEN']  # Length of input sequence
+OUTPUT_LEN = regular_settings['OUTPUT_LEN']  # Length of output sequence
+TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO']  # Train/Validation/Test split ratios
+NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL']  # Whether to normalize each channel of the data
+RESCALE = regular_settings['RESCALE']  # Whether to rescale the data
+NULL_VAL = regular_settings['NULL_VAL']  # Null value in the data
+# Model architecture and parameters
+MODEL_ARCH = LightTS
+NUM_NODES = 321
+MODEL_PARAM = {
+    "enc_in": NUM_NODES,  # num nodes
+    "dec_in": NUM_NODES,
+    "c_out": NUM_NODES,
+    "seq_len": INPUT_LEN,
+    "label_len": INPUT_LEN/2,  # start token length used in decoder
+    "pred_len": OUTPUT_LEN,  # prediction sequence length
+    "factor": 3,  # attn factor
+    "chunk_size": 24,
+    "d_model": 32,  # 512
+    "moving_avg": 25,  # window size of moving average. This is a CRUCIAL hyper-parameter.
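+    # chunk_size above reshapes the length-seq_len input into [num_chunks, chunk_size] (see arch/lightts_arch.py)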
+ "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 64, # 2048 + "distil": True, + "sigma" : 0.2, + "dropout": 0.3, + "freq": 'h', + "use_norm" : True, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] 
+CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/LightTS/arch/__init__.py b/baselines/LightTS/arch/__init__.py new file mode 100644 index 0000000..dbf4711 --- /dev/null +++ b/baselines/LightTS/arch/__init__.py @@ -0,0 +1 @@ +from .lightts_arch import LightTS \ No newline at end of file diff --git a/baselines/LightTS/arch/lightts_arch.py b/baselines/LightTS/arch/lightts_arch.py new file mode 100644 index 0000000..eb6809d --- /dev/null +++ b/baselines/LightTS/arch/lightts_arch.py @@ -0,0 +1,135 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class IEBlock(nn.Module): + def __init__(self, input_dim, hid_dim, output_dim, num_node): + super(IEBlock, self).__init__() + + self.input_dim = input_dim + self.hid_dim = hid_dim + self.output_dim = output_dim + self.num_node = num_node + + self._build() + + def _build(self): + self.spatial_proj = nn.Sequential( + nn.Linear(self.input_dim, self.hid_dim), + nn.LeakyReLU(), + nn.Linear(self.hid_dim, self.hid_dim // 4) + ) + + self.channel_proj = nn.Linear(self.num_node, self.num_node) + torch.nn.init.eye_(self.channel_proj.weight) + + self.output_proj = nn.Linear(self.hid_dim // 4, self.output_dim) + + def forward(self, x): + x = self.spatial_proj(x.permute(0, 2, 1)) + x = x.permute(0, 2, 1) + self.channel_proj(x.permute(0, 2, 1)) + x = self.output_proj(x.permute(0, 2, 1)) + + x = x.permute(0, 2, 1) + + return x + + +class LightTS(nn.Module): + """ + Paper link: https://arxiv.org/abs/2207.01186 + """ + + def __init__(self, **model_args): + """ + chunk_size: int, reshape T into [num_chunks, chunk_size] + """ + super(LightTS, self).__init__() + + self.pred_len = model_args['pred_len'] + self.seq_len = model_args['seq_len'] + chunk_size = model_args['chunk_size'] + self.chunk_size = min(self.pred_len, self.seq_len, chunk_size) + + assert (self.seq_len % self.chunk_size == 0) + self.num_chunks = self.seq_len // self.chunk_size + + self.d_model = model_args['d_model'] + self.enc_in = model_args['enc_in'] + self.dropout = model_args["dropout"] + self._build() + + def _build(self): + self.layer_1 = IEBlock( + input_dim=self.chunk_size, + hid_dim=self.d_model // 4, + output_dim=self.d_model // 4, + num_node=self.num_chunks + ) + + self.chunk_proj_1 = nn.Linear(self.num_chunks, 1) + + self.layer_2 = IEBlock( + input_dim=self.chunk_size, + hid_dim=self.d_model // 4, + output_dim=self.d_model // 4, + num_node=self.num_chunks + ) + + self.chunk_proj_2 = nn.Linear(self.num_chunks, 1) + + self.layer_3 = IEBlock( + input_dim=self.d_model // 2, + hid_dim=self.d_model // 2, + output_dim=self.pred_len, + num_node=self.enc_in + ) + + self.ar = nn.Linear(self.seq_len, self.pred_len) + + def encoder(self, x): + B, T, N = x.size() + + highway = self.ar(x.permute(0, 2, 1)) + highway = highway.permute(0, 2, 1) + + # continuous sampling + x1 = x.reshape(B, self.num_chunks, self.chunk_size, N) + x1 = x1.permute(0, 3, 2, 1) + x1 = x1.reshape(-1, self.chunk_size, self.num_chunks) + x1 = self.layer_1(x1) + x1 = self.chunk_proj_1(x1).squeeze(dim=-1) + + # interval sampling + x2 = x.reshape(B, self.chunk_size, self.num_chunks, N) + x2 = x2.permute(0, 3, 1, 2) + x2 = x2.reshape(-1, self.chunk_size, self.num_chunks) + x2 = self.layer_2(x2) + x2 = self.chunk_proj_2(x2).squeeze(dim=-1) + + x3 = torch.cat([x1, x2], dim=-1) + + x3 = x3.reshape(B, N, -1) + x3 = x3.permute(0, 2, 1) + + out = self.layer_3(x3) + + out = out + highway + return out + + def forward(self, history_data: torch.Tensor, future_data: 
torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + """ + + Args: + history_data (Tensor): Input data with shape: [B, L1, N, C] + future_data (Tensor): Future data with shape: [B, L2, N, C] + + Returns: + torch.Tensor: outputs with shape [B, L2, N, 1] + """ + x_enc = history_data[:, :, :, 0] + dec_out = self.encoder(x_enc) + return dec_out[:, -self.pred_len:, :].unsqueeze(-1) # [B, L2, N, 1] + diff --git a/baselines/MTSMixer/Electricity.py b/baselines/MTSMixer/Electricity.py new file mode 100644 index 0000000..fe5de90 --- /dev/null +++ b/baselines/MTSMixer/Electricity.py @@ -0,0 +1,164 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import MTSMixer + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = MTSMixer +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 3, # attn factor + "p_hidden_dims": [128, 128], + "p_hidden_layers": 2, + "d_model": 512, + "individual": False, + "seg" : 20, + "fac_T" : False, + "sampling" : 2, + "fac_C" : False, + "rev" : True, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter.
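+ # NOTE: MTSMixer itself reads seq_len, pred_len, enc_in, dec_in, d_model, d_ff, use_norm, e_layers, + # fac_T, fac_C, sampling, individual and rev (see mtsmixer_arch.py below); the other fields are + # template filler shared across baseline configs.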
+ "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 16, + "distil": True, + "sigma" : 0.2, + "dropout": 0.05, + "freq": 'h', + "use_norm" : True, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 16 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = 
True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/MTSMixer/arch/Invertible.py b/baselines/MTSMixer/arch/Invertible.py new file mode 100644 index 0000000..6492b9a --- /dev/null +++ b/baselines/MTSMixer/arch/Invertible.py @@ -0,0 +1,105 @@ +import torch +import torch.nn as nn + +class ModifiedLayerNorm(nn.Module): + """ + Modified Layer Normalization normalizes vectors along the channel and temporal dimensions. + Input: tensor in shape [B, L, D] + """ + def __init__(self, num_channels, eps=1e-05): + super().__init__() + # The shape of the learnable affine parameters is also [num_channels, ], the same as in vanilla Layer Normalization. + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.eps = eps + + def forward(self, x): + x = x.transpose(1, 2) + u = x.mean([1, 2], keepdim=True) # mean over the channel and temporal dimensions + s = (x - u).pow(2).mean([1, 2], keepdim=True) # variance over the channel and temporal dimensions + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight.unsqueeze(-1) * x + self.bias.unsqueeze(-1) + + return x.transpose(1, 2) + + +class RevIN(nn.Module): + def __init__(self, num_features: int, eps=1e-5, affine=True): + """ + :param num_features: the number of features or channels + :param eps: a value added for numerical stability + :param affine: if True, RevIN has learnable affine parameters + """ + super(RevIN, self).__init__() + + self.num_features = num_features + self.eps = eps + self.affine = affine + + if self.affine: + self._init_params() + + def forward(self, x, mode:str): + if mode == 'norm': + self._get_statistics(x) + x = self._normalize(x) + + elif mode == 'denorm': + x = self._denormalize(x) + + else: raise NotImplementedError + + return x + + def _init_params(self): + # initialize RevIN params: (C,) + self.affine_weight = nn.Parameter(torch.ones(self.num_features)) + self.affine_bias = nn.Parameter(torch.zeros(self.num_features)) + + def _get_statistics(self, x): + dim2reduce = tuple(range(1, x.ndim-1)) + self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach() + self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach() + + def _normalize(self, x): + x = x - self.mean + x = x / self.stdev + if self.affine: + x = x * self.affine_weight + x = x + self.affine_bias + + return x + + def _denormalize(self, x): + if self.affine: + x = x - self.affine_bias + x = x / (self.affine_weight + self.eps*self.eps) + x = x * self.stdev + x = x + self.mean + + return x + + +class InvDiff(nn.Module): + def __init__(self, num_features: int): + super(InvDiff, self).__init__() + + self.num_features = num_features + self.pivot = None + + def forward(self, x, mode): + if mode == 'diff': + self.pivot = x[:, -1] + x = torch.diff(x, dim=1) + + return x + + elif mode == 'restore': + y = torch.zeros_like(x) + # undo the differencing cumulatively: y[0] = x[0] + pivot, y[t] = x[t] + y[t-1] + y[:, 0] = x[:, 0] + self.pivot + for idx in range(1, y.shape[1]): + y[:, idx] = x[:, idx] + y[:, idx-1] + + return y + + else: raise NotImplementedError diff --git a/baselines/MTSMixer/arch/Projection.py b/baselines/MTSMixer/arch/Projection.py new file mode 100644 index 0000000..94c955c --- /dev/null +++ b/baselines/MTSMixer/arch/Projection.py @@ -0,0 +1,25 @@ +import torch +import torch.nn as nn + +class ChannelProjection(nn.Module): + def __init__(self, seq_len, pred_len, num_channel, individual): + super().__init__() + + self.linears = nn.ModuleList([ + nn.Linear(seq_len, pred_len) for _ in range(num_channel) + ]) if
individual else nn.Linear(seq_len, pred_len) + self.individual = individual + + def forward(self, x): + # x: [B, L, D] + x_out = [] + if self.individual: + for idx in range(x.shape[-1]): + x_out.append(self.linears[idx](x[:, :, idx])) + + x = torch.stack(x_out, dim=-1) + + else: x = self.linears(x.transpose(1, 2)).transpose(1, 2) + + return x diff --git a/baselines/MTSMixer/arch/__init__.py b/baselines/MTSMixer/arch/__init__.py new file mode 100644 index 0000000..104d220 --- /dev/null +++ b/baselines/MTSMixer/arch/__init__.py @@ -0,0 +1 @@ +from .mtsmixer_arch import MTSMixer \ No newline at end of file diff --git a/baselines/MTSMixer/arch/decomposition.py b/baselines/MTSMixer/arch/decomposition.py new file mode 100644 index 0000000..08f76b3 --- /dev/null +++ b/baselines/MTSMixer/arch/decomposition.py @@ -0,0 +1,58 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import repeat, rearrange +from contextlib import contextmanager + +def svd_denoise(x, cut): + x_ = x.clone().detach() + U, S, V = torch.linalg.svd(x_, full_matrices=False) + S[:, cut:] = 0 + + return U @ torch.diag(S[0, :]) @ V + +@contextmanager +def null_context(): + yield + +def exists(val): + return val is not None + +def default(val, d): + return val if exists(val) else d + +class NMF(nn.Module): + def __init__(self, dim, n, ratio=8, K=6, eps=2e-8): + super().__init__() + r = dim // ratio + + D = torch.zeros(dim, r).uniform_(0, 1) + C = torch.zeros(r, n).uniform_(0, 1) + + self.K = K + self.D = nn.Parameter(D) + self.C = nn.Parameter(C) + + self.eps = eps + + def forward(self, x): + b, D, C, eps = x.shape[0], self.D, self.C, self.eps + + # x is made non-negative with ReLU, as proposed in the paper + x = F.relu(x) + + D = repeat(D, 'd r -> b d r', b = b) + C = repeat(C, 'r n -> b r n', b = b) + + # helper to transpose the last two dims + t = lambda tensor: rearrange(tensor, 'b i j -> b j i') + + for k in reversed(range(self.K)): + # only compute gradients on the last step, following the paper's 'one-step gradient' scheme + context = null_context if k == 0 else torch.no_grad + with context(): + C_new = C * ((t(D) @ x) / ((t(D) @ D @ C) + eps)) + D_new = D * ((x @ t(C)) / ((D @ C @ t(C)) + eps)) + C, D = C_new, D_new + + return D @ C \ No newline at end of file diff --git a/baselines/MTSMixer/arch/mtsmixer_arch.py b/baselines/MTSMixer/arch/mtsmixer_arch.py new file mode 100644 index 0000000..435f8bf --- /dev/null +++ b/baselines/MTSMixer/arch/mtsmixer_arch.py @@ -0,0 +1,128 @@ +import torch +import torch.nn as nn +from .Invertible import RevIN +from .Projection import ChannelProjection +from .decomposition import svd_denoise, NMF + +class MLPBlock(nn.Module): + def __init__(self, input_dim, mlp_dim): + super().__init__() + self.fc1 = nn.Linear(input_dim, mlp_dim) + self.gelu = nn.GELU() + self.fc2 = nn.Linear(mlp_dim, input_dim) + + def forward(self, x): + # [B, L, D] or [B, D, L] + return self.fc2(self.gelu(self.fc1(x))) + + +class FactorizedTemporalMixing(nn.Module): + def __init__(self, input_dim, mlp_dim, sampling): + super().__init__() + + assert sampling in [1, 2, 3, 4, 6, 8, 12] + self.sampling = sampling + self.temporal_fac = nn.ModuleList([ + MLPBlock(input_dim // sampling, mlp_dim) for _ in range(sampling) + ]) + + def merge(self, shape, x_list): + y = torch.zeros(shape, device=x_list[0].device) + for idx, x_pad in enumerate(x_list): + y[:, :, idx::self.sampling] = x_pad + + return y + + def forward(self, x): + x_samp = [] + for idx, samp in enumerate(self.temporal_fac):
x_samp.append(samp(x[:, :, idx::self.sampling])) + + x = self.merge(x.shape, x_samp) + + return x + + +class FactorizedChannelMixing(nn.Module): + def __init__(self, input_dim, factorized_dim): + super().__init__() + + assert input_dim > factorized_dim + self.channel_mixing = MLPBlock(input_dim, factorized_dim) + + def forward(self, x): + + return self.channel_mixing(x) + + +class MixerBlock(nn.Module): + def __init__(self, tokens_dim, channels_dim, tokens_hidden_dim, channels_hidden_dim, fac_T, fac_C, sampling, norm_flag): + super().__init__() + self.tokens_mixing = FactorizedTemporalMixing(tokens_dim, tokens_hidden_dim, sampling) if fac_T else MLPBlock(tokens_dim, tokens_hidden_dim) + self.channels_mixing = FactorizedChannelMixing(channels_dim, channels_hidden_dim) if fac_C else None + self.norm = nn.LayerNorm(channels_dim) if norm_flag else None + + def forward(self, x): + # token-mixing [B, D, #tokens] + y = self.norm(x) if self.norm else x + y = self.tokens_mixing(y.transpose(1, 2)).transpose(1, 2) + + # channel-mixing [B, #tokens, D] + if self.channels_mixing: + y += x + res = y + y = self.norm(y) if self.norm else y + y = res + self.channels_mixing(y) + + return y + + +class MTSMixer(nn.Module): + def __init__(self, **model_args): + super(MTSMixer, self).__init__() + self.pred_len = model_args['pred_len'] + self.seq_len = model_args['seq_len'] + self.enc_in = model_args['enc_in'] + self.dec_in = model_args['dec_in'] + self.d_model = model_args['d_model'] + self.d_ff = model_args['d_ff'] + self.norm = model_args['use_norm'] + self.e_layers = model_args['e_layers'] + self.fac_T = model_args['fac_T'] + self.fac_C = model_args['fac_C'] + self.sampling = model_args['sampling'] + self.individual = model_args['individual'] + self.rev = model_args['rev'] + self.mlp_blocks = nn.ModuleList([ + MixerBlock(self.seq_len, self.enc_in, self.d_model, self.d_ff, self.fac_T, self.fac_C, self.sampling, self.norm) + for _ in range(self.e_layers) + ]) + self.norm = nn.LayerNorm(self.enc_in) if self.norm else None + self.projection = ChannelProjection(self.seq_len, self.pred_len, self.enc_in, self.individual) + self.rev = RevIN(self.enc_in) if self.rev else None + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + """ + + Args: + history_data (Tensor): Input data with shape: [B, L1, N, C] + future_data (Tensor): Future data with shape: [B, L2, N, C] + + Returns: + torch.Tensor: outputs with shape [B, L2, N, 1] + """ + x = history_data[:, :, :, 0] + x = self.rev(x, 'norm') if self.rev else x + + for block in self.mlp_blocks: + x = block(x) + + x = self.norm(x) if self.norm else x + x = self.projection(x) + x = self.rev(x, 'denorm') if self.rev else x + + return x.unsqueeze(-1) diff --git a/baselines/Nonstationary_Transformer/Electricity.py b/baselines/Nonstationary_Transformer/Electricity.py new file mode 100644 index 0000000..971592e --- /dev/null +++ b/baselines/Nonstationary_Transformer/Electricity.py @@ -0,0 +1,157 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import
SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import Nonstationary_Transformer + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = Nonstationary_Transformer +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 3, # attn factor + "p_hidden_dims": [128, 128], + "p_hidden_layers": 2, + "d_model": 512, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 2048, + "distil": True, + "sigma" : 0.2, + "dropout": 0.05, + "freq": 'h', + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration 
############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/Nonstationary_Transformer/arch/Embed.py b/baselines/Nonstationary_Transformer/arch/Embed.py new file mode 100644 index 0000000..6ef5503 --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/Embed.py @@ -0,0 +1,132 @@ +import torch +import torch.nn as nn +import math + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. 
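+ # PE(pos, 2i) = sin(pos / 10000^(2i/d_model)); PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))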
+ pe = torch.zeros(max_len, d_model).float() + pe.requires_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.requires_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + + minute_x = self.minute_embed(x[:, :, 4]) if hasattr(self, 'minute_embed') else 0.
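+ # time-mark columns are assumed to be ordered [month, day, weekday, hour, minute]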
+ hour_x = self.hour_embed(x[:, :, 3]) + weekday_x = self.weekday_embed(x[:, :, 2]) + day_x = self.day_embed(x[:, :, 1]) + month_x = self.month_embed(x[:, :, 0]) + + return hour_x + weekday_x + day_x + month_x + minute_x + + +class TimeFeatureEmbedding(nn.Module): + def __init__(self, d_model, embed_type='timeF', freq='h'): + super(TimeFeatureEmbedding, self).__init__() + + freq_map = {'h': 4, 't': 5, 's': 6, 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3} + d_inp = freq_map[freq] + self.embed = nn.Linear(d_inp, d_model, bias=False) + + def forward(self, x): + return self.embed(x) + + +class DataEmbedding(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + x = self.value_embedding(x) + self.temporal_embedding(x_mark) + self.position_embedding(x) + return self.dropout(x) + + +class DataEmbedding_wo_pos(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding_wo_pos, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + x = self.value_embedding(x) + self.temporal_embedding(x_mark) + return self.dropout(x) diff --git a/baselines/Nonstationary_Transformer/arch/SelfAttention_Family.py b/baselines/Nonstationary_Transformer/arch/SelfAttention_Family.py new file mode 100644 index 0000000..0e25b6f --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/SelfAttention_Family.py @@ -0,0 +1,172 @@ +import torch +import torch.nn as nn +import numpy as np +from math import sqrt +from .masking import TriangularCausalMask, ProbMask + + +class DSAttention(nn.Module): + '''De-stationary Attention''' + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(DSAttention, self).__init__() + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, H, E = queries.shape + _, S, _, D = values.shape + scale = self.scale or 1. 
/ sqrt(E) + + tau = 1.0 if tau is None else tau.unsqueeze(1).unsqueeze(1) # B x 1 x 1 x 1 + delta = 0.0 if delta is None else delta.unsqueeze(1).unsqueeze(1) # B x 1 x 1 x S + + # De-stationary Attention, rescaling pre-softmax score with learned de-stationary factors + scores = torch.einsum("blhe,bshe->bhls", queries, keys) * tau + delta + + if self.mask_flag: + if attn_mask is None: + attn_mask = TriangularCausalMask(B, L, device=queries.device) + + scores.masked_fill_(attn_mask.mask, -np.inf) + + A = self.dropout(torch.softmax(scale * scores, dim=-1)) + V = torch.einsum("bhls,bshd->blhd", A, values) + + if self.output_attention: + return (V.contiguous(), A) + else: + return (V.contiguous(), None) + + +class DSProbAttention(nn.Module): + '''De-stationary ProbAttention for Informer''' + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(DSProbAttention, self).__init__() + self.factor = factor + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def _prob_QK(self, Q, K, sample_k, n_top): # n_top: c*ln(L_q) + # Q [B, H, L, D] + B, H, L_K, E = K.shape + _, _, L_Q, _ = Q.shape + + # calculate the sampled Q_K + K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E) + index_sample = torch.randint(L_K, (L_Q, sample_k)) # real U = U_part(factor*ln(L_k))*L_q + K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :] + Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze() + + # find the top-k queries with the sparsity measurement + M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K) + M_top = M.topk(n_top, sorted=False)[1] + + # use the reduced Q to calculate Q_K + Q_reduce = Q[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + M_top, :] # factor*ln(L_q) + Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1)) # factor*ln(L_q)*L_k + + return Q_K, M_top + + def _get_initial_context(self, V, L_Q): + B, H, L_V, D = V.shape + if not self.mask_flag: + V_sum = V.mean(dim=-2) + contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone() + else: # use mask + assert (L_Q == L_V) # requires that L_Q == L_V, i.e.
for self-attention only + contex = V.cumsum(dim=-2) + return contex + + def _update_context(self, context_in, V, scores, index, L_Q, attn_mask): + B, H, L_V, D = V.shape + + if self.mask_flag: + attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device) + scores.masked_fill_(attn_mask.mask, -np.inf) + + attn = torch.softmax(scores, dim=-1) # nn.Softmax(dim=-1)(scores) + + context_in[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + index, :] = torch.matmul(attn, V).type_as(context_in) + if self.output_attention: + attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device) + attns[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = attn + return (context_in, attns) + else: + return (context_in, None) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L_Q, H, D = queries.shape + _, L_K, _, _ = keys.shape + + queries = queries.transpose(2, 1) + keys = keys.transpose(2, 1) + values = values.transpose(2, 1) + + U_part = self.factor * np.ceil(np.log(L_K)).astype('int').item() # c*ln(L_k) + u = self.factor * np.ceil(np.log(L_Q)).astype('int').item() # c*ln(L_q) + + U_part = U_part if U_part < L_K else L_K + u = u if u < L_Q else L_Q + + scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u) + + tau = 1.0 if tau is None else tau.unsqueeze(1).unsqueeze(1) # B x 1 x 1 x 1 + delta = 0.0 if delta is None else delta.unsqueeze(1).unsqueeze(1) # B x 1 x 1 x S + scores_top = scores_top * tau + delta + + # add scale factor + scale = self.scale or 1. / sqrt(D) + if scale is not None: + scores_top = scores_top * scale + # get the context + context = self._get_initial_context(values, L_Q) + # update the context with selected top_k queries + context, attn = self._update_context(context, values, scores_top, index, L_Q, attn_mask) + + return context.contiguous(), attn + + +class AttentionLayer(nn.Module): + def __init__(self, attention, d_model, n_heads, d_keys=None, + d_values=None): + super(AttentionLayer, self).__init__() + + d_keys = d_keys or (d_model // n_heads) + d_values = d_values or (d_model // n_heads) + + self.inner_attention = attention + self.query_projection = nn.Linear(d_model, d_keys * n_heads) + self.key_projection = nn.Linear(d_model, d_keys * n_heads) + self.value_projection = nn.Linear(d_model, d_values * n_heads) + self.out_projection = nn.Linear(d_values * n_heads, d_model) + self.n_heads = n_heads + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, _ = queries.shape + _, S, _ = keys.shape + H = self.n_heads + + queries = self.query_projection(queries).view(B, L, H, -1) + keys = self.key_projection(keys).view(B, S, H, -1) + values = self.value_projection(values).view(B, S, H, -1) + + out, attn = self.inner_attention( + queries, + keys, + values, + attn_mask, + tau, delta + ) + out = out.view(B, L, -1) + + return self.out_projection(out), attn + diff --git a/baselines/Nonstationary_Transformer/arch/Transformer_EncDec.py b/baselines/Nonstationary_Transformer/arch/Transformer_EncDec.py new file mode 100644 index 0000000..0e63f5e --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/Transformer_EncDec.py @@ -0,0 +1,143 @@ +import torch.nn as nn +import torch.nn.functional as F + + +class ConvLayer(nn.Module): + def __init__(self, c_in): + super(ConvLayer, self).__init__() + self.downConv = nn.Conv1d(in_channels=c_in, + out_channels=c_in, + kernel_size=3, + padding=2, + padding_mode='circular') + self.norm = nn.BatchNorm1d(c_in) + 
self.activation = nn.ELU() + self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1) + + def forward(self, x): + x = self.downConv(x.permute(0, 2, 1)) # BxExS + x = self.norm(x) + x = self.activation(x) + x = self.maxPool(x) + x = x.transpose(1, 2) + return x + + +class EncoderLayer(nn.Module): + def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.attention = attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, attn_mask=None, tau=None, delta=None): + new_x, attn = self.attention( + x, x, x, + attn_mask=attn_mask, + tau=tau, delta=delta + ) + x = x + self.dropout(new_x) + + y = x = self.norm1(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm2(x + y), attn + + +class Encoder(nn.Module): + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.norm = norm_layer + + def forward(self, x, attn_mask=None, tau=None, delta=None): + # x [B, L, D] + attns = [] + if self.conv_layers is not None: + # delta is only passed to the first attention block of the encoder: with Informer-style + # conv distillation, the attention length shrinks at every layer and no longer matches the + # initial sequence length, so delta cannot be added row-wise in deeper layers and is set + # to None (i.e. 0.0) there; see Appendix E.2 of the Non-stationary Transformer paper. + for i, (attn_layer, conv_layer) in enumerate(zip(self.attn_layers, self.conv_layers)): + delta = delta if i==0 else None + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + x = conv_layer(x) + attns.append(attn) + x, attn = self.attn_layers[-1](x, tau=tau, delta=None) + attns.append(attn) + else: + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + + return x, attns + + +class DecoderLayer(nn.Module): + def __init__(self, self_attention, cross_attention, d_model, d_ff=None, + dropout=0.1, activation="relu"): + super(DecoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + # Note: delta is only used for self-attention over x_enc (in the encoder) and for + # cross-attention of x_dec over x_enc; it is not applied to self-attention over x_dec + + x = x + self.dropout(self.self_attention( + x, x, x, + attn_mask=x_mask, + tau=tau, delta=None + )[0]) + x = self.norm1(x) + + x = x +
self.dropout(self.cross_attention( + x, cross, cross, + attn_mask=cross_mask, + tau=tau, delta=delta + )[0]) + + y = x = self.norm2(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm3(x + y) + + +class Decoder(nn.Module): + def __init__(self, layers, norm_layer=None, projection=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + for layer in self.layers: + x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask, tau=tau, delta=delta) + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) + return x diff --git a/baselines/Nonstationary_Transformer/arch/__init__.py b/baselines/Nonstationary_Transformer/arch/__init__.py new file mode 100644 index 0000000..c0a0fd5 --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/__init__.py @@ -0,0 +1 @@ +from .nstransformer_arch import Nonstationary_Transformer \ No newline at end of file diff --git a/baselines/Nonstationary_Transformer/arch/masking.py b/baselines/Nonstationary_Transformer/arch/masking.py new file mode 100644 index 0000000..a19cbf6 --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/masking.py @@ -0,0 +1,26 @@ +import torch + + +class TriangularCausalMask(): + def __init__(self, B, L, device="cpu"): + mask_shape = [B, 1, L, L] + with torch.no_grad(): + self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device) + + @property + def mask(self): + return self._mask + + +class ProbMask(): + def __init__(self, B, H, L, index, scores, device="cpu"): + _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1) + _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1]) + indicator = _mask_ex[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + index, :].to(device) + self._mask = indicator.view(scores.shape).to(device) + + @property + def mask(self): + return self._mask diff --git a/baselines/Nonstationary_Transformer/arch/nstransformer_arch.py b/baselines/Nonstationary_Transformer/arch/nstransformer_arch.py new file mode 100644 index 0000000..402d282 --- /dev/null +++ b/baselines/Nonstationary_Transformer/arch/nstransformer_arch.py @@ -0,0 +1,161 @@ +import torch +import torch.nn as nn +from .Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer +from .SelfAttention_Family import DSAttention, AttentionLayer +from .Embed import DataEmbedding +from basicts.utils import data_transformation_4_xformer +class Projector(nn.Module): + ''' + MLP to learn the De-stationary factors + ''' + + def __init__(self, enc_in, seq_len, hidden_dims, hidden_layers, output_dim, kernel_size=3): + super(Projector, self).__init__() + + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.series_conv = nn.Conv1d(in_channels=seq_len, out_channels=1, kernel_size=kernel_size, padding=padding, + padding_mode='circular', bias=False) + + layers = [nn.Linear(2 * enc_in, hidden_dims[0]), nn.ReLU()] + for i in range(hidden_layers - 1): + layers += [nn.Linear(hidden_dims[i], hidden_dims[i + 1]), nn.ReLU()] + + layers += [nn.Linear(hidden_dims[-1], output_dim, bias=False)] + self.backbone = nn.Sequential(*layers) + + def forward(self, x, stats): + # x: B x S x E + # stats: B x 1 x E + # y: B x O + batch_size = x.shape[0] + x = self.series_conv(x) # B x 1 x E + x = 
torch.cat([x, stats], dim=1) # B x 2 x E + x = x.view(batch_size, -1) # B x 2E + y = self.backbone(x) # B x O + + return y + + +class Nonstationary_Transformer(nn.Module): + """ + Non-stationary Transformer + """ + + def __init__(self, **model_args): + super(Nonstationary_Transformer, self).__init__() + self.pred_len = model_args['pred_len'] + self.seq_len = model_args['seq_len'] + self.label_len = int(model_args['label_len']) + self.output_attention = model_args['output_attention'] + self.enc_in = model_args['enc_in'] + self.dec_in = model_args['dec_in'] + self.c_out = model_args['c_out'] + self.factor = model_args["factor"] + self.d_model = model_args['d_model'] + self.n_heads = model_args['n_heads'] + self.d_ff = model_args['d_ff'] + self.embed = model_args['embed'] + self.freq = model_args["freq"] + self.dropout = model_args["dropout"] + self.activation = model_args['activation'] + self.e_layers = model_args['e_layers'] + self.d_layers = model_args['d_layers'] + self.p_hidden_dims = model_args['p_hidden_dims'] + self.p_hidden_layers = model_args['p_hidden_layers'] + # Embedding + self.enc_embedding = DataEmbedding(self.enc_in, self.d_model, self.embed, self.freq, + self.dropout) + self.dec_embedding = DataEmbedding(self.dec_in, self.d_model, self.embed, self.freq, + self.dropout) + # Encoder + self.encoder = Encoder( + [ + EncoderLayer( + AttentionLayer( + DSAttention(False, self.factor, attention_dropout=self.dropout, + output_attention=self.output_attention), self.d_model, self.n_heads), + self.d_model, + self.d_ff, + dropout=self.dropout, + activation=self.activation + ) for l in range(self.e_layers) + ], + norm_layer=torch.nn.LayerNorm(self.d_model) + ) + # Decoder + self.decoder = Decoder( + [ + DecoderLayer( + AttentionLayer( + DSAttention(True, self.factor, attention_dropout=self.dropout, output_attention=False), + self.d_model, self.n_heads), + AttentionLayer( + DSAttention(False, self.factor, attention_dropout=self.dropout, output_attention=False), + self.d_model, self.n_heads), + self.d_model, + self.d_ff, + dropout=self.dropout, + activation=self.activation, + ) + for l in range(self.d_layers) + ], + norm_layer=torch.nn.LayerNorm(self.d_model), + projection=nn.Linear(self.d_model, self.c_out, bias=True) + ) + + self.tau_learner = Projector(enc_in=self.enc_in, seq_len=self.seq_len, hidden_dims=self.p_hidden_dims, + hidden_layers=self.p_hidden_layers, output_dim=1) + self.delta_learner = Projector(enc_in=self.enc_in, seq_len=self.seq_len, + hidden_dims=self.p_hidden_dims, hidden_layers=self.p_hidden_layers, + output_dim=self.seq_len) + + def forward_xformer(self, x_enc: torch.Tensor, x_mark_enc: torch.Tensor, x_dec: torch.Tensor, + x_mark_dec: torch.Tensor, + enc_self_mask: torch.Tensor = None, dec_self_mask: torch.Tensor = None, + dec_enc_mask: torch.Tensor = None) -> torch.Tensor: + x_raw = x_enc.clone().detach() + + # Normalization + mean_enc = x_enc.mean(1, keepdim=True).detach() # B x 1 x E + x_enc = x_enc - mean_enc + std_enc = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5).detach() # B x 1 x E + x_enc = x_enc / std_enc + + x_dec_new = torch.cat([x_enc[:, -self.label_len:, :], torch.zeros_like(x_dec[:, -self.pred_len:, :])], + dim=1).to(x_enc.device).clone() + + tau = self.tau_learner(x_raw, std_enc).exp() # B x S x E, B x 1 x E -> B x 1, positive scalar + delta = self.delta_learner(x_raw, mean_enc) # B x S x E, B x 1 x E -> B x S + + # Model Inference + enc_out = self.enc_embedding(x_enc, x_mark_enc) + enc_out, attns = self.encoder(enc_out, 
attn_mask=enc_self_mask, tau=tau, delta=delta) + + dec_out = self.dec_embedding(x_dec_new, x_mark_dec) + dec_out = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask, tau=tau, delta=delta) + + # De-normalization + dec_out = dec_out * std_enc + mean_enc + + if self.output_attention: + return dec_out[:, -self.pred_len:, :].unsqueeze(-1), attns + else: + return dec_out[:, -self.pred_len:, :].unsqueeze(-1) # [B, L, D] + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + """ + + Args: + history_data (Tensor): Input data with shape: [B, L1, N, C] + future_data (Tensor): Future data with shape: [B, L2, N, C] + + Returns: + torch.Tensor: outputs with shape [B, L2, N, 1] + """ + + x_enc, x_mark_enc, x_dec, x_mark_dec = data_transformation_4_xformer(history_data=history_data, + future_data=future_data, + start_token_len=self.label_len) + prediction = self.forward_xformer(x_enc=x_enc, x_mark_enc=x_mark_enc, x_dec=x_dec, x_mark_dec=x_mark_dec) + return prediction \ No newline at end of file diff --git a/baselines/SegRNN/Electricity.py b/baselines/SegRNN/Electricity.py new file mode 100644 index 0000000..740a584 --- /dev/null +++ b/baselines/SegRNN/Electricity.py @@ -0,0 +1,152 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import SegRNN + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = SegRNN +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "seg_len": 48, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 1, # attn factor + "d_model": 512, # 512 + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. 
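+ # NOTE: SegRNN itself is expected to read only seq_len, pred_len, seg_len, d_model, dropout and enc_in + # (see segrnn_arch.py below); the remaining fields are template filler shared across baseline configs.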
+ "dropout": 0.1, + "freq": 'h', + "use_norm" : False, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0005, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation.
Default: True diff --git a/baselines/SegRNN/arch/Autoformer_EncDec.py b/baselines/SegRNN/arch/Autoformer_EncDec.py new file mode 100644 index 0000000..6fce4bc --- /dev/null +++ b/baselines/SegRNN/arch/Autoformer_EncDec.py @@ -0,0 +1,203 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class my_Layernorm(nn.Module): + """ + Special designed layernorm for the seasonal part + """ + + def __init__(self, channels): + super(my_Layernorm, self).__init__() + self.layernorm = nn.LayerNorm(channels) + + def forward(self, x): + x_hat = self.layernorm(x) + bias = torch.mean(x_hat, dim=1).unsqueeze(1).repeat(1, x.shape[1], 1) + return x_hat - bias + + +class moving_avg(nn.Module): + """ + Moving average block to highlight the trend of time series + """ + + def __init__(self, kernel_size, stride): + super(moving_avg, self).__init__() + self.kernel_size = kernel_size + self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0) + + def forward(self, x): + # padding on the both ends of time series + front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1) + end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1) + x = torch.cat([front, x, end], dim=1) + x = self.avg(x.permute(0, 2, 1)) + x = x.permute(0, 2, 1) + return x + + +class series_decomp(nn.Module): + """ + Series decomposition block + """ + + def __init__(self, kernel_size): + super(series_decomp, self).__init__() + self.moving_avg = moving_avg(kernel_size, stride=1) + + def forward(self, x): + moving_mean = self.moving_avg(x) + res = x - moving_mean + return res, moving_mean + + +class series_decomp_multi(nn.Module): + """ + Multiple Series decomposition block from FEDformer + """ + + def __init__(self, kernel_size): + super(series_decomp_multi, self).__init__() + self.kernel_size = kernel_size + self.series_decomp = [series_decomp(kernel) for kernel in kernel_size] + + def forward(self, x): + moving_mean = [] + res = [] + for func in self.series_decomp: + sea, moving_avg = func(x) + moving_mean.append(moving_avg) + res.append(sea) + + sea = sum(res) / len(res) + moving_mean = sum(moving_mean) / len(moving_mean) + return sea, moving_mean + + +class EncoderLayer(nn.Module): + """ + Autoformer encoder layer with the progressive decomposition architecture + """ + + def __init__(self, attention, d_model, d_ff=None, moving_avg=25, dropout=0.1, activation="relu"): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.attention = attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.decomp1 = series_decomp(moving_avg) + self.decomp2 = series_decomp(moving_avg) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, attn_mask=None): + new_x, attn = self.attention( + x, x, x, + attn_mask=attn_mask + ) + x = x + self.dropout(new_x) + x, _ = self.decomp1(x) + y = x + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + res, _ = self.decomp2(x + y) + return res, attn + + +class Encoder(nn.Module): + """ + Autoformer encoder + """ + + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.norm = norm_layer + + def 
forward(self, x, attn_mask=None): + attns = [] + if self.conv_layers is not None: + for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers): + x, attn = attn_layer(x, attn_mask=attn_mask) + x = conv_layer(x) + attns.append(attn) + x, attn = self.attn_layers[-1](x) + attns.append(attn) + else: + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + + return x, attns + + +class DecoderLayer(nn.Module): + """ + Autoformer decoder layer with the progressive decomposition architecture + """ + + def __init__(self, self_attention, cross_attention, d_model, c_out, d_ff=None, + moving_avg=25, dropout=0.1, activation="relu"): + super(DecoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.decomp1 = series_decomp(moving_avg) + self.decomp2 = series_decomp(moving_avg) + self.decomp3 = series_decomp(moving_avg) + self.dropout = nn.Dropout(dropout) + self.projection = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=3, stride=1, padding=1, + padding_mode='circular', bias=False) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None): + x = x + self.dropout(self.self_attention( + x, x, x, + attn_mask=x_mask + )[0]) + x, trend1 = self.decomp1(x) + x = x + self.dropout(self.cross_attention( + x, cross, cross, + attn_mask=cross_mask + )[0]) + x, trend2 = self.decomp2(x) + y = x + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + x, trend3 = self.decomp3(x + y) + + residual_trend = trend1 + trend2 + trend3 + residual_trend = self.projection(residual_trend.permute(0, 2, 1)).transpose(1, 2) + return x, residual_trend + + +class Decoder(nn.Module): + """ + Autoformer encoder + """ + + def __init__(self, layers, norm_layer=None, projection=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection + + def forward(self, x, cross, x_mask=None, cross_mask=None, trend=None): + for layer in self.layers: + x, residual_trend = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask) + trend = trend + residual_trend + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) + return x, trend diff --git a/baselines/SegRNN/arch/__init__.py b/baselines/SegRNN/arch/__init__.py new file mode 100644 index 0000000..0b5ca8e --- /dev/null +++ b/baselines/SegRNN/arch/__init__.py @@ -0,0 +1 @@ +from .segrnn_arch import SegRNN \ No newline at end of file diff --git a/baselines/SegRNN/arch/segrnn_arch.py b/baselines/SegRNN/arch/segrnn_arch.py new file mode 100644 index 0000000..3769833 --- /dev/null +++ b/baselines/SegRNN/arch/segrnn_arch.py @@ -0,0 +1,83 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from .Autoformer_EncDec import series_decomp + + +class SegRNN(nn.Module): + """ + Paper link: https://arxiv.org/abs/2308.11200.pdf + """ + + def __init__(self, **model_args): + super(SegRNN, self).__init__() + + # get parameters + self.pred_len = model_args['pred_len'] + self.seq_len = model_args['seq_len'] + self.d_model = model_args['d_model'] + 
self.dropout = model_args["dropout"] + self.seg_len = model_args['seg_len'] + self.seg_num_x = self.seq_len // self.seg_len + self.seg_num_y = self.pred_len // self.seg_len + self.enc_in = model_args['enc_in'] + + # building model + self.valueEmbedding = nn.Sequential( + nn.Linear(self.seg_len, self.d_model), + nn.ReLU() + ) + self.rnn = nn.GRU(input_size=self.d_model, hidden_size=self.d_model, num_layers=1, bias=True, + batch_first=True, bidirectional=False) + self.pos_emb = nn.Parameter(torch.randn(self.seg_num_y, self.d_model // 2)) + self.channel_emb = nn.Parameter(torch.randn(self.enc_in, self.d_model // 2)) + + self.predict = nn.Sequential( + nn.Dropout(self.dropout), + nn.Linear(self.d_model, self.seg_len) + ) + + def encoder(self, x): + # b:batch_size c:channel_size s:seq_len s:seq_len + # d:d_model w:seg_len n:seg_num_x m:seg_num_y + batch_size = x.size(0) + + # normalization and permute b,s,c -> b,c,s + seq_last = x[:, -1:, :].detach() + x = (x - seq_last).permute(0, 2, 1) # b,c,s + + # segment and embedding b,c,s -> bc,n,w -> bc,n,d + x = self.valueEmbedding(x.reshape(-1, self.seg_num_x, self.seg_len)) + + # encoding + _, hn = self.rnn(x) # bc,n,d 1,bc,d + + # m,d//2 -> 1,m,d//2 -> c,m,d//2 + # c,d//2 -> c,1,d//2 -> c,m,d//2 + # c,m,d -> cm,1,d -> bcm, 1, d + pos_emb = torch.cat([ + self.pos_emb.unsqueeze(0).repeat(self.enc_in, 1, 1), + self.channel_emb.unsqueeze(1).repeat(1, self.seg_num_y, 1) + ], dim=-1).view(-1, 1, self.d_model).repeat(batch_size,1,1) + + _, hy = self.rnn(pos_emb, hn.repeat(1, 1, self.seg_num_y).view(1, -1, self.d_model)) # bcm,1,d 1,bcm,d + + # 1,bcm,d -> 1,bcm,w -> b,c,s + y = self.predict(hy).view(-1, self.enc_in, self.pred_len) + + # permute and denorm + y = y.permute(0, 2, 1) + seq_last + return y + + def forecast(self, x_enc): + # Encoder + return self.encoder(x_enc) + + + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + x_enc = history_data[:, :, :, 0] + dec_out = self.forecast(x_enc) + return dec_out[:, -self.pred_len:, :].unsqueeze(-1) # [B, L, D] + diff --git a/baselines/SparseTSF/Electricity.py b/baselines/SparseTSF/Electricity.py new file mode 100644 index 0000000..94e01ee --- /dev/null +++ b/baselines/SparseTSF/Electricity.py @@ -0,0 +1,158 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import SparseTSF +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = SparseTSF +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + 
"dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "period_len" : 24, + "factor": 1, # attn factor + "p_hidden_dims": [128, 128], + "p_hidden_layers": 2, + "d_model": 512, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 2, # num of decoder layers + "d_ff": 2048, + "distil": True, + "sigma" : 0.2, + "dropout": 0.1, + "freq": 'h', + "use_norm" : False, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.02, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + 
+############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/SparseTSF/arch/Embed.py b/baselines/SparseTSF/arch/Embed.py new file mode 100644 index 0000000..1202616 --- /dev/null +++ b/baselines/SparseTSF/arch/Embed.py @@ -0,0 +1,234 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import weight_norm +import math + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. + pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + minute_x = self.minute_embed(x[:, :, 4]) if hasattr( + self, 'minute_embed') else 0. 
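+        # x_mark channel order assumed here: [month, day, weekday, hour, minute]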
+        hour_x = self.hour_embed(x[:, :, 3])
+        weekday_x = self.weekday_embed(x[:, :, 2])
+        day_x = self.day_embed(x[:, :, 1])
+        month_x = self.month_embed(x[:, :, 0])
+
+        return hour_x + weekday_x + day_x + month_x + minute_x
+
+
+class TimeFeatureEmbedding(nn.Module):
+    def __init__(self, d_model, embed_type='timeF', freq='h'):
+        super(TimeFeatureEmbedding, self).__init__()
+
+        freq_map = {'h': 4, 't': 5, 's': 6,
+                    'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3}
+        d_inp = freq_map[freq]
+        self.embed = nn.Linear(d_inp, d_model, bias=False)
+
+    def forward(self, x):
+        return self.embed(x)
+
+
+class DataEmbedding(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding, self).__init__()
+        self.c_in = c_in
+        self.d_model = d_model
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        _, _, N = x.size()
+        if N == self.c_in:
+            if x_mark is None:
+                x = self.value_embedding(x) + self.position_embedding(x)
+            else:
+                x = self.value_embedding(
+                    x) + self.temporal_embedding(x_mark) + self.position_embedding(x)
+        elif N == self.d_model:
+            if x_mark is None:
+                x = x + self.position_embedding(x)
+            else:
+                x = x + self.temporal_embedding(x_mark) + self.position_embedding(x)
+
+        return self.dropout(x)
+
+
+class DataEmbedding_ms(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding_ms, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=1, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        B, T, N = x.shape
+        # embed each series independently: [B, T, N] -> [B*N, T, 1] -> [B, T, N, d_model]
+        x1 = self.value_embedding(x.permute(0, 2, 1).reshape(B * N, T).unsqueeze(-1)).reshape(B, N, T, -1).permute(0, 2,
+                                                                                                                   1, 3)
+        if x_mark is None:
+            x = x1
+        else:
+            x = x1 + self.temporal_embedding(x_mark)
+        return self.dropout(x)
+
+
+class DataEmbedding_wo_pos(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding_wo_pos, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        if x is None and x_mark is not None:
+            return self.temporal_embedding(x_mark)
+        if x_mark is None:
+            x = self.value_embedding(x)
+        else:
+            x = self.value_embedding(x) + self.temporal_embedding(x_mark)
+        return self.dropout(x)
+
+
+class PatchEmbedding_crossformer(nn.Module):
+    def __init__(self, d_model, patch_len, stride, padding, dropout):
+        super(PatchEmbedding_crossformer, self).__init__()
+        # Patching
+        self.patch_len = patch_len
+        self.stride = stride
+        self.padding_patch_layer = nn.ReplicationPad1d((0, padding))
+
+        # 
Backbone, Input encoding: projection of feature vectors onto a d-dim vector space + self.value_embedding = nn.Linear(patch_len, d_model, bias=False) + + # Positional embedding + self.position_embedding = PositionalEmbedding(d_model) + + # Residual dropout + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x), n_vars + + +class PatchEmbedding(nn.Module): + def __init__(self, d_model, patch_len, stride, dropout): + super(PatchEmbedding, self).__init__() + # Patching + self.patch_len = patch_len + self.stride = stride + self.padding_patch_layer = nn.ReplicationPad1d((0, stride)) + + # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space + self.value_embedding = TokenEmbedding(patch_len, d_model) + + # Positional embedding + self.position_embedding = PositionalEmbedding(d_model) + + # Residual dropout + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x), n_vars diff --git a/baselines/SparseTSF/arch/__init__.py b/baselines/SparseTSF/arch/__init__.py new file mode 100644 index 0000000..245c6b1 --- /dev/null +++ b/baselines/SparseTSF/arch/__init__.py @@ -0,0 +1 @@ +from .sparsetsf_arch import SparseTSF diff --git a/baselines/SparseTSF/arch/sparsetsf_arch.py b/baselines/SparseTSF/arch/sparsetsf_arch.py new file mode 100644 index 0000000..9afe450 --- /dev/null +++ b/baselines/SparseTSF/arch/sparsetsf_arch.py @@ -0,0 +1,46 @@ +import torch +import torch.nn as nn +from .Embed import PositionalEmbedding + +class SparseTSF(nn.Module): + def __init__(self, **model_args): + super(SparseTSF, self).__init__() + + # get parameters + self.seq_len = model_args['seq_len'] + self.pred_len = model_args['pred_len'] + self.enc_in = model_args['enc_in'] + self.period_len = model_args['period_len'] + + self.seg_num_x = self.seq_len // self.period_len + self.seg_num_y = self.pred_len // self.period_len + + self.conv1d = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=1 + 2 * (self.period_len // 2), + stride=1, padding=self.period_len // 2, padding_mode="zeros", bias=False) + + self.linear = nn.Linear(self.seg_num_x, self.seg_num_y, bias=False) + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + x = history_data[:, :, :, 0] + batch_size = x.shape[0] + # normalization and permute b,s,c -> b,c,s + seq_mean = torch.mean(x, dim=1).unsqueeze(1) + x = (x - seq_mean).permute(0, 2, 1) + + # 1D convolution aggregation + x = self.conv1d(x.reshape(-1, 1, self.seq_len)).reshape(-1, self.enc_in, self.seq_len) + x + + # downsampling: b,c,s -> bc,n,w -> bc,w,n + x = x.reshape(-1, self.seg_num_x, self.period_len).permute(0, 2, 1) + + # sparse forecasting + y = self.linear(x) # bc,w,m + + # upsampling: bc,w,m -> bc,m,w -> b,c,s + y = y.permute(0, 2, 1).reshape(batch_size, self.enc_in, self.pred_len) + + # permute and denorm + y = y.permute(0, 2, 1) + seq_mean + + 
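+        # [B, L, N] -> [B, L, N, 1]: restore the feature dimension the BasicTS runner expects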
return y.unsqueeze(-1) diff --git a/baselines/TiDE/Electricity.py b/baselines/TiDE/Electricity.py new file mode 100644 index 0000000..aff06f7 --- /dev/null +++ b/baselines/TiDE/Electricity.py @@ -0,0 +1,153 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import TiDE + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = TiDE +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "d_model": 256,# 256 + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "bias": True, + "feature_encode_dim": 2, + "e_layers": 2, # num of encoder layers + "d_layers": 2, # num of decoder layers + "d_ff": 256, # 256 + "dropout": 0.3, + "freq": 'h', + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + 
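+# feature channels: 0 = raw series, 1-4 = the four time features listed in MODEL_PARAM (time of day, day of week, day of month, day of year)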
+############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 24 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 24 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 24 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/TiDE/arch/__init__.py b/baselines/TiDE/arch/__init__.py new file mode 100644 index 0000000..2f24eb7 --- /dev/null +++ b/baselines/TiDE/arch/__init__.py @@ -0,0 +1 @@ +from .tide_arch import TiDE \ No newline at end of file diff --git a/baselines/TiDE/arch/tide_arch.py b/baselines/TiDE/arch/tide_arch.py new file mode 100644 index 0000000..3a8f386 --- /dev/null +++ b/baselines/TiDE/arch/tide_arch.py @@ -0,0 +1,118 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from basicts.utils import data_transformation_4_xformer + +class LayerNorm(nn.Module): + """ LayerNorm but with an optional bias. 
PyTorch doesn't support simply bias=False """ + + def __init__(self, ndim, bias): + super().__init__() + self.weight = nn.Parameter(torch.ones(ndim)) + self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None + + def forward(self, input): + return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5) + + +class ResBlock(nn.Module): + def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.1, bias=True): + super().__init__() + + self.fc1 = nn.Linear(input_dim, hidden_dim, bias=bias) + self.fc2 = nn.Linear(hidden_dim, output_dim, bias=bias) + self.fc3 = nn.Linear(input_dim, output_dim, bias=bias) + self.dropout = nn.Dropout(dropout) + self.relu = nn.ReLU() + self.ln = LayerNorm(output_dim, bias=bias) + + def forward(self, x): + out = self.fc1(x) + out = self.relu(out) + out = self.fc2(out) + out = self.dropout(out) + out = out + self.fc3(x) + out = self.ln(out) + return out + + +# TiDE +class TiDE(nn.Module): + """ + paper: https://arxiv.org/pdf/2304.08424.pdf + """ + + def __init__(self, **model_args): + super(TiDE, self).__init__() + + self.seq_len = model_args['seq_len'] # L + self.label_len = int(model_args['label_len']) + self.pred_len = model_args['pred_len'] # H + self.hidden_dim = model_args['d_model'] + self.res_hidden = model_args['d_model'] + self.encoder_num = model_args['e_layers'] + self.decoder_num = model_args['d_layers'] + self.freq = model_args["freq"] + self.bias = model_args["bias"] + self.feature_encode_dim = model_args["feature_encode_dim"] + self.decode_dim = model_args['c_out'] + self.temporalDecoderHidden = model_args['d_ff'] + dropout = model_args["dropout"] + + freq_map = {'h': 4, 't': 5, 's': 6, + 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3} + + self.feature_dim = freq_map[self.freq] + + flatten_dim = self.seq_len + (self.seq_len + self.pred_len) * self.feature_encode_dim + + self.feature_encoder = ResBlock(self.feature_dim, self.res_hidden, self.feature_encode_dim, dropout, self.bias) + self.encoders = nn.Sequential(ResBlock(flatten_dim, self.res_hidden, self.hidden_dim, dropout, self.bias), *( + [ResBlock(self.hidden_dim, self.res_hidden, self.hidden_dim, dropout, self.bias)] * (self.encoder_num - 1))) + + self.decoders = nn.Sequential(*( + [ResBlock(self.hidden_dim, self.res_hidden, self.hidden_dim, dropout, self.bias)] * ( + self.decoder_num - 1)), + ResBlock(self.hidden_dim, self.res_hidden, self.decode_dim * self.pred_len, + dropout, self.bias)) + self.temporalDecoder = ResBlock(self.decode_dim + self.feature_encode_dim, self.temporalDecoderHidden, 1, + dropout, self.bias) + self.residual_proj = nn.Linear(self.seq_len, self.pred_len, bias=self.bias) + + + def forward_xformer(self, x_enc, x_mark_enc, x_dec, batch_y_mark) -> torch.Tensor: + # Normalization + means = x_enc.mean(1, keepdim=True).detach() + x_enc = x_enc - means + stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5) + x_enc /= stdev + + feature = self.feature_encoder(batch_y_mark) + hidden = self.encoders(torch.cat([x_enc, feature.reshape(feature.shape[0], -1)], dim=-1)) + decoded = self.decoders(hidden).reshape(hidden.shape[0], self.pred_len, self.decode_dim) + dec_out = self.temporalDecoder(torch.cat([feature[:, self.seq_len:], decoded], dim=-1)).squeeze( + -1) + self.residual_proj(x_enc) + + # De-Normalization + dec_out = dec_out * (stdev[:, 0].unsqueeze(1).repeat(1, self.pred_len)) + dec_out = dec_out + (means[:, 0].unsqueeze(1).repeat(1, self.pred_len)) + return dec_out + + + + def forward(self, history_data: torch.Tensor, future_data: 
torch.Tensor, batch_seen: int, epoch: int, train: bool,
+                **kwargs) -> torch.Tensor:
+        '''x_mark_enc is the exogenous dynamic feature described in the original paper'''
+        x_enc, x_mark_enc, x_dec, batch_y_mark = data_transformation_4_xformer(history_data=history_data,
+                                                                               future_data=future_data,
+                                                                               start_token_len=0)
+
+        batch_y_mark = torch.concat([x_mark_enc, batch_y_mark[:, -self.pred_len:, :]], dim=1)
+        # TiDE is applied channel-by-channel: one forward pass per series, then restack the channel dim
+        dec_out = torch.stack([self.forward_xformer(x_enc[:, :, feature], x_mark_enc, x_dec, batch_y_mark) for feature in
+                               range(x_enc.shape[-1])], dim=-1)
+        return dec_out.unsqueeze(-1)  # [B, L, N, 1]
+
+
diff --git a/baselines/TimeMixer/Electricity.py b/baselines/TimeMixer/Electricity.py
new file mode 100644
index 0000000..57b0b36
--- /dev/null
+++ b/baselines/TimeMixer/Electricity.py
@@ -0,0 +1,162 @@
+import os
+import sys
+from easydict import EasyDict
+sys.path.append(os.path.abspath(__file__ + '/../../..'))
+from basicts.metrics import masked_mae, masked_mse
+from basicts.data import TimeSeriesForecastingDataset
+from basicts.runners import SimpleTimeSeriesForecastingRunner
+from basicts.scaler import ZScoreScaler
+from basicts.utils import get_regular_settings
+
+from .arch import TimeMixer
+
+############################## Hot Parameters ##############################
+# Dataset & Metrics configuration
+DATA_NAME = 'Electricity'  # Dataset name
+regular_settings = get_regular_settings(DATA_NAME)
+INPUT_LEN = regular_settings['INPUT_LEN']  # Length of input sequence
+OUTPUT_LEN = regular_settings['OUTPUT_LEN']  # Length of output sequence
+TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO']  # Train/Validation/Test split ratios
+NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL']  # Whether to normalize each channel of the data
+RESCALE = regular_settings['RESCALE']  # Whether to rescale the data
+NULL_VAL = regular_settings['NULL_VAL']  # Null value in the data
+# Model architecture and parameters
+MODEL_ARCH = TimeMixer
+NUM_NODES = 321
+MODEL_PARAM = {
+    "enc_in": NUM_NODES,  # num nodes
+    "dec_in": NUM_NODES,
+    "c_out": NUM_NODES,
+    "seq_len": INPUT_LEN,
+    "label_len": INPUT_LEN/2,  # start token length used in decoder
+    "pred_len": OUTPUT_LEN,  # prediction sequence length
+    "factor": 1,  # attn factor
+    "down_sampling_window": 2,
+    "down_sampling_layers": 3,
+    "top_k": 5,
+    "down_sampling_method": 'avg',
+    "channel_independence": True,
+    "d_model": 16,
+    "moving_avg": 25,  # window size of moving average. This is a CRUCIAL hyper-parameter.
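+    # moving_avg is the kernel of series_decomp, used when decomp_method == 'moving_avg'; ignored for 'dft_decomp'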
+ "n_heads": 8, + "e_layers": 3, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 32, + "distil": True, + "sigma" : 0.2, + "dropout": 0.1, + "freq": 'h', + "use_norm" : 0, + "decomp_method" : 'moving_avg', # dft_decomp or moving_avg + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.01, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 16 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters 
+CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/TimeMixer/arch/Autoformer_EncDec.py b/baselines/TimeMixer/arch/Autoformer_EncDec.py new file mode 100644 index 0000000..6fce4bc --- /dev/null +++ b/baselines/TimeMixer/arch/Autoformer_EncDec.py @@ -0,0 +1,203 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class my_Layernorm(nn.Module): + """ + Special designed layernorm for the seasonal part + """ + + def __init__(self, channels): + super(my_Layernorm, self).__init__() + self.layernorm = nn.LayerNorm(channels) + + def forward(self, x): + x_hat = self.layernorm(x) + bias = torch.mean(x_hat, dim=1).unsqueeze(1).repeat(1, x.shape[1], 1) + return x_hat - bias + + +class moving_avg(nn.Module): + """ + Moving average block to highlight the trend of time series + """ + + def __init__(self, kernel_size, stride): + super(moving_avg, self).__init__() + self.kernel_size = kernel_size + self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0) + + def forward(self, x): + # padding on the both ends of time series + front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1) + end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1) + x = torch.cat([front, x, end], dim=1) + x = self.avg(x.permute(0, 2, 1)) + x = x.permute(0, 2, 1) + return x + + +class series_decomp(nn.Module): + """ + Series decomposition block + """ + + def __init__(self, kernel_size): + super(series_decomp, self).__init__() + self.moving_avg = moving_avg(kernel_size, stride=1) + + def forward(self, x): + moving_mean = self.moving_avg(x) + res = x - moving_mean + return res, moving_mean + + +class series_decomp_multi(nn.Module): + """ + Multiple Series decomposition block from FEDformer + """ + + def __init__(self, kernel_size): + super(series_decomp_multi, self).__init__() + self.kernel_size = kernel_size + self.series_decomp = [series_decomp(kernel) for kernel in kernel_size] + + def forward(self, x): + moving_mean = [] + res = [] + for func in self.series_decomp: + sea, moving_avg = func(x) + moving_mean.append(moving_avg) + res.append(sea) + + sea = sum(res) / len(res) + moving_mean = sum(moving_mean) / len(moving_mean) + return sea, moving_mean + + +class EncoderLayer(nn.Module): + """ + Autoformer encoder layer with the progressive decomposition architecture + """ + + def __init__(self, attention, d_model, d_ff=None, moving_avg=25, dropout=0.1, activation="relu"): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.attention = attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.decomp1 = series_decomp(moving_avg) + self.decomp2 = series_decomp(moving_avg) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, attn_mask=None): + new_x, attn = self.attention( + x, x, x, + attn_mask=attn_mask + ) + x = x + self.dropout(new_x) + x, _ = self.decomp1(x) + y = x + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + res, _ = self.decomp2(x + y) + return res, attn + + +class Encoder(nn.Module): + """ + Autoformer encoder + """ + + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + 
self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.norm = norm_layer + + def forward(self, x, attn_mask=None): + attns = [] + if self.conv_layers is not None: + for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers): + x, attn = attn_layer(x, attn_mask=attn_mask) + x = conv_layer(x) + attns.append(attn) + x, attn = self.attn_layers[-1](x) + attns.append(attn) + else: + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + + return x, attns + + +class DecoderLayer(nn.Module): + """ + Autoformer decoder layer with the progressive decomposition architecture + """ + + def __init__(self, self_attention, cross_attention, d_model, c_out, d_ff=None, + moving_avg=25, dropout=0.1, activation="relu"): + super(DecoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.decomp1 = series_decomp(moving_avg) + self.decomp2 = series_decomp(moving_avg) + self.decomp3 = series_decomp(moving_avg) + self.dropout = nn.Dropout(dropout) + self.projection = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=3, stride=1, padding=1, + padding_mode='circular', bias=False) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None): + x = x + self.dropout(self.self_attention( + x, x, x, + attn_mask=x_mask + )[0]) + x, trend1 = self.decomp1(x) + x = x + self.dropout(self.cross_attention( + x, cross, cross, + attn_mask=cross_mask + )[0]) + x, trend2 = self.decomp2(x) + y = x + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + x, trend3 = self.decomp3(x + y) + + residual_trend = trend1 + trend2 + trend3 + residual_trend = self.projection(residual_trend.permute(0, 2, 1)).transpose(1, 2) + return x, residual_trend + + +class Decoder(nn.Module): + """ + Autoformer encoder + """ + + def __init__(self, layers, norm_layer=None, projection=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection + + def forward(self, x, cross, x_mask=None, cross_mask=None, trend=None): + for layer in self.layers: + x, residual_trend = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask) + trend = trend + residual_trend + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) + return x, trend diff --git a/baselines/TimeMixer/arch/Embed.py b/baselines/TimeMixer/arch/Embed.py new file mode 100644 index 0000000..1202616 --- /dev/null +++ b/baselines/TimeMixer/arch/Embed.py @@ -0,0 +1,234 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import weight_norm +import math + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. 
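+        # pe[pos, 2i] = sin(pos / 10000^(2i/d_model)), pe[pos, 2i+1] = cos(pos / 10000^(2i/d_model))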
+ pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + minute_x = self.minute_embed(x[:, :, 4]) if hasattr( + self, 'minute_embed') else 0. 
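+        # x_mark channel order assumed here: [month, day, weekday, hour, minute]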
+        hour_x = self.hour_embed(x[:, :, 3])
+        weekday_x = self.weekday_embed(x[:, :, 2])
+        day_x = self.day_embed(x[:, :, 1])
+        month_x = self.month_embed(x[:, :, 0])
+
+        return hour_x + weekday_x + day_x + month_x + minute_x
+
+
+class TimeFeatureEmbedding(nn.Module):
+    def __init__(self, d_model, embed_type='timeF', freq='h'):
+        super(TimeFeatureEmbedding, self).__init__()
+
+        freq_map = {'h': 4, 't': 5, 's': 6,
+                    'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3}
+        d_inp = freq_map[freq]
+        self.embed = nn.Linear(d_inp, d_model, bias=False)
+
+    def forward(self, x):
+        return self.embed(x)
+
+
+class DataEmbedding(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding, self).__init__()
+        self.c_in = c_in
+        self.d_model = d_model
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        _, _, N = x.size()
+        if N == self.c_in:
+            if x_mark is None:
+                x = self.value_embedding(x) + self.position_embedding(x)
+            else:
+                x = self.value_embedding(
+                    x) + self.temporal_embedding(x_mark) + self.position_embedding(x)
+        elif N == self.d_model:
+            if x_mark is None:
+                x = x + self.position_embedding(x)
+            else:
+                x = x + self.temporal_embedding(x_mark) + self.position_embedding(x)
+
+        return self.dropout(x)
+
+
+class DataEmbedding_ms(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding_ms, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=1, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        B, T, N = x.shape
+        # embed each series independently: [B, T, N] -> [B*N, T, 1] -> [B, T, N, d_model]
+        x1 = self.value_embedding(x.permute(0, 2, 1).reshape(B * N, T).unsqueeze(-1)).reshape(B, N, T, -1).permute(0, 2,
+                                                                                                                   1, 3)
+        if x_mark is None:
+            x = x1
+        else:
+            x = x1 + self.temporal_embedding(x_mark)
+        return self.dropout(x)
+
+
+class DataEmbedding_wo_pos(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding_wo_pos, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        if x is None and x_mark is not None:
+            return self.temporal_embedding(x_mark)
+        if x_mark is None:
+            x = self.value_embedding(x)
+        else:
+            x = self.value_embedding(x) + self.temporal_embedding(x_mark)
+        return self.dropout(x)
+
+
+class PatchEmbedding_crossformer(nn.Module):
+    def __init__(self, d_model, patch_len, stride, padding, dropout):
+        super(PatchEmbedding_crossformer, self).__init__()
+        # Patching
+        self.patch_len = patch_len
+        self.stride = stride
+        self.padding_patch_layer = nn.ReplicationPad1d((0, padding))
+
+        # 
Backbone, Input encoding: projection of feature vectors onto a d-dim vector space + self.value_embedding = nn.Linear(patch_len, d_model, bias=False) + + # Positional embedding + self.position_embedding = PositionalEmbedding(d_model) + + # Residual dropout + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x), n_vars + + +class PatchEmbedding(nn.Module): + def __init__(self, d_model, patch_len, stride, dropout): + super(PatchEmbedding, self).__init__() + # Patching + self.patch_len = patch_len + self.stride = stride + self.padding_patch_layer = nn.ReplicationPad1d((0, stride)) + + # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space + self.value_embedding = TokenEmbedding(patch_len, d_model) + + # Positional embedding + self.position_embedding = PositionalEmbedding(d_model) + + # Residual dropout + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # do patching + n_vars = x.shape[1] + x = self.padding_patch_layer(x) + x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride) + x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3])) + # Input encoding + x = self.value_embedding(x) + self.position_embedding(x) + return self.dropout(x), n_vars diff --git a/baselines/TimeMixer/arch/StandardNorm.py b/baselines/TimeMixer/arch/StandardNorm.py new file mode 100644 index 0000000..990d0fd --- /dev/null +++ b/baselines/TimeMixer/arch/StandardNorm.py @@ -0,0 +1,68 @@ +import torch +import torch.nn as nn + + +class Normalize(nn.Module): + def __init__(self, num_features: int, eps=1e-5, affine=False, subtract_last=False, non_norm=False): + """ + :param num_features: the number of features or channels + :param eps: a value added for numerical stability + :param affine: if True, RevIN has learnable affine parameters + """ + super(Normalize, self).__init__() + self.num_features = num_features + self.eps = eps + self.affine = affine + self.subtract_last = subtract_last + self.non_norm = non_norm + if self.affine: + self._init_params() + + def forward(self, x, mode: str): + if mode == 'norm': + self._get_statistics(x) + x = self._normalize(x) + elif mode == 'denorm': + x = self._denormalize(x) + else: + raise NotImplementedError + return x + + def _init_params(self): + # initialize RevIN params: (C,) + self.affine_weight = nn.Parameter(torch.ones(self.num_features)) + self.affine_bias = nn.Parameter(torch.zeros(self.num_features)) + + def _get_statistics(self, x): + dim2reduce = tuple(range(1, x.ndim - 1)) + if self.subtract_last: + self.last = x[:, -1, :].unsqueeze(1) + else: + self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach() + self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach() + + def _normalize(self, x): + if self.non_norm: + return x + if self.subtract_last: + x = x - self.last + else: + x = x - self.mean + x = x / self.stdev + if self.affine: + x = x * self.affine_weight + x = x + self.affine_bias + return x + + def _denormalize(self, x): + if self.non_norm: + return x + if self.affine: + x = x - self.affine_bias + x = x / (self.affine_weight + self.eps * self.eps) + x = x * self.stdev + if self.subtract_last: + x = x + self.last + else: + x = x + 
self.mean
+        return x
diff --git a/baselines/TimeMixer/arch/__init__.py b/baselines/TimeMixer/arch/__init__.py
new file mode 100644
index 0000000..1495113
--- /dev/null
+++ b/baselines/TimeMixer/arch/__init__.py
@@ -0,0 +1 @@
+from .timemixer_arch import TimeMixer
\ No newline at end of file
diff --git a/baselines/TimeMixer/arch/timemixer_arch.py b/baselines/TimeMixer/arch/timemixer_arch.py
new file mode 100644
index 0000000..11505bc
--- /dev/null
+++ b/baselines/TimeMixer/arch/timemixer_arch.py
@@ -0,0 +1,419 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .Autoformer_EncDec import series_decomp
+from .Embed import DataEmbedding_wo_pos
+from .StandardNorm import Normalize
+from basicts.utils import data_transformation_4_xformer
+
+class DFT_series_decomp(nn.Module):
+    """
+    Series decomposition block
+    """
+
+    def __init__(self, top_k=5):
+        super(DFT_series_decomp, self).__init__()
+        self.top_k = top_k
+
+    def forward(self, x):
+        xf = torch.fft.rfft(x)
+        freq = abs(xf)
+        freq[0] = 0
+        # keep only the top_k strongest frequencies as the seasonal part
+        top_k_freq, top_list = torch.topk(freq, self.top_k)
+        xf[freq <= top_k_freq.min()] = 0
+        x_season = torch.fft.irfft(xf)
+        x_trend = x - x_season
+        return x_season, x_trend
+
+
+class MultiScaleSeasonMixing(nn.Module):
+    """
+    Bottom-up mixing season pattern
+    """
+
+    def __init__(self, **model_args):
+        super(MultiScaleSeasonMixing, self).__init__()
+        self.seq_len = model_args['seq_len']
+        self.down_sampling_window = model_args['down_sampling_window']
+        self.down_sampling_layers = model_args['down_sampling_layers']
+        self.down_sampling_layers = torch.nn.ModuleList(
+            [
+                nn.Sequential(
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** i),
+                        self.seq_len // (self.down_sampling_window ** (i + 1)),
+                    ),
+                    nn.GELU(),
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** (i + 1)),
+                        self.seq_len // (self.down_sampling_window ** (i + 1)),
+                    ),
+
+                )
+                for i in range(self.down_sampling_layers)
+            ]
+        )
+
+    def forward(self, season_list):
+
+        # mixing high->low
+        out_high = season_list[0]
+        out_low = season_list[1]
+        out_season_list = [out_high.permute(0, 2, 1)]
+
+        for i in range(len(season_list) - 1):
+            out_low_res = self.down_sampling_layers[i](out_high)
+            out_low = out_low + out_low_res
+            out_high = out_low
+            if i + 2 <= len(season_list) - 1:
+                out_low = season_list[i + 2]
+            out_season_list.append(out_high.permute(0, 2, 1))
+
+        return out_season_list
+
+
+class MultiScaleTrendMixing(nn.Module):
+    """
+    Top-down mixing trend pattern
+    """
+
+    def __init__(self, **model_args):
+        super(MultiScaleTrendMixing, self).__init__()
+        self.seq_len = model_args['seq_len']
+        self.down_sampling_window = model_args['down_sampling_window']
+        self.down_sampling_layers = model_args['down_sampling_layers']
+
+        self.up_sampling_layers = torch.nn.ModuleList(
+            [
+                nn.Sequential(
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** (i + 1)),
+                        self.seq_len // (self.down_sampling_window ** i),
+                    ),
+                    nn.GELU(),
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** i),
+                        self.seq_len // (self.down_sampling_window ** i),
+                    ),
+                )
+                for i in reversed(range(self.down_sampling_layers))
+            ])
+
+    def forward(self, trend_list):
+
+        # mixing low->high
+        trend_list_reverse = trend_list.copy()
+        trend_list_reverse.reverse()
+        out_low = trend_list_reverse[0]
+        out_high = trend_list_reverse[1]
+        out_trend_list = [out_low.permute(0, 2, 1)]
+
+        for i in range(len(trend_list_reverse) - 1):
+            out_high_res = self.up_sampling_layers[i](out_low)
+            out_high = out_high + out_high_res
+class MultiScaleTrendMixing(nn.Module):
+    """
+    Top-down mixing trend pattern
+    """
+
+    def __init__(self, **model_args):
+        super(MultiScaleTrendMixing, self).__init__()
+        self.seq_len = model_args['seq_len']
+        self.down_sampling_window = model_args['down_sampling_window']
+        self.down_sampling_layers = model_args['down_sampling_layers']
+
+        self.up_sampling_layers = torch.nn.ModuleList(
+            [
+                nn.Sequential(
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** (i + 1)),
+                        self.seq_len // (self.down_sampling_window ** i),
+                    ),
+                    nn.GELU(),
+                    torch.nn.Linear(
+                        self.seq_len // (self.down_sampling_window ** i),
+                        self.seq_len // (self.down_sampling_window ** i),
+                    ),
+                )
+                for i in reversed(range(self.down_sampling_layers))
+            ])
+
+    def forward(self, trend_list):
+
+        # mixing low->high
+        trend_list_reverse = trend_list.copy()
+        trend_list_reverse.reverse()
+        out_low = trend_list_reverse[0]
+        out_high = trend_list_reverse[1]
+        out_trend_list = [out_low.permute(0, 2, 1)]
+
+        for i in range(len(trend_list_reverse) - 1):
+            out_high_res = self.up_sampling_layers[i](out_low)
+            out_high = out_high + out_high_res
+            out_low = out_high
+            if i + 2 <= len(trend_list_reverse) - 1:
+                out_high = trend_list_reverse[i + 2]
+            out_trend_list.append(out_low.permute(0, 2, 1))
+
+        out_trend_list.reverse()
+        return out_trend_list
+
+
+class PastDecomposableMixing(nn.Module):
+    def __init__(self, **model_args):
+        super(PastDecomposableMixing, self).__init__()
+        self.seq_len = model_args['seq_len']
+        self.pred_len = model_args['pred_len']
+        self.moving_avg = model_args['moving_avg']
+        self.down_sampling_window = model_args['down_sampling_window']
+        self.down_sampling_layers = model_args['down_sampling_layers']
+        self.channel_independence = model_args['channel_independence']
+        self.d_model = model_args['d_model']
+        self.d_ff = model_args['d_ff']
+        self.dp = model_args['dropout']
+
+        self.layer_norm = nn.LayerNorm(self.d_model)
+        self.dropout = nn.Dropout(self.dp)
+        self.decomp_method = model_args['decomp_method']
+        self.top_k = model_args['top_k']
+        if self.decomp_method == 'moving_avg':
+            self.decomposition = series_decomp(self.moving_avg)
+        elif self.decomp_method == "dft_decomp":
+            self.decomposition = DFT_series_decomp(self.top_k)
+        else:
+            raise ValueError(f'unknown decomposition method: {self.decomp_method}')
+
+        if not self.channel_independence:
+            self.cross_layer = nn.Sequential(
+                nn.Linear(in_features=self.d_model, out_features=self.d_ff),
+                nn.GELU(),
+                nn.Linear(in_features=self.d_ff, out_features=self.d_model),
+            )
+
+        # Mixing season
+        self.mixing_multi_scale_season = MultiScaleSeasonMixing(**model_args)
+
+        # Mixing trend
+        self.mixing_multi_scale_trend = MultiScaleTrendMixing(**model_args)
+
+        self.out_cross_layer = nn.Sequential(
+            nn.Linear(in_features=self.d_model, out_features=self.d_ff),
+            nn.GELU(),
+            nn.Linear(in_features=self.d_ff, out_features=self.d_model),
+        )
+
+    def forward(self, x_list):
+        length_list = []
+        for x in x_list:
+            _, T, _ = x.size()
+            length_list.append(T)
+
+        # Decompose to obtain the season and trend
+        season_list = []
+        trend_list = []
+        for x in x_list:
+            season, trend = self.decomposition(x)
+            if not self.channel_independence:
+                season = self.cross_layer(season)
+                trend = self.cross_layer(trend)
+            season_list.append(season.permute(0, 2, 1))
+            trend_list.append(trend.permute(0, 2, 1))
+
+        # bottom-up season mixing
+        out_season_list = self.mixing_multi_scale_season(season_list)
+        # top-down trend mixing
+        out_trend_list = self.mixing_multi_scale_trend(trend_list)
+
+        out_list = []
+        for ori, out_season, out_trend, length in zip(x_list, out_season_list, out_trend_list,
+                                                      length_list):
+            out = out_season + out_trend
+            if self.channel_independence:
+                out = ori + self.out_cross_layer(out)
+            out_list.append(out[:, :length, :])
+        return out_list
+
+
+class TimeMixer(nn.Module):
+
+    def __init__(self, **model_args):
+        super(TimeMixer, self).__init__()
+
+        self.seq_len = model_args['seq_len']
+        self.pred_len = model_args['pred_len']
+        self.e_layers = model_args['e_layers']
+        self.moving_avg = model_args['moving_avg']
+        self.down_sampling_window = model_args['down_sampling_window']
+        self.down_sampling_layers = model_args['down_sampling_layers']
+        self.channel_independence = model_args['channel_independence']
+        self.d_model = model_args['d_model']
+        self.dec_in = model_args['dec_in']
+        self.enc_in = model_args['enc_in']
+        self.c_out = model_args['c_out']
+        self.freq = model_args['freq']
+        self.dropout = model_args['dropout']
+        self.pdm_blocks = nn.ModuleList([PastDecomposableMixing(**model_args)
+                                         for _ in range(self.e_layers)])
+        self.down_sampling_method = model_args["down_sampling_method"]
+
self.preprocess = series_decomp(self.moving_avg) + self.embed = model_args['embed'] + self.enc_in = model_args['enc_in'] + self.use_norm = model_args['use_norm'] + if self.channel_independence: + self.enc_embedding = DataEmbedding_wo_pos(1, self.d_model, self.embed, self.freq, + self.dropout) + else: + self.enc_embedding = DataEmbedding_wo_pos(self.enc_in, self.d_model, self.embed, self.freq, + self.dropout) + + self.layer = self.e_layers + + self.normalize_layers = torch.nn.ModuleList( + [ + Normalize(self.enc_in, affine=True, non_norm=True if self.use_norm == 0 else False) + for i in range(self.down_sampling_layers + 1) + ] + ) + + + self.predict_layers = torch.nn.ModuleList( + [ + torch.nn.Linear( + self.seq_len // (self.down_sampling_window ** i), + self.pred_len, + ) + for i in range(self.down_sampling_layers + 1) + ] + ) + + if self.channel_independence: + self.projection_layer = nn.Linear( + self.d_model, 1, bias=True) + else: + self.projection_layer = nn.Linear( + self.d_model, self.c_out, bias=True) + + self.out_res_layers = torch.nn.ModuleList( + [ + torch.nn.Linear( + self.seq_len // (self.down_sampling_window ** i), + self.seq_len // (self.down_sampling_window ** i), + ) + for i in range(self.down_sampling_layers + 1) + ] + ) + + self.regression_layers = torch.nn.ModuleList( + [ + torch.nn.Linear( + self.seq_len // (self.down_sampling_window ** i), + self.pred_len, + ) + for i in range(self.down_sampling_layers + 1) + ] + ) + + + + def out_projection(self, dec_out, i, out_res): + dec_out = self.projection_layer(dec_out) + out_res = out_res.permute(0, 2, 1) + out_res = self.out_res_layers[i](out_res) + out_res = self.regression_layers[i](out_res).permute(0, 2, 1) + dec_out = dec_out + out_res + return dec_out + + def pre_enc(self, x_list): + if self.channel_independence: + return (x_list, None) + else: + out1_list = [] + out2_list = [] + for x in x_list: + x_1, x_2 = self.preprocess(x) + out1_list.append(x_1) + out2_list.append(x_2) + return (out1_list, out2_list) + + def __multi_scale_process_inputs(self, x_enc, x_mark_enc): + if self.down_sampling_method == 'max': + down_pool = torch.nn.MaxPool1d(self.down_sampling_window, return_indices=False) + elif self.down_sampling_method == 'avg': + down_pool = torch.nn.AvgPool1d(self.down_sampling_window) + elif self.down_sampling_method == 'conv': + padding = 1 if torch.__version__ >= '1.5.0' else 2 + down_pool = nn.Conv1d(in_channels=self.enc_in, out_channels=self.enc_in, + kernel_size=3, padding=padding, + stride=self.down_sampling_window, + padding_mode='circular', + bias=False) + else: + return x_enc, x_mark_enc + # B,T,C -> B,C,T + x_enc = x_enc.permute(0, 2, 1) + + x_enc_ori = x_enc + x_mark_enc_mark_ori = x_mark_enc + + x_enc_sampling_list = [] + x_mark_sampling_list = [] + x_enc_sampling_list.append(x_enc.permute(0, 2, 1)) + x_mark_sampling_list.append(x_mark_enc) + + for i in range(self.down_sampling_layers): + x_enc_sampling = down_pool(x_enc_ori) + + x_enc_sampling_list.append(x_enc_sampling.permute(0, 2, 1)) + x_enc_ori = x_enc_sampling + + if x_mark_enc is not None: + x_mark_sampling_list.append(x_mark_enc_mark_ori[:, ::self.down_sampling_window, :]) + x_mark_enc_mark_ori = x_mark_enc_mark_ori[:, ::self.down_sampling_window, :] + + x_enc = x_enc_sampling_list + x_mark_enc = x_mark_sampling_list if x_mark_enc is not None else None + + return x_enc, x_mark_enc + + def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec): + + x_enc, x_mark_enc = self.__multi_scale_process_inputs(x_enc, x_mark_enc) + + x_list = [] + 
x_mark_list = [] + if x_mark_enc is not None: + for i, x, x_mark in zip(range(len(x_enc)), x_enc, x_mark_enc): + B, T, N = x.size() + x = self.normalize_layers[i](x, 'norm') + if self.channel_independence: + x = x.permute(0, 2, 1).contiguous().reshape(B * N, T, 1) + x_list.append(x) + x_mark = x_mark.repeat(N, 1, 1) + x_mark_list.append(x_mark) + else: + x_list.append(x) + x_mark_list.append(x_mark) + else: + for i, x in zip(range(len(x_enc)), x_enc, ): + B, T, N = x.size() + x = self.normalize_layers[i](x, 'norm') + if self.channel_independence: + x = x.permute(0, 2, 1).contiguous().reshape(B * N, T, 1) + x_list.append(x) + + # embedding + enc_out_list = [] + x_list = self.pre_enc(x_list) + if x_mark_enc is not None: + for i, x, x_mark in zip(range(len(x_list[0])), x_list[0], x_mark_list): + enc_out = self.enc_embedding(x, x_mark) # [B,T,C] + enc_out_list.append(enc_out) + else: + for i, x in zip(range(len(x_list[0])), x_list[0]): + enc_out = self.enc_embedding(x, None) # [B,T,C] + enc_out_list.append(enc_out) + + # Past Decomposable Mixing as encoder for past + for i in range(self.layer): + enc_out_list = self.pdm_blocks[i](enc_out_list) + + # Future Multipredictor Mixing as decoder for future + dec_out_list = self.future_multi_mixing(B, enc_out_list, x_list) + + dec_out = torch.stack(dec_out_list, dim=-1).sum(-1) + dec_out = self.normalize_layers[0](dec_out, 'denorm') + return dec_out + + def future_multi_mixing(self, B, enc_out_list, x_list): + dec_out_list = [] + if self.channel_independence: + x_list = x_list[0] + for i, enc_out in zip(range(len(x_list)), enc_out_list): + dec_out = self.predict_layers[i](enc_out.permute(0, 2, 1)).permute( + 0, 2, 1) # align temporal dimension + dec_out = self.projection_layer(dec_out) + dec_out = dec_out.reshape(B, self.c_out, self.pred_len).permute(0, 2, 1).contiguous() + dec_out_list.append(dec_out) + + else: + for i, enc_out, out_res in zip(range(len(x_list[0])), enc_out_list, x_list[1]): + dec_out = self.predict_layers[i](enc_out.permute(0, 2, 1)).permute( + 0, 2, 1) # align temporal dimension + dec_out = self.out_projection(dec_out, i, out_res) + dec_out_list.append(dec_out) + + return dec_out_list + + + + def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool, + **kwargs) -> torch.Tensor: + + x_enc, x_mark_enc, x_dec, x_mark_dec = data_transformation_4_xformer(history_data=history_data, + future_data=future_data, + start_token_len=0) + + dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec) + return dec_out.unsqueeze(-1) + diff --git a/baselines/UMixer/Electricity.py b/baselines/UMixer/Electricity.py new file mode 100644 index 0000000..fb7ce10 --- /dev/null +++ b/baselines/UMixer/Electricity.py @@ -0,0 +1,159 @@ +import os +import sys +from easydict import EasyDict +sys.path.append(os.path.abspath(__file__ + '/../../..')) +from basicts.metrics import masked_mae, masked_mse +from basicts.data import TimeSeriesForecastingDataset +from basicts.runners import SimpleTimeSeriesForecastingRunner +from basicts.scaler import ZScoreScaler +from basicts.utils import get_regular_settings + +from .arch import UMixer + +############################## Hot Parameters ############################## +# Dataset & Metrics configuration +DATA_NAME = 'Electricity' # Dataset name +regular_settings = get_regular_settings(DATA_NAME) +INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence +OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence +TRAIN_VAL_TEST_RATIO = 
regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios +NORM_EACH_CHANNEL = regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = UMixer +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 3, # attn factor + "p_hidden_dims": [128, 128], + "p_hidden_layers": 2, + "d_model": 32, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "n_heads": 8, + "e_layers": 2, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 32, + "stride": 8, + "patch_len": 16, + "distil": True, + "sigma" : 0.2, + "dropout": 0.1, + "freq": 'h', + "use_norm" : False, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0001, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = 
"MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/UMixer/arch/Embed.py b/baselines/UMixer/arch/Embed.py new file mode 100644 index 0000000..1a1c95b --- /dev/null +++ b/baselines/UMixer/arch/Embed.py @@ -0,0 +1,230 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import weight_norm +import math + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. + pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = 
x.long()
+        minute_x = self.minute_embed(x[:, :, 4]) if hasattr(
+            self, 'minute_embed') else 0.
+        hour_x = self.hour_embed(x[:, :, 3])
+        weekday_x = self.weekday_embed(x[:, :, 2])
+        day_x = self.day_embed(x[:, :, 1])
+        month_x = self.month_embed(x[:, :, 0])
+
+        return hour_x + weekday_x + day_x + month_x + minute_x
+
+
+class TimeFeatureEmbedding(nn.Module):
+    def __init__(self, d_model, embed_type='timeF', freq='h'):
+        super(TimeFeatureEmbedding, self).__init__()
+
+        freq_map = {'h': 4, 't': 5, 's': 6,
+                    'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3}
+        d_inp = freq_map[freq]
+        self.embed = nn.Linear(d_inp, d_model, bias=False)
+
+    def forward(self, x):
+        return self.embed(x)
+
+
+class DataEmbedding(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        if x_mark is None:
+            x = self.value_embedding(x) + self.position_embedding(x)
+        else:
+            x = self.value_embedding(
+                x) + self.temporal_embedding(x_mark) + self.position_embedding(x)
+        return self.dropout(x)
+
+
+class TokenEmbed(nn.Module):
+    def __init__(self, d_model, embed_type='fixed', freq='h'):
+        super(TokenEmbed, self).__init__()
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+
+    def forward(self, x_mark):
+        return self.temporal_embedding(x_mark)
+
+
+class DataEmbedding_wo_pos(nn.Module):
+    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
+        super(DataEmbedding_wo_pos, self).__init__()
+
+        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
+        self.position_embedding = PositionalEmbedding(d_model=d_model)
+        self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type,
+                                                    freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding(
+            d_model=d_model, embed_type=embed_type, freq=freq)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x, x_mark):
+        if x_mark is None:
+            x = self.value_embedding(x)
+        else:
+            x = self.value_embedding(x) + self.temporal_embedding(x_mark)
+        return self.dropout(x)
+
+
+class PatchEmbedding(nn.Module):
+    def __init__(self, d_model, patch_len, stride, dropout):
+        super(PatchEmbedding, self).__init__()
+        # Patching
+        self.patch_len = patch_len
+        self.stride = stride
+        self.padding_patch_layer = nn.ReplicationPad1d((0, stride))
+
+        # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space
+        self.value_embedding = TokenEmbedding(patch_len, d_model)
+
+        # Positional embedding
+        self.position_embedding = PositionalEmbedding(d_model)
+
+        # Residual dropout
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        # do patching
+        n_vars = x.shape[1]
+        x = self.padding_patch_layer(x)
+        x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride)  # slide a window of size patch_len with step stride over the last dimension
+        x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3]))
+        # Input encoding
+        x = self.value_embedding(x) + self.position_embedding(x)
+        return self.dropout(x), n_vars
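For reference, the patch count these modules produce: ReplicationPad1d((0, stride)) extends the last dimension by stride, and unfold then yields (L + stride - patch_len) // stride + 1 windows. A minimal sketch with illustrative sizes (not taken from the configs):

import torch
import torch.nn as nn

B, n_vars, L, patch_len, stride = 2, 7, 96, 16, 8
x = torch.randn(B, n_vars, L)
x = nn.ReplicationPad1d((0, stride))(x)                    # [2, 7, 104]
x = x.unfold(dimension=-1, size=patch_len, step=stride)    # [2, 7, 12, 16]
print(x.shape)                                             # (104 - 16) // 8 + 1 = 12 patches per variable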
+class PosiEmbed(nn.Module):
+    def __init__(self, d_model, patch_len, stride):
+        super(PosiEmbed, self).__init__()
+        self.patch_len = patch_len
+        self.stride = stride
+        self.padding_patch_layer = nn.ReplicationPad1d((0, stride))
+        self.position_embedding = PositionalEmbedding(d_model)
+
+    def forward(self, x):
+        # do patching
+        x = self.padding_patch_layer(x)
+        x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride)  # slide a window of size patch_len with step stride over the last dimension
+        x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3]))
+        # Input encoding
+        x = self.position_embedding(x)
+        return x
+
+
+class PatchEmbedding_wopos(nn.Module):
+    def __init__(self, d_model, patch_len, stride, dropout):
+        super(PatchEmbedding_wopos, self).__init__()
+        # Patching
+        self.patch_len = patch_len
+        self.stride = stride
+        self.padding_patch_layer = nn.ReplicationPad1d((0, stride))
+
+        # Backbone, Input encoding: projection of feature vectors onto a d-dim vector space
+        self.value_embedding = TokenEmbedding(patch_len, d_model)
+
+        # Positional embedding
+        # self.position_embedding = PositionalEmbedding(d_model)
+
+        # Residual dropout
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        # do patching
+        n_vars = x.shape[1]
+        x = self.padding_patch_layer(x)
+        x = x.unfold(dimension=-1, size=self.patch_len, step=self.stride)  # slide a window of size patch_len with step stride over the last dimension
+        x = torch.reshape(x, (x.shape[0] * x.shape[1], x.shape[2], x.shape[3]))
+        # Input encoding
+        x = self.value_embedding(x)
+        return self.dropout(x), n_vars
\ No newline at end of file
diff --git a/baselines/UMixer/arch/RevIN.py b/baselines/UMixer/arch/RevIN.py
new file mode 100644
index 0000000..f780f3c
--- /dev/null
+++ b/baselines/UMixer/arch/RevIN.py
@@ -0,0 +1,103 @@
+import torch
+import torch.nn as nn
+
+
+class RevIN(nn.Module):
+    def __init__(self, num_features: int, eps=1e-5, affine=True):
+        """
+        :param num_features: the number of features or channels
+        :param eps: a value added for numerical stability
+        :param affine: if True, RevIN has learnable affine parameters
+        """
+        super(RevIN, self).__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.affine = affine
+        if self.affine:
+            self._init_params()
+
+    def forward(self, x, mode: str):
+        if mode == 'norm':
+            self._get_statistics(x)
+            x = self._normalize(x)
+        elif mode == 'denorm':
+            x = self._denormalize(x)
+        else: raise NotImplementedError
+        return x
+
+    def _init_params(self):
+        # initialize RevIN params: (C,)
+        self.affine_weight = nn.Parameter(torch.ones(self.num_features))
+        self.affine_bias = nn.Parameter(torch.zeros(self.num_features))
+
+    def _get_statistics(self, x):
+        dim2reduce = tuple(range(1, x.ndim-1))
+        self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach()
+        self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach()
+
+    def _normalize(self, x):
+        x = x - self.mean
+        x = x / self.stdev
+        if self.affine:
+            x = x * self.affine_weight
+            x = x + self.affine_bias
+        return x
+
+    def _denormalize(self, x):
+        if self.affine:
+            x = x - self.affine_bias
+            x = x / (self.affine_weight + self.eps*self.eps)
+        x = x * self.stdev
+        x = x + self.mean
+        return x
+
+#
+# class RevIN_loc(nn.Module):
+#     def __init__(self, num_features: int, eps=1e-5, affine=True):
+#         """
+#         :param num_features: the number of features or channels
+#         :param eps: a value added for numerical stability
+#         :param affine: if True, RevIN has learnable affine parameters
+#         """
+#         super(RevIN_loc, self).__init__()
+#         self.num_features_l =
num_features +# self.eps_l = eps +# self.affine_l = affine +# if self.affine_l: +# self._init_params_l() +# +# def forward(self, x, mode:str): +# if mode == 'norm': +# self._get_statistics_l(x) +# x = self._normalize_l(x) +# elif mode == 'denorm': +# x = self._denormalize_l(x) +# else: raise NotImplementedError +# return x +# +# def _init_params_l(self): +# # initialize RevIN params: (C,) +# self.affine_weight_l = nn.Parameter(torch.ones(self.num_features_l)) +# self.affine_bias_l = nn.Parameter(torch.zeros(self.num_features_l)) +# +# def _get_statistics_l(self, x): +# dim2reduce_l = tuple(range(1, x.ndim-1)) +# self.mean_l = torch.mean(x, dim=dim2reduce_l, keepdim=True).detach() +# self.stdev_l = torch.sqrt(torch.var(x, dim=dim2reduce_l, keepdim=True, unbiased=False) + self.eps_l).detach() +# +# def _normalize_l(self, x): +# x = x - self.mean_l +# x = x / self.stdev_l +# if self.affine_l: +# x = x * self.affine_weight_l +# x = x + self.affine_bias_l +# return x +# +# def _denormalize_l(self, x): +# if self.affine_l: +# x = x - self.affine_bias_l +# x = x / (self.affine_weight_l + self.eps_l*self.eps_l) +# x = x * self.stdev_l +# x = x + self.mean_l +# return x +# diff --git a/baselines/UMixer/arch/__init__.py b/baselines/UMixer/arch/__init__.py new file mode 100644 index 0000000..9bc84af --- /dev/null +++ b/baselines/UMixer/arch/__init__.py @@ -0,0 +1 @@ +from .umixer_arch import UMixer \ No newline at end of file diff --git a/baselines/UMixer/arch/umixer_arch.py b/baselines/UMixer/arch/umixer_arch.py new file mode 100644 index 0000000..9396911 --- /dev/null +++ b/baselines/UMixer/arch/umixer_arch.py @@ -0,0 +1,234 @@ +import torch +import torch.nn as nn +import torch.fft +from .Embed import PatchEmbedding +from .RevIN import RevIN + + +def S_Correction(x, x_pre): + x_fft = torch.fft.rfft(x,dim=1,norm='ortho') + x_pre_fft = torch.fft.rfft(x_pre, dim=1, norm='ortho') + x_fft = x_fft * torch.conj(x_fft) + x_pre_fft = x_pre_fft * torch.conj(x_pre_fft) + x_ifft = torch.fft.irfft(x_fft, dim=1) # + x_pre_ifft = torch.fft.irfft(x_pre_fft, dim=1) + x_ifft = torch.clamp(x_ifft,min=0) + x_pre_ifft = torch.clamp(x_pre_ifft,min=0) + alpha = torch.sum(x_ifft*x_pre_ifft,dim=1,keepdim=True)/(torch.sum(x_pre_ifft*x_pre_ifft,dim=1,keepdim=True)+0.001) + #alpha = (x_ifft * x_pre_ifft) / (x_pre_ifft * x_pre_ifft + 0.001) + return torch.sqrt(alpha) + + +class Flatten_Head(nn.Module): + def __init__(self, n_vars, nf, target_window, head_dropout=0): + super().__init__() + self.n_vars = n_vars + self.flatten = nn.Flatten(start_dim=-2) + self.linear = nn.Linear(nf, target_window) + self.dropout = nn.Dropout(head_dropout) + + def forward(self, x): # x: [bs x nvars x d_model x patch_num] + x = self.flatten(x) + x = self.linear(x) + x = self.dropout(x) + return x + + +class moving_avg(nn.Module): + """ + Moving average block to highlight the trend of time series + """ + + def __init__(self, kernel_size, stride): + super(moving_avg, self).__init__() + self.kernel_size = kernel_size + self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0) + + def forward(self, x): + # x shape: batch,seq_len,channels + # padding on the both ends of time series + front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1) + end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1) + x = torch.cat([front, x, end], dim=1) + x = self.avg(x.permute(0, 2, 1)) + x = x.permute(0, 2, 1) + return x + + +class series_decomp(nn.Module): + """ + Series decomposition block + """ + def __init__(self, kernel_size): + 
super(series_decomp, self).__init__()
+        self.moving_avg = moving_avg(kernel_size, stride=1)
+
+    def forward(self, x):
+        moving_mean = self.moving_avg(x)
+        res = x - moving_mean
+        return res, moving_mean
+
+
+class series_decomp_multi(nn.Module):
+    """
+    Series decomposition block
+    """
+
+    def __init__(self, kernel_size):
+        super(series_decomp_multi, self).__init__()
+        self.kernel_size = kernel_size
+        self.moving_avg = [moving_avg(kernel, stride=1) for kernel in kernel_size]
+
+    def forward(self, x):
+        moving_mean = []
+        res = []
+        for func in self.moving_avg:
+            moving_avg = func(x)
+            moving_mean.append(moving_avg)
+            sea = x - moving_avg
+            res.append(sea)
+
+        sea = sum(res) / len(res)
+        moving_mean = sum(moving_mean) / len(moving_mean)
+        return sea, moving_mean
+
+
+class channelMix_CI_pat(nn.Module):
+    def __init__(self, **model_args):
+        super(channelMix_CI_pat, self).__init__()
+        self.pred_len = model_args['pred_len']
+        self.seq_len = model_args['seq_len']
+        self.stride = model_args['stride']
+        self.patch_len = model_args['patch_len']
+        self.d_model = model_args['d_model']
+        self.dropout = model_args["dropout"]
+        self.Pnum = int((self.pred_len + self.seq_len - self.patch_len) / self.stride + 2)
+
+        self.conv1 = nn.ModuleList(nn.Linear(self.Pnum, self.Pnum) for _ in range(self.d_model))
+        self.conv2 = nn.ModuleList(nn.Linear(self.Pnum, self.Pnum) for _ in range(self.d_model))
+        self.gelu = nn.GELU()
+        self.drop = nn.Dropout(self.dropout)
+        self.norm = nn.LayerNorm(self.d_model)
+        self.channels = self.d_model
+
+    def forward(self, x):
+        # allocate on the input's device instead of hard-coding 'cuda:0'
+        o = torch.zeros(x.shape, dtype=x.dtype, device=x.device)
+        for i in range(self.channels):
+            o[:, :, i] = self.drop(self.conv2[i](self.gelu(self.conv1[i](x[:, :, i]))))
+        res = o + x
+        res = self.norm(res)
+        return res
+
+
+class tempolMix_CI_pat(nn.Module):
+    def __init__(self, **model_args):
+        super(tempolMix_CI_pat, self).__init__()
+        self.pred_len = model_args['pred_len']
+        self.seq_len = model_args['seq_len']
+        self.stride = model_args['stride']
+        self.patch_len = model_args['patch_len']
+        self.d_model = model_args['d_model']
+        self.dropout = model_args["dropout"]
+        self.Pnum = int((self.pred_len + self.seq_len - self.patch_len) / self.stride + 2)
+
+        self.conv1 = nn.ModuleList(nn.Linear(self.d_model, self.d_model) for _ in range(self.Pnum))
+        self.conv2 = nn.ModuleList(nn.Linear(self.d_model, self.d_model) for _ in range(self.Pnum))
+        self.gelu = nn.GELU()
+        self.drop = nn.Dropout(self.dropout)
+        self.norm = nn.LayerNorm(self.d_model)
+        self.channels = self.Pnum
+
+    def forward(self, x):
+        # allocate on the input's device instead of hard-coding 'cuda:0'
+        o = torch.zeros(x.shape, dtype=x.dtype, device=x.device)
+        for i in range(self.channels):
+            o[:, i, :] = self.drop(self.conv2[i](self.gelu(self.conv1[i](x[:, i, :]))))
+        res = o + x
+        res = self.norm(res)
+        return res
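The Pnum formula used by these mixers should agree with what PatchEmbedding emits for the length-(seq_len + pred_len) input produced by predict_linear in the UMixer class below; the two coincide whenever (seq_len + pred_len - patch_len) is divisible by stride. A small check with illustrative lengths (the real ones come from get_regular_settings):

seq_len, pred_len, patch_len, stride = 336, 336, 16, 8
Pnum = int((pred_len + seq_len - patch_len) / stride + 2)   # 84
padded = seq_len + pred_len + stride                        # after ReplicationPad1d((0, stride))
n_patches = (padded - patch_len) // stride + 1              # 84
assert Pnum == n_patches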
+class UMixer(nn.Module):
+    def __init__(self, **model_args):
+        super(UMixer, self).__init__()
+        self.pred_len = model_args['pred_len']
+        self.seq_len = model_args['seq_len']
+        self.stride = model_args['stride']
+        self.patch_len = model_args['patch_len']
+        self.d_model = model_args['d_model']
+        self.dropout = model_args["dropout"]
+        self.Pnum = int((self.pred_len + self.seq_len - self.patch_len) / self.stride + 2)
+
+        self.layer = model_args['e_layers']
+
+        self.layer_norm = nn.LayerNorm(self.d_model)
+        self.predict_linear = nn.Linear(self.seq_len, self.pred_len+self.seq_len)
+        self.e_layers = model_args['e_layers']
+        self.d_layers = model_args['d_layers']
+        self.enc_in = model_args['enc_in']
+        self.dropout = model_args["dropout"]
+        self.c_out = model_args['c_out']
+
+        self.mlp_tempmix_md = nn.ModuleList([tempolMix_CI_pat(**model_args)
+                                             for _ in range(self.e_layers)])
+        self.mlp_chanmix_md = nn.ModuleList([channelMix_CI_pat(**model_args)
+                                             for _ in range(self.e_layers)])
+        self.mlp_tempmix_mu = nn.ModuleList([tempolMix_CI_pat(**model_args)
+                                             for _ in range(self.e_layers)])
+        self.mlp_chanmix_mu = nn.ModuleList([channelMix_CI_pat(**model_args)
+                                             for _ in range(self.e_layers)])
+
+        self.mlp_trend_ci = nn.ModuleList(nn.Linear(self.pred_len, self.d_model) for _ in range(self.c_out))
+        self.mlp_trend2_ci = nn.ModuleList(nn.Linear(self.d_model, self.pred_len) for _ in range(self.c_out))
+
+        self.revin = RevIN(self.enc_in)
+        self.patch_embedding = PatchEmbedding(
+            self.d_model, self.patch_len, self.stride, self.dropout)
+        self.head = Flatten_Head(self.enc_in, self.d_model * self.Pnum, self.pred_len,
+                                 head_dropout=self.dropout)
+        self.comb = nn.Linear(self.e_layers, 1)
+
+    def forecast(self, x_input):
+        x_ori = x_input.contiguous()
+        x_input = self.revin(x_input, 'norm')
+        x_input = self.predict_linear(x_input.permute(0, 2, 1))
+        x_input, n_vars = self.patch_embedding(x_input)
+
+        x_old, _ = self.patch_embedding(x_ori.permute(0, 2, 1))
+
+        # allocate on the input's device instead of hard-coding 'cuda:0'
+        x_all = torch.zeros([x_input.shape[0], x_input.shape[1], x_input.shape[2], self.layer],
+                            device=x_input.device)
+        for i in range(self.layer):
+            x_ud = self.mlp_tempmix_md[i](x_input)
+            x_ud = self.mlp_chanmix_md[i](x_ud)
+            # use a separate index for the upward pass; the inner loop previously
+            # reused `i`, so the write below always targeted slot 0
+            for j in range(i, -1, -1):
+                x_ud = self.mlp_tempmix_mu[j](x_ud)
+                x_ud = self.mlp_chanmix_mu[j](x_ud)
+            x_all[:, :, :, i] = x_ud
+        x_input = self.comb(x_all).squeeze(-1)
+        x_input = S_Correction(self.layer_norm(x_old), self.layer_norm(x_input[:, :x_old.shape[1], :])) * x_input
+        x_input = torch.reshape(
+            x_input, (-1, n_vars, x_input.shape[-2], x_input.shape[-1]))
+        x_input = x_input.permute(0, 1, 3, 2)
+
+        x_input = self.head(x_input)
+        x_input = x_input.permute(0, 2, 1)
+        x_input = self.revin(x_input, 'denorm')
+
+        x = x_input[:, -self.pred_len:, :]
+
+        return x
+
+    def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool,
+                **kwargs) -> torch.Tensor:
+        """
+
+        Args:
+            history_data (Tensor): Input data with shape: [B, L1, N, C]
+            future_data (Tensor): Future data with shape: [B, L2, N, C]
+
+        Returns:
+            torch.Tensor: outputs with shape [B, L2, N, 1]
+        """
+        x_input = history_data[:, :, :, 0]
+
+        out = self.forecast(x_input)
+        return out.unsqueeze(-1)  # [B, L2, N, 1]
diff --git a/baselines/iTransformer/Electricity.py b/baselines/iTransformer/Electricity.py
new file mode 100644
index 0000000..6729510
--- /dev/null
+++ b/baselines/iTransformer/Electricity.py
@@ -0,0 +1,158 @@
+import os
+import sys
+from easydict import EasyDict
+sys.path.append(os.path.abspath(__file__ + '/../../..'))
+from basicts.metrics import masked_mae, masked_mse
+from basicts.data import TimeSeriesForecastingDataset
+from basicts.runners import SimpleTimeSeriesForecastingRunner
+from basicts.scaler import ZScoreScaler
+from basicts.utils import get_regular_settings
+
+from .arch import iTransformer
+
+############################## Hot Parameters ##############################
+# Dataset & Metrics configuration
+DATA_NAME = 'Electricity' # Dataset name
+regular_settings = get_regular_settings(DATA_NAME)
+INPUT_LEN = regular_settings['INPUT_LEN'] # Length of input sequence
+OUTPUT_LEN = regular_settings['OUTPUT_LEN'] # Length of output sequence
+TRAIN_VAL_TEST_RATIO = regular_settings['TRAIN_VAL_TEST_RATIO'] # Train/Validation/Test split ratios
+NORM_EACH_CHANNEL =
regular_settings['NORM_EACH_CHANNEL'] # Whether to normalize each channel of the data +RESCALE = regular_settings['RESCALE'] # Whether to rescale the data +NULL_VAL = regular_settings['NULL_VAL'] # Null value in the data +# Model architecture and parameters +MODEL_ARCH = iTransformer +NUM_NODES = 321 +MODEL_PARAM = { + "enc_in": NUM_NODES, # num nodes + "dec_in": NUM_NODES, + "c_out": NUM_NODES, + "seq_len": INPUT_LEN, + "label_len": INPUT_LEN/2, # start token length used in decoder + "pred_len": OUTPUT_LEN, # prediction sequence length + "factor": 3, # attn factor + "p_hidden_dims": [128, 128], + "p_hidden_layers": 2, + "d_model": 512, + "moving_avg": 25, # window size of moving average. This is a CRUCIAL hyper-parameter. + "n_heads": 8, + "e_layers": 3, # num of encoder layers + "d_layers": 1, # num of decoder layers + "d_ff": 512, + "distil": True, + "sigma" : 0.2, + "dropout": 0.1, + "freq": 'h', + "use_norm" : False, + "output_attention": False, + "embed": "timeF", # [timeF, fixed, learned] + "activation": "gelu", + "num_time_features": 4, # number of used time features + "time_of_day_size": 24, + "day_of_week_size": 7, + "day_of_month_size": 31, + "day_of_year_size": 366 + } +NUM_EPOCHS = 100 + +############################## General Configuration ############################## +CFG = EasyDict() +# General settings +CFG.DESCRIPTION = 'An Example Config' +CFG.GPU_NUM = 1 # Number of GPUs to use (0 for CPU mode) +# Runner +CFG.RUNNER = SimpleTimeSeriesForecastingRunner + +############################## Dataset Configuration ############################## +CFG.DATASET = EasyDict() +# Dataset settings +CFG.DATASET.NAME = DATA_NAME +CFG.DATASET.TYPE = TimeSeriesForecastingDataset +CFG.DATASET.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_val_test_ratio': TRAIN_VAL_TEST_RATIO, + 'input_len': INPUT_LEN, + 'output_len': OUTPUT_LEN, + # 'mode' is automatically set by the runner +}) + +############################## Scaler Configuration ############################## +CFG.SCALER = EasyDict() +# Scaler settings +CFG.SCALER.TYPE = ZScoreScaler # Scaler class +CFG.SCALER.PARAM = EasyDict({ + 'dataset_name': DATA_NAME, + 'train_ratio': TRAIN_VAL_TEST_RATIO[0], + 'norm_each_channel': NORM_EACH_CHANNEL, + 'rescale': RESCALE, +}) + +############################## Model Configuration ############################## +CFG.MODEL = EasyDict() +# Model settings +CFG.MODEL.NAME = MODEL_ARCH.__name__ +CFG.MODEL.ARCH = MODEL_ARCH +CFG.MODEL.PARAM = MODEL_PARAM +CFG.MODEL.FORWARD_FEATURES = [0, 1, 2, 3, 4] +CFG.MODEL.TARGET_FEATURES = [0] + +############################## Metrics Configuration ############################## + +CFG.METRICS = EasyDict() +# Metrics settings +CFG.METRICS.FUNCS = EasyDict({ + 'MAE': masked_mae, + 'MSE': masked_mse + }) +CFG.METRICS.TARGET = 'MAE' +CFG.METRICS.NULL_VAL = NULL_VAL + +############################## Training Configuration ############################## +CFG.TRAIN = EasyDict() +CFG.TRAIN.NUM_EPOCHS = NUM_EPOCHS +CFG.TRAIN.CKPT_SAVE_DIR = os.path.join( + 'checkpoints', + MODEL_ARCH.__name__, + '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_EPOCHS), str(INPUT_LEN), str(OUTPUT_LEN)]) +) +CFG.TRAIN.LOSS = masked_mae +# Optimizer settings +CFG.TRAIN.OPTIM = EasyDict() +CFG.TRAIN.OPTIM.TYPE = "Adam" +CFG.TRAIN.OPTIM.PARAM = { + "lr": 0.0005, +} +# Learning rate scheduler settings +CFG.TRAIN.LR_SCHEDULER = EasyDict() +CFG.TRAIN.LR_SCHEDULER.TYPE = "MultiStepLR" +CFG.TRAIN.LR_SCHEDULER.PARAM = { + "milestones": [1, 25, 50], + "gamma": 0.5 +} +CFG.TRAIN.CLIP_GRAD_PARAM = { + 
'max_norm': 5.0 +} +# Train data loader settings +CFG.TRAIN.DATA = EasyDict() +CFG.TRAIN.DATA.BATCH_SIZE = 64 +CFG.TRAIN.DATA.SHUFFLE = True + +############################## Validation Configuration ############################## +CFG.VAL = EasyDict() +CFG.VAL.INTERVAL = 1 +CFG.VAL.DATA = EasyDict() +CFG.VAL.DATA.BATCH_SIZE = 64 + +############################## Test Configuration ############################## +CFG.TEST = EasyDict() +CFG.TEST.INTERVAL = 1 +CFG.TEST.DATA = EasyDict() +CFG.TEST.DATA.BATCH_SIZE = 64 + +############################## Evaluation Configuration ############################## + +CFG.EVAL = EasyDict() + +# Evaluation parameters +CFG.EVAL.HORIZONS = [12, 24, 48, 96, 192, 288, 336] +CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True diff --git a/baselines/iTransformer/arch/Embed.py b/baselines/iTransformer/arch/Embed.py new file mode 100644 index 0000000..33fae23 --- /dev/null +++ b/baselines/iTransformer/arch/Embed.py @@ -0,0 +1,143 @@ +import torch +import torch.nn as nn +import math + + +class PositionalEmbedding(nn.Module): + def __init__(self, d_model, max_len=5000): + super(PositionalEmbedding, self).__init__() + # Compute the positional encodings once in log space. + pe = torch.zeros(max_len, d_model).float() + pe.require_grad = False + + position = torch.arange(0, max_len).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + return self.pe[:, :x.size(1)] + + +class TokenEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(TokenEmbedding, self).__init__() + padding = 1 if torch.__version__ >= '1.5.0' else 2 + self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, + kernel_size=3, padding=padding, padding_mode='circular', bias=False) + for m in self.modules(): + if isinstance(m, nn.Conv1d): + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu') + + def forward(self, x): + x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) + return x + + +class FixedEmbedding(nn.Module): + def __init__(self, c_in, d_model): + super(FixedEmbedding, self).__init__() + + w = torch.zeros(c_in, d_model).float() + w.require_grad = False + + position = torch.arange(0, c_in).float().unsqueeze(1) + div_term = (torch.arange(0, d_model, 2).float() + * -(math.log(10000.0) / d_model)).exp() + + w[:, 0::2] = torch.sin(position * div_term) + w[:, 1::2] = torch.cos(position * div_term) + + self.emb = nn.Embedding(c_in, d_model) + self.emb.weight = nn.Parameter(w, requires_grad=False) + + def forward(self, x): + return self.emb(x).detach() + + +class TemporalEmbedding(nn.Module): + def __init__(self, d_model, embed_type='fixed', freq='h'): + super(TemporalEmbedding, self).__init__() + + minute_size = 4 + hour_size = 24 + weekday_size = 7 + day_size = 32 + month_size = 13 + + Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding + if freq == 't': + self.minute_embed = Embed(minute_size, d_model) + self.hour_embed = Embed(hour_size, d_model) + self.weekday_embed = Embed(weekday_size, d_model) + self.day_embed = Embed(day_size, d_model) + self.month_embed = Embed(month_size, d_model) + + def forward(self, x): + x = x.long() + minute_x = self.minute_embed(x[:, :, 4]) if hasattr( + self, 'minute_embed') else 0. 
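+        # Editor's note on the x_mark column layout assumed here (Informer-style
+        # time features): 0 = month, 1 = day of month, 2 = weekday, 3 = hour,
+        # 4 = minute bucket (minute_size = 4 suggests minutes grouped into
+        # 15-minute buckets); the minute embedding only exists when freq == 't'.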
+ hour_x = self.hour_embed(x[:, :, 3]) + weekday_x = self.weekday_embed(x[:, :, 2]) + day_x = self.day_embed(x[:, :, 1]) + month_x = self.month_embed(x[:, :, 0]) + + return hour_x + weekday_x + day_x + month_x + minute_x + + +class TimeFeatureEmbedding(nn.Module): + def __init__(self, d_model, embed_type='timeF', freq='h'): + super(TimeFeatureEmbedding, self).__init__() + + freq_map = {'h': 4, 't': 5, 's': 6, + 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3} + d_inp = freq_map[freq] + self.embed = nn.Linear(d_inp, d_model, bias=False) + + def forward(self, x): + return self.embed(x) + + +class DataEmbedding(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding, self).__init__() + + self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) + self.position_embedding = PositionalEmbedding(d_model=d_model) + self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, + freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( + d_model=d_model, embed_type=embed_type, freq=freq) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + if x_mark is None: + x = self.value_embedding(x) + self.position_embedding(x) + else: + x = self.value_embedding( + x) + self.temporal_embedding(x_mark) + self.position_embedding(x) + return self.dropout(x) + + +class DataEmbedding_inverted(nn.Module): + def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): + super(DataEmbedding_inverted, self).__init__() + self.value_embedding = nn.Linear(c_in, d_model) + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x, x_mark): + x = x.permute(0, 2, 1) + # x: [Batch Variate Time] + if x_mark is None: + x = self.value_embedding(x) + else: + # the potential to take covariates (e.g. 
timestamps) as tokens + x = self.value_embedding(torch.cat([x, x_mark.permute(0, 2, 1)], 1)) + # x: [Batch Variate d_model] + return self.dropout(x) + diff --git a/baselines/iTransformer/arch/SelfAttention_Family.py b/baselines/iTransformer/arch/SelfAttention_Family.py new file mode 100644 index 0000000..afe927b --- /dev/null +++ b/baselines/iTransformer/arch/SelfAttention_Family.py @@ -0,0 +1,302 @@ +import torch +import torch.nn as nn +import numpy as np +from math import sqrt +from .masking import TriangularCausalMask, ProbMask +from einops import rearrange + + +# Code implementation from https://github.com/thuml/Flowformer +class FlowAttention(nn.Module): + def __init__(self, attention_dropout=0.1): + super(FlowAttention, self).__init__() + self.dropout = nn.Dropout(attention_dropout) + + def kernel_method(self, x): + return torch.sigmoid(x) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + queries = queries.transpose(1, 2) + keys = keys.transpose(1, 2) + values = values.transpose(1, 2) + # kernel + queries = self.kernel_method(queries) + keys = self.kernel_method(keys) + # incoming and outgoing + normalizer_row = 1.0 / (torch.einsum("nhld,nhd->nhl", queries + 1e-6, keys.sum(dim=2) + 1e-6)) + normalizer_col = 1.0 / (torch.einsum("nhsd,nhd->nhs", keys + 1e-6, queries.sum(dim=2) + 1e-6)) + # reweighting + normalizer_row_refine = ( + torch.einsum("nhld,nhd->nhl", queries + 1e-6, (keys * normalizer_col[:, :, :, None]).sum(dim=2) + 1e-6)) + normalizer_col_refine = ( + torch.einsum("nhsd,nhd->nhs", keys + 1e-6, (queries * normalizer_row[:, :, :, None]).sum(dim=2) + 1e-6)) + # competition and allocation + normalizer_row_refine = torch.sigmoid( + normalizer_row_refine * (float(queries.shape[2]) / float(keys.shape[2]))) + normalizer_col_refine = torch.softmax(normalizer_col_refine, dim=-1) * keys.shape[2] # B h L vis + # multiply + kv = keys.transpose(-2, -1) @ (values * normalizer_col_refine[:, :, :, None]) + x = (((queries @ kv) * normalizer_row[:, :, :, None]) * normalizer_row_refine[:, :, :, None]).transpose(1, + 2).contiguous() + return x, None + + +# Code implementation from https://github.com/shreyansh26/FlashAttention-PyTorch +class FlashAttention(nn.Module): + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(FlashAttention, self).__init__() + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def flash_attention_forward(self, Q, K, V, mask=None): + BLOCK_SIZE = 32 + NEG_INF = -1e10 # -infinity + EPSILON = 1e-10 + # mask = torch.randint(0, 2, (128, 8)).to(device='cuda') + O = torch.zeros_like(Q, requires_grad=True) + l = torch.zeros(Q.shape[:-1])[..., None] + m = torch.ones(Q.shape[:-1])[..., None] * NEG_INF + + O = O.to(device='cuda') + l = l.to(device='cuda') + m = m.to(device='cuda') + + Q_BLOCK_SIZE = min(BLOCK_SIZE, Q.shape[-1]) + KV_BLOCK_SIZE = BLOCK_SIZE + + Q_BLOCKS = torch.split(Q, Q_BLOCK_SIZE, dim=2) + K_BLOCKS = torch.split(K, KV_BLOCK_SIZE, dim=2) + V_BLOCKS = torch.split(V, KV_BLOCK_SIZE, dim=2) + if mask is not None: + mask_BLOCKS = list(torch.split(mask, KV_BLOCK_SIZE, dim=1)) + + Tr = len(Q_BLOCKS) + Tc = len(K_BLOCKS) + + O_BLOCKS = list(torch.split(O, Q_BLOCK_SIZE, dim=2)) + l_BLOCKS = list(torch.split(l, Q_BLOCK_SIZE, dim=2)) + m_BLOCKS = list(torch.split(m, Q_BLOCK_SIZE, dim=2)) + + for j in range(Tc): + Kj = K_BLOCKS[j] + Vj = V_BLOCKS[j] + if mask is not None: + maskj = 
mask_BLOCKS[j] + + for i in range(Tr): + Qi = Q_BLOCKS[i] + Oi = O_BLOCKS[i] + li = l_BLOCKS[i] + mi = m_BLOCKS[i] + + scale = 1 / np.sqrt(Q.shape[-1]) + Qi_scaled = Qi * scale + + S_ij = torch.einsum('... i d, ... j d -> ... i j', Qi_scaled, Kj) + if mask is not None: + # Masking + maskj_temp = rearrange(maskj, 'b j -> b 1 1 j') + S_ij = torch.where(maskj_temp > 0, S_ij, NEG_INF) + + m_block_ij, _ = torch.max(S_ij, dim=-1, keepdims=True) + P_ij = torch.exp(S_ij - m_block_ij) + if mask is not None: + # Masking + P_ij = torch.where(maskj_temp > 0, P_ij, 0.) + + l_block_ij = torch.sum(P_ij, dim=-1, keepdims=True) + EPSILON + + P_ij_Vj = torch.einsum('... i j, ... j d -> ... i d', P_ij, Vj) + + mi_new = torch.maximum(m_block_ij, mi) + li_new = torch.exp(mi - mi_new) * li + torch.exp(m_block_ij - mi_new) * l_block_ij + + O_BLOCKS[i] = (li / li_new) * torch.exp(mi - mi_new) * Oi + ( + torch.exp(m_block_ij - mi_new) / li_new) * P_ij_Vj + l_BLOCKS[i] = li_new + m_BLOCKS[i] = mi_new + + O = torch.cat(O_BLOCKS, dim=2) + l = torch.cat(l_BLOCKS, dim=2) + m = torch.cat(m_BLOCKS, dim=2) + return O, l, m + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + res = \ + self.flash_attention_forward(queries.permute(0, 2, 1, 3), keys.permute(0, 2, 1, 3), values.permute(0, 2, 1, 3), + attn_mask)[0] + return res.permute(0, 2, 1, 3).contiguous(), None + + +class FullAttention(nn.Module): + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(FullAttention, self).__init__() + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, H, E = queries.shape + _, S, _, D = values.shape + scale = self.scale or 1. 
/ sqrt(E) + + scores = torch.einsum("blhe,bshe->bhls", queries, keys) + + if self.mask_flag: + if attn_mask is None: + attn_mask = TriangularCausalMask(B, L, device=queries.device) + + scores.masked_fill_(attn_mask.mask, -np.inf) + + A = self.dropout(torch.softmax(scale * scores, dim=-1)) + V = torch.einsum("bhls,bshd->blhd", A, values) + + if self.output_attention: + return (V.contiguous(), A) + else: + return (V.contiguous(), None) + + +# Code implementation from https://github.com/zhouhaoyi/Informer2020 +class ProbAttention(nn.Module): + def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + super(ProbAttention, self).__init__() + self.factor = factor + self.scale = scale + self.mask_flag = mask_flag + self.output_attention = output_attention + self.dropout = nn.Dropout(attention_dropout) + + def _prob_QK(self, Q, K, sample_k, n_top): # n_top: c*ln(L_q) + # Q [B, H, L, D] + B, H, L_K, E = K.shape + _, _, L_Q, _ = Q.shape + + # calculate the sampled Q_K + K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E) + # real U = U_part(factor*ln(L_k))*L_q + index_sample = torch.randint(L_K, (L_Q, sample_k)) + K_sample = K_expand[:, :, torch.arange( + L_Q).unsqueeze(1), index_sample, :] + Q_K_sample = torch.matmul( + Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze() + + # find the Top_k query with sparisty measurement + M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K) + M_top = M.topk(n_top, sorted=False)[1] + + # use the reduced Q to calculate Q_K + Q_reduce = Q[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + M_top, :] # factor*ln(L_q) + Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1)) # factor*ln(L_q)*L_k + + return Q_K, M_top + + def _get_initial_context(self, V, L_Q): + B, H, L_V, D = V.shape + if not self.mask_flag: + # V_sum = V.sum(dim=-2) + V_sum = V.mean(dim=-2) + contex = V_sum.unsqueeze(-2).expand(B, H, + L_Q, V_sum.shape[-1]).clone() + else: # use mask + # requires that L_Q == L_V, i.e. for self-attention only + assert (L_Q == L_V) + contex = V.cumsum(dim=-2) + return contex + + def _update_context(self, context_in, V, scores, index, L_Q, attn_mask): + B, H, L_V, D = V.shape + + if self.mask_flag: + attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device) + scores.masked_fill_(attn_mask.mask, -np.inf) + + attn = torch.softmax(scores, dim=-1) # nn.Softmax(dim=-1)(scores) + + context_in[torch.arange(B)[:, None, None], + torch.arange(H)[None, :, None], + index, :] = torch.matmul(attn, V).type_as(context_in) + if self.output_attention: + attns = (torch.ones([B, H, L_V, L_V]) / + L_V).type_as(attn).to(attn.device) + attns[torch.arange(B)[:, None, None], torch.arange(H)[ + None, :, None], index, :] = attn + return (context_in, attns) + else: + return (context_in, None) + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L_Q, H, D = queries.shape + _, L_K, _, _ = keys.shape + + queries = queries.transpose(2, 1) + keys = keys.transpose(2, 1) + values = values.transpose(2, 1) + + U_part = self.factor * \ + np.ceil(np.log(L_K)).astype('int').item() # c*ln(L_k) + u = self.factor * \ + np.ceil(np.log(L_Q)).astype('int').item() # c*ln(L_q) + + U_part = U_part if U_part < L_K else L_K + u = u if u < L_Q else L_Q + + scores_top, index = self._prob_QK( + queries, keys, sample_k=U_part, n_top=u) + + # add scale factor + scale = self.scale or 1. 
/ sqrt(D) + if scale is not None: + scores_top = scores_top * scale + # get the context + context = self._get_initial_context(values, L_Q) + # update the context with selected top_k queries + context, attn = self._update_context( + context, values, scores_top, index, L_Q, attn_mask) + + return context.contiguous(), attn + + +class AttentionLayer(nn.Module): + def __init__(self, attention, d_model, n_heads, d_keys=None, + d_values=None): + super(AttentionLayer, self).__init__() + + d_keys = d_keys or (d_model // n_heads) + d_values = d_values or (d_model // n_heads) + + self.inner_attention = attention + self.query_projection = nn.Linear(d_model, d_keys * n_heads) + self.key_projection = nn.Linear(d_model, d_keys * n_heads) + self.value_projection = nn.Linear(d_model, d_values * n_heads) + self.out_projection = nn.Linear(d_values * n_heads, d_model) + self.n_heads = n_heads + + def forward(self, queries, keys, values, attn_mask, tau=None, delta=None): + B, L, _ = queries.shape + _, S, _ = keys.shape + H = self.n_heads + + queries = self.query_projection(queries).view(B, L, H, -1) + keys = self.key_projection(keys).view(B, S, H, -1) + values = self.value_projection(values).view(B, S, H, -1) + + out, attn = self.inner_attention( + queries, + keys, + values, + attn_mask, + tau=tau, + delta=delta + ) + out = out.view(B, L, -1) + + return self.out_projection(out), attn + + + diff --git a/baselines/iTransformer/arch/Transformer_EncDec.py b/baselines/iTransformer/arch/Transformer_EncDec.py new file mode 100644 index 0000000..c48ddc3 --- /dev/null +++ b/baselines/iTransformer/arch/Transformer_EncDec.py @@ -0,0 +1,134 @@ +import torch.nn as nn +import torch.nn.functional as F + + +class ConvLayer(nn.Module): + def __init__(self, c_in): + super(ConvLayer, self).__init__() + self.downConv = nn.Conv1d(in_channels=c_in, + out_channels=c_in, + kernel_size=3, + padding=2, + padding_mode='circular') + self.norm = nn.BatchNorm1d(c_in) + self.activation = nn.ELU() + self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1) + + def forward(self, x): + x = self.downConv(x.permute(0, 2, 1)) + x = self.norm(x) + x = self.activation(x) + x = self.maxPool(x) + x = x.transpose(1, 2) + return x + + +class EncoderLayer(nn.Module): + def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"): + super(EncoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.attention = attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, attn_mask=None, tau=None, delta=None): + new_x, attn = self.attention( + x, x, x, + attn_mask=attn_mask, + tau=tau, delta=delta + ) + x = x + self.dropout(new_x) + + y = x = self.norm1(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm2(x + y), attn + + +class Encoder(nn.Module): + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): + super(Encoder, self).__init__() + self.attn_layers = nn.ModuleList(attn_layers) + self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.norm = norm_layer + + def forward(self, x, attn_mask=None, tau=None, delta=None): + # x [B, L, D] + attns = [] + if self.conv_layers is not None: + 
for i, (attn_layer, conv_layer) in enumerate(zip(self.attn_layers, self.conv_layers)): + delta = delta if i == 0 else None + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + x = conv_layer(x) + attns.append(attn) + x, attn = self.attn_layers[-1](x, tau=tau, delta=None) + attns.append(attn) + else: + for attn_layer in self.attn_layers: + x, attn = attn_layer(x, attn_mask=attn_mask, tau=tau, delta=delta) + attns.append(attn) + + if self.norm is not None: + x = self.norm(x) + + return x, attns + + +class DecoderLayer(nn.Module): + def __init__(self, self_attention, cross_attention, d_model, d_ff=None, + dropout=0.1, activation="relu"): + super(DecoderLayer, self).__init__() + d_ff = d_ff or 4 * d_model + self.self_attention = self_attention + self.cross_attention = cross_attention + self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1) + self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + self.activation = F.relu if activation == "relu" else F.gelu + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + x = x + self.dropout(self.self_attention( + x, x, x, + attn_mask=x_mask, + tau=tau, delta=None + )[0]) + x = self.norm1(x) + + x = x + self.dropout(self.cross_attention( + x, cross, cross, + attn_mask=cross_mask, + tau=tau, delta=delta + )[0]) + + y = x = self.norm2(x) + y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) + y = self.dropout(self.conv2(y).transpose(-1, 1)) + + return self.norm3(x + y) + + +class Decoder(nn.Module): + def __init__(self, layers, norm_layer=None, projection=None): + super(Decoder, self).__init__() + self.layers = nn.ModuleList(layers) + self.norm = norm_layer + self.projection = projection + + def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None): + for layer in self.layers: + x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask, tau=tau, delta=delta) + + if self.norm is not None: + x = self.norm(x) + + if self.projection is not None: + x = self.projection(x) + return x diff --git a/baselines/iTransformer/arch/__init__.py b/baselines/iTransformer/arch/__init__.py new file mode 100644 index 0000000..41dd6e0 --- /dev/null +++ b/baselines/iTransformer/arch/__init__.py @@ -0,0 +1 @@ +from .itransformer_arch import iTransformer \ No newline at end of file diff --git a/baselines/iTransformer/arch/itransformer_arch.py b/baselines/iTransformer/arch/itransformer_arch.py new file mode 100644 index 0000000..a499c59 --- /dev/null +++ b/baselines/iTransformer/arch/itransformer_arch.py @@ -0,0 +1,108 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from .Transformer_EncDec import Encoder, EncoderLayer +from .SelfAttention_Family import FullAttention, AttentionLayer +from .Embed import DataEmbedding_inverted +import numpy as np +from basicts.utils import data_transformation_4_xformer + +class iTransformer(nn.Module): + """ + Paper link: https://arxiv.org/abs/2310.06625 + """ + + def __init__(self, **model_args): + super(iTransformer, self).__init__() + self.pred_len = model_args['pred_len'] + self.seq_len = model_args['seq_len'] + self.output_attention = model_args['output_attention'] + self.enc_in = model_args['enc_in'] + self.dec_in = model_args['dec_in'] + self.c_out = model_args['c_out'] + self.factor = model_args["factor"] + self.d_model = 
+        self.n_heads = model_args['n_heads']
+        self.d_ff = model_args['d_ff']
+        self.embed = model_args['embed']
+        self.freq = model_args["freq"]
+        self.dropout = model_args["dropout"]
+        self.activation = model_args['activation']
+        self.e_layers = model_args['e_layers']
+        self.d_layers = model_args['d_layers']
+
+        self.use_norm = model_args['use_norm']
+        # Embedding
+        self.enc_embedding = DataEmbedding_inverted(self.seq_len, self.d_model, self.embed, self.freq,
+                                                    self.dropout)
+
+        # Encoder-only architecture
+        self.encoder = Encoder(
+            [
+                EncoderLayer(
+                    AttentionLayer(
+                        FullAttention(False, self.factor, attention_dropout=self.dropout,
+                                      output_attention=self.output_attention), self.d_model, self.n_heads),
+                    self.d_model,
+                    self.d_ff,
+                    dropout=self.dropout,
+                    activation=self.activation
+                ) for l in range(self.e_layers)
+            ],
+            norm_layer=torch.nn.LayerNorm(self.d_model)
+        )
+        self.projector = nn.Linear(self.d_model, self.pred_len, bias=True)
+
+    def forward_xformer(self, x_enc: torch.Tensor, x_mark_enc: torch.Tensor, x_dec: torch.Tensor,
+                        x_mark_dec: torch.Tensor,
+                        enc_self_mask: torch.Tensor = None, dec_self_mask: torch.Tensor = None,
+                        dec_enc_mask: torch.Tensor = None) -> torch.Tensor:
+
+        if self.use_norm:
+            # Normalization from the Non-stationary Transformer
+            means = x_enc.mean(1, keepdim=True).detach()
+            x_enc = x_enc - means
+            stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)
+            x_enc /= stdev
+
+        _, _, N = x_enc.shape  # B L N
+        # B: batch_size;    E: d_model;
+        # L: seq_len;       S: pred_len;
+        # N: number of variates (tokens); can also include covariates
+
+        # Embedding
+        # B L N -> B N E                (B L N -> B L E in the vanilla Transformer)
+        enc_out = self.enc_embedding(x_enc, x_mark_enc)  # covariates (e.g. timestamps) can also be embedded as tokens
+
+        # B N E -> B N E                (B L E -> B L E in the vanilla Transformer)
+        # the dimensions of the embedded time series have been inverted, so the native attention,
+        # layernorm, and FFN modules now operate along the variate dimension
+        enc_out, attns = self.encoder(enc_out, attn_mask=None)
+
+        # B N E -> B N S -> B S N
+        dec_out = self.projector(enc_out).permute(0, 2, 1)[:, :, :N]  # filter out the covariates
+
+        if self.use_norm:
+            # De-normalization from the Non-stationary Transformer
+            dec_out = dec_out * (stdev[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
+            dec_out = dec_out + (means[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
+
+        return dec_out
+
+    def forward(self, history_data: torch.Tensor, future_data: torch.Tensor, batch_seen: int, epoch: int, train: bool,
+                **kwargs) -> torch.Tensor:
+        """Feed forward of iTransformer.
+
+        Args:
+            history_data (Tensor): Input data with shape: [B, L1, N, C]
+            future_data (Tensor): Future data with shape: [B, L2, N, C]
+
+        Returns:
+            torch.Tensor: outputs with shape [B, L2, N, 1]
+        """
+
+        x_enc, x_mark_enc, x_dec, x_mark_dec = data_transformation_4_xformer(history_data=history_data,
+                                                                             future_data=future_data,
+                                                                             start_token_len=0)
+        prediction = self.forward_xformer(x_enc=x_enc, x_mark_enc=x_mark_enc, x_dec=x_dec, x_mark_dec=x_mark_dec)
+        return prediction.unsqueeze(-1)
\ No newline at end of file
diff --git a/baselines/iTransformer/arch/masking.py b/baselines/iTransformer/arch/masking.py
new file mode 100644
index 0000000..a19cbf6
--- /dev/null
+++ b/baselines/iTransformer/arch/masking.py
@@ -0,0 +1,26 @@
+import torch
+
+
+class TriangularCausalMask():
+    def __init__(self, B, L, device="cpu"):
+        mask_shape = [B, 1, L, L]
+        with torch.no_grad():
+            self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)
+
+    @property
+    def mask(self):
+        return self._mask
+
+
+class ProbMask():
+    def __init__(self, B, H, L, index, scores, device="cpu"):
+        _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1)
+        _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1])
+        indicator = _mask_ex[torch.arange(B)[:, None, None],
+                             torch.arange(H)[None, :, None],
+                             index, :].to(device)
+        self._mask = indicator.view(scores.shape).to(device)
+
+    @property
+    def mask(self):
+        return self._mask
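
A few standalone sketches follow for reviewers; they are not part of the patch. First, the inverted layout used in itransformer_arch.py: DataEmbedding_inverted turns each variate's entire length-L history into a single token, so attention in the encoder mixes information across variates rather than across time steps, and the final projector maps each token onto the forecast horizon. A minimal shape-flow sketch, where the sizes and the plain nn.Linear stand-ins are illustrative assumptions, not the patch's exact modules:

import torch
import torch.nn as nn

B, L, S, N, E = 2, 96, 24, 7, 64                # batch, seq_len, pred_len, variates, d_model (illustrative)
x_enc = torch.randn(B, L, N)

# Inverted embedding: B L N -> B N E, one token per variate.
embed = nn.Linear(L, E)                         # rough stand-in for DataEmbedding_inverted without covariates
tokens = embed(x_enc.permute(0, 2, 1))          # [B, N, E]

# The encoder preserves [B, N, E]; project each variate token onto the horizon.
projector = nn.Linear(E, S)
dec_out = projector(tokens).permute(0, 2, 1)    # B N E -> B N S -> B S N
assert dec_out.shape == (B, S, N)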
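
Second, the use_norm branch in forward_xformer is the per-variate instance normalization from the Non-stationary Transformer: center and scale each series over the input window, encode, then invert the transform on the predictions. A round-trip sketch mirroring the patch's exact tensor ops (sizes are illustrative assumptions):

import torch

B, L, S, N = 2, 96, 24, 7                       # batch, seq_len, pred_len, variates (illustrative)
x_enc = torch.randn(B, L, N) * 5 + 10           # deliberately non-zero mean, non-unit variance

means = x_enc.mean(1, keepdim=True).detach()    # [B, 1, N], per-variate temporal mean
x_norm = x_enc - means
stdev = torch.sqrt(torch.var(x_norm, dim=1, keepdim=True, unbiased=False) + 1e-5)
x_norm = x_norm / stdev                         # what the encoder actually sees

dec_out = torch.randn(B, S, N)                  # stand-in for the model's normalized prediction
dec_out = dec_out * stdev[:, 0, :].unsqueeze(1).repeat(1, S, 1)
dec_out = dec_out + means[:, 0, :].unsqueeze(1).repeat(1, S, 1)
# dec_out is now back on the original scale of x_enc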
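
Finally, a quick smoke test for masking.py: TriangularCausalMask builds a [B, 1, L, L] boolean mask that broadcasts over the head dimension, with True marking future positions; attention implementations typically consume such a mask by filling the flagged scores with -inf before the softmax. The sizes below are arbitrary:

import torch
from baselines.iTransformer.arch.masking import TriangularCausalMask  # the class defined above

B, H, L = 2, 4, 8                               # batch, heads, sequence length (arbitrary)
mask = TriangularCausalMask(B, L)               # mask.mask: [B, 1, L, L], True strictly above the diagonal
scores = torch.randn(B, H, L, L)                # dummy attention scores
scores = scores.masked_fill(mask.mask, float('-inf'))
attn = torch.softmax(scores, dim=-1)            # each query attends only to itself and earlier keys
assert torch.allclose(attn.sum(-1), torch.ones(B, H, L))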