init

Browse files

Files changed (6) hide show

README.md +41 -3
Simulator.ipynb +1071 -0
Training.ipynb +1433 -0
img/Solar_Transformer.png +0 -0
img/output.png +0 -0
models/model-best.h5 +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,41 @@
----
-license: mit
----

+# Solar Transformer
+Please check our paper [Solar Irradiance Forecasting with Transformer model](https://www.mdpi.com/2076-3417/12/17/8852) for more details.
+[![Issues](https://img.shields.io/github/issues/markub3327/Solar-Transformer)](https://github.com/markub3327/Solar-Transformer/issues)
+![Commits](https://img.shields.io/github/commit-activity/w/markub3327/Solar-Transformer)
+![Size](https://img.shields.io/github/repo-size/markub3327/Solar-Transformer)
+## Paper
+  * Vaswani, A.; Shazeer, N.; Parmar, N.; Uszkoreit, J.; Jones, L.; Gomez, A.N.; Kaiser, Ł.; Polosukhin, I. Attention is all you need. Advances in neural information processing systems 2017, 30.
+  * Dosovitskiy, A.; Beyer, L.; Kolesnikov, A.; Weissenborn, D.; Zhai, X.; Unterthiner, T.; Dehghani, M.; Minderer, M.; Heigold, G.; Gelly, S.; Uszkoreit, J. An image is worth 16x16 words: Transformers for image recognition at scale. 2020, arXiv preprint arXiv:2010.11929.
+  * Bao, H.; Dong, L.; Wei, F. Beit: Bert pre-training of image transformers. 2021, arXiv preprint arXiv:2106.08254.
+  * Brahma, B.; Wadhvani, R. Solar irradiance forecasting based on deep learning methodologies and multi-site data. Symmetry 2020, 12(11), p.1830. Available online: https://www.mdpi.com/2073-8994/12/11/1830
+## About
+Solar energy is one of the most popular sources of renewable energy today. It is therefore essential to be able to predict solar power generation and adapt energy needs to these predictions. This paper uses a Transformer deep neural network model, whose attention mechanism is typically applied to NLP or vision problems. Here it is extended by combining features based on their spatio-temporal properties for solar irradiance prediction. Predictions can be produced for arbitrarily long time horizons: each prediction is 1 day ahead, so it can be appended to the end of the input data along the timestep axis while the first (oldest) timestep is removed. A maximum worst-case mean absolute percentage error of 3.45% for the 1 day-ahead prediction was achieved, thus providing better results than the directly competing method.
+## Dataset
+[NASA POWER Project](https://power.larc.nasa.gov)
+Solar irradiance + Weather (temperature, humidity, pressure, wind speed, wind direction)
+## Model
+<p align="center">
+  <img src="img/Solar_Transformer.png">
+</p>
+## Results
+<p align="center">
+  <img src="img/output.png">
+</p>
+----------------------------------
+**Frameworks:** TensorFlow, NumPy, Pandas, Weights & Biases (wandb), Seaborn, Matplotlib

Simulator.ipynb ADDED Viewed

	@@ -0,0 +1,1071 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install wandb tensorflow_probability tensorflow_addons"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.layers import Add, Dense, Dropout, Layer, LayerNormalization, MultiHeadAttention\n",
+    "from tensorflow.keras.models import Model\n",
+    "from tensorflow.keras.initializers import TruncatedNormal\n",
+    "from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError, MeanAbsoluteError, MeanAbsolutePercentageError\n",
+    "from tensorflow_addons.metrics import RSquare\n",
+    "\n",
+    "import pandas as pd\n",
+    "import tensorflow as tf\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Plotting"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_prediction(targets, predictions, max_subplots=3):\n",
+    "  \"\"\"Plot inputs, targets and predictions for the first few samples.\n",
+    "\n",
+    "  One subplot is drawn per sample, up to `max_subplots`. Every curve uses\n",
+    "  index 0 of axis 2 and the last index of axis 3 (presumably the first patch\n",
+    "  and the solar-irradiance feature - TODO confirm against the dataset).\n",
+    "\n",
+    "  Args:\n",
+    "    targets: 4-D array indexed as [sample, timestep, ., .]. All but the last\n",
+    "      `horizon` timesteps are drawn as the input line; timesteps 1.. are\n",
+    "      drawn as target markers.\n",
+    "    predictions: 4-D array indexed like `targets`; it must provide\n",
+    "      `targets.shape[1] - 1` values per sample along axis 1.\n",
+    "    max_subplots: Upper bound on the number of samples that are plotted.\n",
+    "\n",
+    "  Note:\n",
+    "    Reads the notebook-level variable `horizon`, which has to be defined\n",
+    "    before this function is called.\n",
+    "  \"\"\"\n",
+    "  axis_font = dict(fontfamily=\"Arial\", fontsize=16)\n",
+    "\n",
+    "  plt.figure(figsize=(12, 15))\n",
+    "  rows = min(max_subplots, len(targets))\n",
+    "  for row in range(rows):\n",
+    "    plt.subplot(rows, 1, row + 1)\n",
+    "    plt.ylabel('Solar irradiance [kW-hr/m^2/day]', **axis_font)\n",
+    "\n",
+    "    steps = targets.shape[1]\n",
+    "    future_days = np.arange(1, steps)\n",
+    "\n",
+    "    # Input window: everything except the last `horizon` timesteps.\n",
+    "    plt.plot(\n",
+    "        np.arange(steps - horizon),\n",
+    "        targets[row, :-horizon, 0, -1],\n",
+    "        label='Inputs',\n",
+    "        marker='.',\n",
+    "        zorder=-10,\n",
+    "    )\n",
+    "\n",
+    "    # Ground truth, shifted by one step relative to the inputs.\n",
+    "    plt.scatter(\n",
+    "        future_days,\n",
+    "        targets[row, 1:, 0, -1],\n",
+    "        edgecolors='k',\n",
+    "        label='Targets',\n",
+    "        c='#2cb01d',\n",
+    "        s=64,\n",
+    "    )\n",
+    "\n",
+    "    # Model output for the same timesteps.\n",
+    "    plt.scatter(\n",
+    "        future_days,\n",
+    "        predictions[row, :, 0, -1],\n",
+    "        marker='X',\n",
+    "        edgecolors='k',\n",
+    "        label='Predictions',\n",
+    "        c='#fe7e0f',\n",
+    "        s=64,\n",
+    "    )\n",
+    "\n",
+    "    # A single legend on the first subplot is enough.\n",
+    "    if row == 0:\n",
+    "      plt.legend()\n",
+    "\n",
+    "  plt.xlabel('Time [day]', **axis_font)\n",
+    "  plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _cosine_similarity_matrix(pos):\n",
+    "  \"\"\"Return the pairwise cosine similarity between the rows of `pos`.\n",
+    "\n",
+    "  Args:\n",
+    "    pos: Array-like of embeddings, one per row (assumed to be 2-D,\n",
+    "      (items, embedding_dim) - TODO confirm at the call sites).\n",
+    "\n",
+    "  Returns:\n",
+    "    Square array whose entry (i, j) is dot(pos_i, pos_j) / (|pos_i| * |pos_j|).\n",
+    "    Rows with zero norm produce NaN entries.\n",
+    "  \"\"\"\n",
+    "  pos = np.asarray(pos)\n",
+    "  norms = np.linalg.norm(pos, axis=-1)\n",
+    "  # The outer product scales entry (i, j) by |pos_i| * |pos_j|. An elementwise\n",
+    "  # `norms * norms` would broadcast as |pos_j|**2 and skew every off-diagonal\n",
+    "  # entry whenever the rows have different norms.\n",
+    "  return np.dot(pos, np.transpose(pos)) / np.outer(norms, norms)\n",
+    "\n",
+    "def _plot_similarity_heatmap(pos, title, axis_label):\n",
+    "  \"\"\"Draw the cosine-similarity heatmap of `pos` with the given title and axis label.\"\"\"\n",
+    "  similarity_scores = _cosine_similarity_matrix(pos)\n",
+    "\n",
+    "  plt.figure(figsize=(7, 7), dpi=300)\n",
+    "  ax = sns.heatmap(similarity_scores, center=0)\n",
+    "  ax.set_title(title, fontfamily=\"Arial\", fontsize=16)\n",
+    "  ax.set_xlabel(axis_label, fontfamily=\"Arial\", fontsize=16)\n",
+    "  ax.set_ylabel(axis_label, fontfamily=\"Arial\", fontsize=16)\n",
+    "  plt.show()\n",
+    "\n",
+    "def patch_similarity_plot(pos):\n",
+    "  \"\"\"Plot the cosine similarity between spatial (per-patch) positional embeddings.\n",
+    "\n",
+    "  Args:\n",
+    "    pos: Embedding table with one row per patch.\n",
+    "  \"\"\"\n",
+    "  _plot_similarity_heatmap(pos, \"Spatial Positional Embedding\", \"Patch\")\n",
+    "\n",
+    "def timestep_similarity_plot(pos):\n",
+    "  \"\"\"Plot the cosine similarity between temporal (per-timestep) positional embeddings.\n",
+    "\n",
+    "  Args:\n",
+    "    pos: Embedding table with one row per timestep.\n",
+    "  \"\"\"\n",
+    "  _plot_similarity_heatmap(pos, \"Temporal Positional Embedding\", \"Timestep\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Layer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Normalization(tf.keras.layers.experimental.preprocessing.PreprocessingLayer):\n",
+    "    \"\"\"A preprocessing layer which normalizes continuous features.\n",
+    "\n",
+    "    This layer will shift and scale inputs into a distribution centered around\n",
+    "    0 with standard deviation 1. It accomplishes this by precomputing the mean\n",
+    "    and variance of the data, and calling `(input - mean) / sqrt(var)` at\n",
+    "    runtime.\n",
+    "\n",
+    "    The mean and variance values for the layer must be either supplied on\n",
+    "    construction or learned via `adapt()`. `adapt()` will compute the mean and\n",
+    "    variance of the data and store them as the layer's weights. `adapt()` should\n",
+    "    be called before `fit()`, `evaluate()`, or `predict()`.\n",
+    "\n",
+    "    For an overview and full list of preprocessing layers, see the preprocessing\n",
+    "    [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers).\n",
+    "\n",
+    "    Args:\n",
+    "        axis: Integer, tuple of integers, or None. The axis or axes that should\n",
+    "          have a separate mean and variance for each index in the shape. For\n",
+    "          example, if shape is `(None, 5)` and `axis=1`, the layer will track 5\n",
+    "          separate mean and variance values for the last axis. If `axis` is set\n",
+    "          to `None`, the layer will normalize all elements in the input by a\n",
+    "          scalar mean and variance. Defaults to -1, where the last axis of the\n",
+    "          input is assumed to be a feature dimension and is normalized per\n",
+    "          index. Note that in the specific case of batched scalar inputs where\n",
+    "          the only axis is the batch axis, the default will normalize each index\n",
+    "          in the batch separately. In this case, consider passing `axis=None`.\n",
+    "        mean: The mean value(s) to use during normalization. The passed value(s)\n",
+    "          will be broadcast to the shape of the kept axes above; if the value(s)\n",
+    "          cannot be broadcast, an error will be raised when this layer's\n",
+    "          `build()` method is called.\n",
+    "        variance: The variance value(s) to use during normalization. The passed\n",
+    "          value(s) will be broadcast to the shape of the kept axes above; if the\n",
+    "          value(s) cannot be broadcast, an error will be raised when this\n",
+    "          layer's `build()` method is called.\n",
+    "        invert: If True, `call()` computes `(input + mean) * sqrt(var)` instead\n",
+    "          of `(input - mean) / sqrt(var)`.\n",
+    "          NOTE(review): this equals the exact inverse transformation,\n",
+    "          `input * sqrt(var) + mean`, only when the variance is 1 (as in the\n",
+    "          de-normalization example below). The formula is intentionally left\n",
+    "          unchanged here because weights trained through this layer may depend\n",
+    "          on it - confirm against the training notebook before correcting it.\n",
+    "\n",
+    "    Examples:\n",
+    "    (The examples use the name `tf.keras.layers.Normalization`; this class\n",
+    "    takes the same constructor arguments.)\n",
+    "\n",
+    "    Calculate a global mean and variance by analyzing the dataset in `adapt()`.\n",
+    "    >>> adapt_data = np.array([1., 2., 3., 4., 5.], dtype='float32')\n",
+    "    >>> input_data = np.array([1., 2., 3.], dtype='float32')\n",
+    "    >>> layer = tf.keras.layers.Normalization(axis=None)\n",
+    "    >>> layer.adapt(adapt_data)\n",
+    "    >>> layer(input_data)\n",
+    "    <tf.Tensor: shape=(3,), dtype=float32, numpy=\n",
+    "    array([-1.4142135, -0.70710677, 0.], dtype=float32)>\n",
+    "\n",
+    "    Calculate a mean and variance for each index on the last axis.\n",
+    "    >>> adapt_data = np.array([[0., 7., 4.],\n",
+    "    ...                        [2., 9., 6.],\n",
+    "    ...                        [0., 7., 4.],\n",
+    "    ...                        [2., 9., 6.]], dtype='float32')\n",
+    "    >>> input_data = np.array([[0., 7., 4.]], dtype='float32')\n",
+    "    >>> layer = tf.keras.layers.Normalization(axis=-1)\n",
+    "    >>> layer.adapt(adapt_data)\n",
+    "    >>> layer(input_data)\n",
+    "    <tf.Tensor: shape=(1, 3), dtype=float32, numpy=\n",
+    "    array([-1., -1., -1.], dtype=float32)>\n",
+    "\n",
+    "    Pass the mean and variance directly.\n",
+    "    >>> input_data = np.array([[1.], [2.], [3.]], dtype='float32')\n",
+    "    >>> layer = tf.keras.layers.Normalization(mean=3., variance=2.)\n",
+    "    >>> layer(input_data)\n",
+    "    <tf.Tensor: shape=(3, 1), dtype=float32, numpy=\n",
+    "    array([[-1.4142135 ],\n",
+    "           [-0.70710677],\n",
+    "           [ 0.        ]], dtype=float32)>\n",
+    "\n",
+    "    Use the layer to de-normalize inputs (after adapting the layer).\n",
+    "    >>> adapt_data = np.array([[0., 7., 4.],\n",
+    "    ...                        [2., 9., 6.],\n",
+    "    ...                        [0., 7., 4.],\n",
+    "    ...                        [2., 9., 6.]], dtype='float32')\n",
+    "    >>> input_data = np.array([[1., 2., 3.]], dtype='float32')\n",
+    "    >>> layer = tf.keras.layers.Normalization(axis=-1, invert=True)\n",
+    "    >>> layer.adapt(adapt_data)\n",
+    "    >>> layer(input_data)\n",
+    "    <tf.Tensor: shape=(1, 3), dtype=float32, numpy=\n",
+    "    array([2., 10., 8.], dtype=float32)>\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(\n",
+    "        self, axis=-1, mean=None, variance=None, invert=False, **kwargs\n",
+    "    ):\n",
+    "        super().__init__(**kwargs)\n",
+    "\n",
+    "        # Standardize `axis` to a tuple.\n",
+    "        if axis is None:\n",
+    "            axis = ()\n",
+    "        elif isinstance(axis, int):\n",
+    "            axis = (axis,)\n",
+    "        else:\n",
+    "            axis = tuple(axis)\n",
+    "        self.axis = axis\n",
+    "\n",
+    "        # Set `mean` and `variance` if passed.\n",
+    "        if isinstance(mean, tf.Variable):\n",
+    "            raise ValueError(\n",
+    "                \"Normalization does not support passing a Variable \"\n",
+    "                \"for the `mean` init arg.\"\n",
+    "            )\n",
+    "        if isinstance(variance, tf.Variable):\n",
+    "            raise ValueError(\n",
+    "                \"Normalization does not support passing a Variable \"\n",
+    "                \"for the `variance` init arg.\"\n",
+    "            )\n",
+    "        # Either both statistics are given (static mode) or neither (adapt mode).\n",
+    "        if (mean is not None) != (variance is not None):\n",
+    "            raise ValueError(\n",
+    "                \"When setting values directly, both `mean` and `variance` \"\n",
+    "                \"must be set. Got mean: {} and variance: {}\".format(\n",
+    "                    mean, variance\n",
+    "                )\n",
+    "            )\n",
+    "        self.input_mean = mean\n",
+    "        self.input_variance = variance\n",
+    "        self.invert = invert\n",
+    "\n",
+    "    def build(self, input_shape):\n",
+    "        \"\"\"Validate `axis` against `input_shape` and create the statistics.\n",
+    "\n",
+    "        In adapt mode (no `mean`/`variance` passed) this creates the\n",
+    "        non-trainable `mean`, `variance` and `count` weights; otherwise it\n",
+    "        stores the given statistics as constant tensors reshaped for\n",
+    "        broadcasting against the input.\n",
+    "\n",
+    "        Raises:\n",
+    "            ValueError: If several input shapes are passed, if an axis is out\n",
+    "              of range, or if a kept axis has unknown size.\n",
+    "        \"\"\"\n",
+    "        super().build(input_shape)\n",
+    "\n",
+    "        if isinstance(input_shape, (list, tuple)) and all(\n",
+    "            isinstance(shape, tf.TensorShape) for shape in input_shape\n",
+    "        ):\n",
+    "            raise ValueError(\n",
+    "                \"Normalization only accepts a single input. If you are \"\n",
+    "                \"passing a python list or tuple as a single input, \"\n",
+    "                \"please convert to a numpy array or `tf.Tensor`.\"\n",
+    "            )\n",
+    "\n",
+    "        input_shape = tf.TensorShape(input_shape).as_list()\n",
+    "        ndim = len(input_shape)\n",
+    "\n",
+    "        if any(a < -ndim or a >= ndim for a in self.axis):\n",
+    "            raise ValueError(\n",
+    "                \"All `axis` values must be in the range [-ndim, ndim). \"\n",
+    "                \"Found ndim: `{}`, axis: {}\".format(ndim, self.axis)\n",
+    "            )\n",
+    "\n",
+    "        # Axes to be kept, replacing negative values with positive equivalents.\n",
+    "        # Sorted to avoid transposing axes.\n",
+    "        self._keep_axis = sorted([d if d >= 0 else d + ndim for d in self.axis])\n",
+    "        # All axes to be kept should have known shape.\n",
+    "        for d in self._keep_axis:\n",
+    "            if input_shape[d] is None:\n",
+    "                raise ValueError(\n",
+    "                    \"All `axis` values to be kept must have known shape. \"\n",
+    "                    \"Got axis: {}, \"\n",
+    "                    \"input shape: {}, with unknown axis at index: {}\".format(\n",
+    "                        self.axis, input_shape, d\n",
+    "                    )\n",
+    "                )\n",
+    "        # Axes to be reduced.\n",
+    "        self._reduce_axis = [d for d in range(ndim) if d not in self._keep_axis]\n",
+    "        # 1 if an axis should be reduced, 0 otherwise.\n",
+    "        self._reduce_axis_mask = [\n",
+    "            0 if d in self._keep_axis else 1 for d in range(ndim)\n",
+    "        ]\n",
+    "        # Broadcast any reduced axes.\n",
+    "        self._broadcast_shape = [\n",
+    "            input_shape[d] if d in self._keep_axis else 1 for d in range(ndim)\n",
+    "        ]\n",
+    "        mean_and_var_shape = tuple(input_shape[d] for d in self._keep_axis)\n",
+    "\n",
+    "        if self.input_mean is None:\n",
+    "            self.adapt_mean = self.add_weight(\n",
+    "                name=\"mean\",\n",
+    "                shape=mean_and_var_shape,\n",
+    "                dtype=self.compute_dtype,\n",
+    "                initializer=\"zeros\",\n",
+    "                trainable=False,\n",
+    "            )\n",
+    "            self.adapt_variance = self.add_weight(\n",
+    "                name=\"variance\",\n",
+    "                shape=mean_and_var_shape,\n",
+    "                dtype=self.compute_dtype,\n",
+    "                initializer=\"ones\",\n",
+    "                trainable=False,\n",
+    "            )\n",
+    "            self.count = self.add_weight(\n",
+    "                name=\"count\",\n",
+    "                shape=(),\n",
+    "                dtype=tf.int64,\n",
+    "                initializer=\"zeros\",\n",
+    "                trainable=False,\n",
+    "            )\n",
+    "            # Populate `self.mean` / `self.variance` from the fresh weights so\n",
+    "            # the layer is callable even before `adapt()`.\n",
+    "            self.finalize_state()\n",
+    "        else:\n",
+    "            # In the no adapt case, make constant tensors for mean and variance\n",
+    "            # with proper broadcast shape for use during call.\n",
+    "            mean = self.input_mean * np.ones(mean_and_var_shape)\n",
+    "            variance = self.input_variance * np.ones(mean_and_var_shape)\n",
+    "            mean = tf.reshape(mean, self._broadcast_shape)\n",
+    "            variance = tf.reshape(variance, self._broadcast_shape)\n",
+    "            self.mean = tf.cast(mean, self.compute_dtype)\n",
+    "            self.variance = tf.cast(variance, self.compute_dtype)\n",
+    "\n",
+    "    # We override this method solely to generate a docstring.\n",
+    "    def adapt(self, data, batch_size=None, steps=None):\n",
+    "        \"\"\"Computes the mean and variance of values in a dataset.\n",
+    "\n",
+    "        Calling `adapt()` on a `Normalization` layer is an alternative to\n",
+    "        passing in `mean` and `variance` arguments during layer construction. A\n",
+    "        `Normalization` layer should always either be adapted over a dataset or\n",
+    "        passed `mean` and `variance`.\n",
+    "\n",
+    "        During `adapt()`, the layer will compute a `mean` and `variance`\n",
+    "        separately for each position in each axis specified by the `axis`\n",
+    "        argument. To calculate a single `mean` and `variance` over the input\n",
+    "        data, simply pass `axis=None`.\n",
+    "\n",
+    "        In order to make `Normalization` efficient in any distribution context,\n",
+    "        the computed mean and variance are kept static with respect to any\n",
+    "        compiled `tf.Graph`s that call the layer. As a consequence, if the layer\n",
+    "        is adapted a second time, any models using the layer should be\n",
+    "        re-compiled. For more information see\n",
+    "        `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`.\n",
+    "\n",
+    "        `adapt()` is meant only as a single machine utility to compute layer\n",
+    "        state.  To analyze a dataset that cannot fit on a single machine, see\n",
+    "        [Tensorflow Transform](\n",
+    "        https://www.tensorflow.org/tfx/transform/get_started)\n",
+    "        for a multi-machine, map-reduce solution.\n",
+    "\n",
+    "        Arguments:\n",
+    "          data: The data to train on. It can be passed either as a\n",
+    "              `tf.data.Dataset`, or as a numpy array.\n",
+    "          batch_size: Integer or `None`.\n",
+    "              Number of samples per state update.\n",
+    "              If unspecified, `batch_size` will default to 32.\n",
+    "              Do not specify the `batch_size` if your data is in the\n",
+    "              form of datasets, generators, or `keras.utils.Sequence` instances\n",
+    "              (since they generate batches).\n",
+    "          steps: Integer or `None`.\n",
+    "              Total number of steps (batches of samples)\n",
+    "              When training with input tensors such as\n",
+    "              TensorFlow data tensors, the default `None` is equal to\n",
+    "              the number of samples in your dataset divided by\n",
+    "              the batch size, or 1 if that cannot be determined. If x is a\n",
+    "              `tf.data` dataset, and 'steps' is None, the epoch will run until\n",
+    "              the input dataset is exhausted. When passing an infinitely\n",
+    "              repeating dataset, you must specify the `steps` argument. This\n",
+    "              argument is not supported with array inputs.\n",
+    "        \"\"\"\n",
+    "        super().adapt(data, batch_size=batch_size, steps=steps)\n",
+    "\n",
+    "    def update_state(self, data):\n",
+    "        \"\"\"Fold one batch of `data` into the running mean, variance and count.\n",
+    "\n",
+    "        Raises:\n",
+    "            ValueError: If the layer was constructed with static statistics.\n",
+    "            RuntimeError: If the layer has not been built yet.\n",
+    "        \"\"\"\n",
+    "        if self.input_mean is not None:\n",
+    "            raise ValueError(\n",
+    "                \"Cannot `adapt` a Normalization layer that is initialized with \"\n",
+    "                \"static `mean` and `variance`, \"\n",
+    "                \"you passed mean {} and variance {}.\".format(\n",
+    "                    self.input_mean, self.input_variance\n",
+    "                )\n",
+    "            )\n",
+    "\n",
+    "        if not self.built:\n",
+    "            raise RuntimeError(\"`build` must be called before `update_state`.\")\n",
+    "\n",
+    "        data = self._standardize_inputs(data)\n",
+    "        data = tf.cast(data, self.adapt_mean.dtype)\n",
+    "        batch_mean, batch_variance = tf.nn.moments(data, axes=self._reduce_axis)\n",
+    "        batch_shape = tf.shape(data, out_type=self.count.dtype)\n",
+    "        # Number of elements that contributed to each per-index statistic.\n",
+    "        if self._reduce_axis:\n",
+    "            batch_reduce_shape = tf.gather(batch_shape, self._reduce_axis)\n",
+    "            batch_count = tf.reduce_prod(batch_reduce_shape)\n",
+    "        else:\n",
+    "            batch_count = 1\n",
+    "\n",
+    "        total_count = batch_count + self.count\n",
+    "        batch_weight = tf.cast(batch_count, dtype=self.compute_dtype) / tf.cast(\n",
+    "            total_count, dtype=self.compute_dtype\n",
+    "        )\n",
+    "        existing_weight = 1.0 - batch_weight\n",
+    "\n",
+    "        total_mean = (\n",
+    "            self.adapt_mean * existing_weight + batch_mean * batch_weight\n",
+    "        )\n",
+    "        # The variance is computed using the lack-of-fit sum of squares\n",
+    "        # formula (see\n",
+    "        # https://en.wikipedia.org/wiki/Lack-of-fit_sum_of_squares).\n",
+    "        total_variance = (\n",
+    "            self.adapt_variance + (self.adapt_mean - total_mean) ** 2\n",
+    "        ) * existing_weight + (\n",
+    "            batch_variance + (batch_mean - total_mean) ** 2\n",
+    "        ) * batch_weight\n",
+    "        self.adapt_mean.assign(total_mean)\n",
+    "        self.adapt_variance.assign(total_variance)\n",
+    "        self.count.assign(total_count)\n",
+    "\n",
+    "    def reset_state(self):\n",
+    "        \"\"\"Reset the adapted statistics to mean 0, variance 1 and count 0.\n",
+    "\n",
+    "        Does nothing for an unbuilt layer or one with static statistics.\n",
+    "        \"\"\"\n",
+    "        if self.input_mean is not None or not self.built:\n",
+    "            return\n",
+    "\n",
+    "        self.adapt_mean.assign(tf.zeros_like(self.adapt_mean))\n",
+    "        self.adapt_variance.assign(tf.ones_like(self.adapt_variance))\n",
+    "        self.count.assign(tf.zeros_like(self.count))\n",
+    "\n",
+    "    def finalize_state(self):\n",
+    "        \"\"\"Refresh `self.mean` / `self.variance` from the adapted weights.\n",
+    "\n",
+    "        Does nothing for an unbuilt layer or one with static statistics.\n",
+    "        \"\"\"\n",
+    "        if self.input_mean is not None or not self.built:\n",
+    "            return\n",
+    "\n",
+    "        # In the adapt case, we make constant tensors for mean and variance with\n",
+    "        # proper broadcast shape and dtype each time `finalize_state` is called.\n",
+    "        self.mean = tf.reshape(self.adapt_mean, self._broadcast_shape)\n",
+    "        self.mean = tf.cast(self.mean, self.compute_dtype)\n",
+    "        self.variance = tf.reshape(self.adapt_variance, self._broadcast_shape)\n",
+    "        self.variance = tf.cast(self.variance, self.compute_dtype)\n",
+    "\n",
+    "    def call(self, inputs):\n",
+    "        \"\"\"Apply the forward or (when `invert` is set) the inverting transform.\n",
+    "\n",
+    "        The standard deviation is clamped from below by the Keras epsilon.\n",
+    "        \"\"\"\n",
+    "        inputs = self._standardize_inputs(inputs)\n",
+    "        # The base layer automatically casts floating-point inputs, but we\n",
+    "        # explicitly cast here to also allow integer inputs to be passed\n",
+    "        inputs = tf.cast(inputs, self.compute_dtype)\n",
+    "        if self.invert:\n",
+    "            # NOTE(review): the exact inverse of the branch below would be\n",
+    "            # `self.mean + inputs * std`; this expression adds the mean BEFORE\n",
+    "            # scaling, so it differs whenever the variance is not 1. Kept as-is\n",
+    "            # because previously trained weights may rely on it - confirm\n",
+    "            # before changing.\n",
+    "            return (inputs + self.mean) * tf.maximum(\n",
+    "                tf.sqrt(self.variance), tf.keras.backend.epsilon()\n",
+    "            )\n",
+    "        else:\n",
+    "            return (inputs - self.mean) / tf.maximum(\n",
+    "                tf.sqrt(self.variance), tf.keras.backend.epsilon()\n",
+    "            )\n",
+    "\n",
+    "    def compute_output_shape(self, input_shape):\n",
+    "        \"\"\"The transform is elementwise, so the shape is unchanged.\"\"\"\n",
+    "        return input_shape\n",
+    "\n",
+    "    def compute_output_signature(self, input_spec):\n",
+    "        \"\"\"The transform is elementwise, so the signature is unchanged.\"\"\"\n",
+    "        return input_spec\n",
+    "\n",
+    "    def get_config(self):\n",
+    "        \"\"\"Return the constructor arguments needed to re-create this layer.\n",
+    "\n",
+    "        `invert` is included so that a layer rebuilt from its config keeps its\n",
+    "        direction instead of silently falling back to forward normalization.\n",
+    "        \"\"\"\n",
+    "        config = super().get_config()\n",
+    "        config.update(\n",
+    "            {\n",
+    "                \"axis\": self.axis,\n",
+    "                \"invert\": self.invert,\n",
+    "                \"mean\": self._listify(self.input_mean),\n",
+    "                \"variance\": self._listify(self.input_variance),\n",
+    "            }\n",
+    "        )\n",
+    "        return config\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def _listify(value):\n",
+    "        \"\"\"Convert tensors / numpy arrays to plain lists for config serialization.\n",
+    "\n",
+    "        Implemented locally so `get_config()` does not depend on a private\n",
+    "        Keras utility module that is not part of the public `tf.keras` API.\n",
+    "        \"\"\"\n",
+    "        if tf.is_tensor(value):\n",
+    "            value = value.numpy()\n",
+    "        if isinstance(value, np.ndarray):\n",
+    "            value = value.tolist()\n",
+    "        return value\n",
+    "\n",
+    "    def _standardize_inputs(self, inputs):\n",
+    "        \"\"\"Convert `inputs` to a tensor of the layer's compute dtype.\"\"\"\n",
+    "        inputs = tf.convert_to_tensor(inputs)\n",
+    "        if inputs.dtype != self.compute_dtype:\n",
+    "            inputs = tf.cast(inputs, self.compute_dtype)\n",
+    "        return inputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class PositionalEmbedding(Layer):\n",
+    "    \"\"\"Projects the features to `units` dimensions and adds learned positions.\n",
+    "\n",
+    "    Two trainable embedding tables are created in `build()`: a temporal one with\n",
+    "    an entry per index of input axis 1 and a spatial one with an entry per index\n",
+    "    of input axis 2. Both broadcast over the remaining axes when they are added\n",
+    "    to the projected input, so the input is expected to be 4-D\n",
+    "    (presumably (batch, timestep, patch, feature) - TODO confirm).\n",
+    "\n",
+    "    Args:\n",
+    "        units: Output size of the Dense projection and of each embedding vector.\n",
+    "        dropout_rate: Dropout rate applied to the embedded output.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, units, dropout_rate, **kwargs):\n",
+    "        super(PositionalEmbedding, self).__init__(**kwargs)\n",
+    "\n",
+    "        self.units = units\n",
+    "\n",
+    "        self.projection = Dense(units, kernel_initializer=TruncatedNormal(stddev=0.02))\n",
+    "        self.dropout = Dropout(rate=dropout_rate)\n",
+    "\n",
+    "    def build(self, input_shape):\n",
+    "        \"\"\"Create the positional tables sized from axes 1 and 2 of `input_shape`.\"\"\"\n",
+    "        super(PositionalEmbedding, self).build(input_shape)\n",
+    "\n",
+    "        # One vector per index of axis 1; broadcasts over batch and axis 2.\n",
+    "        self.temporal_position = self.add_weight(\n",
+    "            name=\"temporal_position\",\n",
+    "            shape=(1, input_shape[1], 1, self.units),\n",
+    "            initializer=TruncatedNormal(stddev=0.02),\n",
+    "            trainable=True,\n",
+    "        )\n",
+    "        # One vector per index of axis 2; broadcasts over batch and axis 1.\n",
+    "        self.spatial_position = self.add_weight(\n",
+    "            name=\"spatial_position\",\n",
+    "            shape=(1, 1, input_shape[2], self.units),\n",
+    "            initializer=TruncatedNormal(stddev=0.02),\n",
+    "            trainable=True,\n",
+    "        )\n",
+    "\n",
+    "    def call(self, inputs, training=None):\n",
+    "        \"\"\"Project `inputs`, add both positional tables and apply dropout.\n",
+    "\n",
+    "        Args:\n",
+    "            inputs: Tensor whose axes 1 and 2 match the shape seen in `build()`.\n",
+    "            training: Forwarded to the dropout layer.\n",
+    "\n",
+    "        Returns:\n",
+    "            Tensor shaped like `inputs` with the last axis replaced by `units`.\n",
+    "        \"\"\"\n",
+    "        x = self.projection(inputs)\n",
+    "        x += self.temporal_position\n",
+    "        x += self.spatial_position\n",
+    "\n",
+    "        return self.dropout(x, training=training)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Encoder(Layer):\n",
+    "    \"\"\"Pre-norm Transformer encoder block attending over axes 1 and 2.\n",
+    "\n",
+    "    The block runs layer normalization -> multi-head self-attention -> dropout\n",
+    "    -> residual add, followed by layer normalization -> two Dense layers (GELU\n",
+    "    on the first) -> dropout -> residual add.\n",
+    "\n",
+    "    Args:\n",
+    "        embed_dim: Used both as the per-head `key_dim` of the attention layer\n",
+    "          and as the output width of the feed-forward part.\n",
+    "        mlp_dim: Width of the hidden Dense layer in the feed-forward part.\n",
+    "        num_heads: Number of attention heads.\n",
+    "        dropout_rate: Dropout rate after the attention and after the MLP.\n",
+    "        attention_dropout_rate: Dropout rate used inside the attention layer.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(\n",
+    "        self, embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate, **kwargs\n",
+    "    ):\n",
+    "        super().__init__(**kwargs)\n",
+    "\n",
+    "        def kernel_init():\n",
+    "            # Every sub-layer gets its own initializer instance.\n",
+    "            return TruncatedNormal(stddev=0.02)\n",
+    "\n",
+    "        # Self-attention spanning both the timestep and the patch axis.\n",
+    "        self.mha = MultiHeadAttention(\n",
+    "            num_heads=num_heads,\n",
+    "            key_dim=embed_dim,\n",
+    "            dropout=attention_dropout_rate,\n",
+    "            kernel_initializer=kernel_init(),\n",
+    "            attention_axes=(1, 2),\n",
+    "        )\n",
+    "\n",
+    "        # Position-wise feed-forward network: expand with GELU, project back.\n",
+    "        self.dense_0 = Dense(\n",
+    "            units=mlp_dim, activation=\"gelu\", kernel_initializer=kernel_init()\n",
+    "        )\n",
+    "        self.dense_1 = Dense(units=embed_dim, kernel_initializer=kernel_init())\n",
+    "\n",
+    "        self.dropout_0 = Dropout(rate=dropout_rate)\n",
+    "        self.dropout_1 = Dropout(rate=dropout_rate)\n",
+    "\n",
+    "        self.norm_0 = LayerNormalization(epsilon=1e-12)\n",
+    "        self.norm_1 = LayerNormalization(epsilon=1e-12)\n",
+    "\n",
+    "        self.add_0 = Add()\n",
+    "        self.add_1 = Add()\n",
+    "\n",
+    "    def call(self, inputs, training):\n",
+    "        \"\"\"Run the attention and feed-forward sub-blocks on `inputs`.\"\"\"\n",
+    "        # Self-attention sub-block with residual connection.\n",
+    "        normed = self.norm_0(inputs)\n",
+    "        attended = self.mha(\n",
+    "            query=normed, key=normed, value=normed, training=training\n",
+    "        )\n",
+    "        attended = self.dropout_0(attended, training=training)\n",
+    "        residual = self.add_0([attended, inputs])\n",
+    "\n",
+    "        # Feed-forward sub-block with residual connection.\n",
+    "        hidden = self.dense_0(self.norm_1(residual))\n",
+    "        hidden = self.dense_1(hidden)\n",
+    "        hidden = self.dropout_1(hidden, training=training)\n",
+    "\n",
+    "        return self.add_1([residual, hidden])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Decoder(Layer):\n",
+    "    \"\"\"Pre-norm Transformer decoder block attending over axes 1 and 2.\n",
+    "\n",
+    "    Three residual sub-blocks are applied in order: self-attention on the\n",
+    "    decoder input, attention whose keys and values are the encoder output, and\n",
+    "    a two-layer feed-forward network (GELU on the first layer). No attention\n",
+    "    mask is passed to either attention layer.\n",
+    "\n",
+    "    Args:\n",
+    "        embed_dim: Used both as the per-head `key_dim` of the attention layers\n",
+    "          and as the output width of the feed-forward part.\n",
+    "        mlp_dim: Width of the hidden Dense layer in the feed-forward part.\n",
+    "        num_heads: Number of attention heads.\n",
+    "        dropout_rate: Dropout rate after each of the three sub-blocks.\n",
+    "        attention_dropout_rate: Dropout rate used inside the attention layers.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(\n",
+    "        self, embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate, **kwargs\n",
+    "    ):\n",
+    "        super().__init__(**kwargs)\n",
+    "\n",
+    "        def kernel_init():\n",
+    "            # Every sub-layer gets its own initializer instance.\n",
+    "            return TruncatedNormal(stddev=0.02)\n",
+    "\n",
+    "        # Self-attention over the decoder input (timestep and patch axes).\n",
+    "        self.mha_0 = MultiHeadAttention(\n",
+    "            num_heads=num_heads,\n",
+    "            key_dim=embed_dim,\n",
+    "            dropout=attention_dropout_rate,\n",
+    "            kernel_initializer=kernel_init(),\n",
+    "            attention_axes=(1, 2),\n",
+    "        )\n",
+    "        # Attention from the decoder stream onto the encoder output.\n",
+    "        self.mha_1 = MultiHeadAttention(\n",
+    "            num_heads=num_heads,\n",
+    "            key_dim=embed_dim,\n",
+    "            dropout=attention_dropout_rate,\n",
+    "            kernel_initializer=kernel_init(),\n",
+    "            attention_axes=(1, 2),\n",
+    "        )\n",
+    "\n",
+    "        # Position-wise feed-forward network: expand with GELU, project back.\n",
+    "        self.dense_0 = Dense(\n",
+    "            units=mlp_dim, activation=\"gelu\", kernel_initializer=kernel_init()\n",
+    "        )\n",
+    "        self.dense_1 = Dense(units=embed_dim, kernel_initializer=kernel_init())\n",
+    "\n",
+    "        self.dropout_0 = Dropout(rate=dropout_rate)\n",
+    "        self.dropout_1 = Dropout(rate=dropout_rate)\n",
+    "        self.dropout_2 = Dropout(rate=dropout_rate)\n",
+    "\n",
+    "        self.norm_0 = LayerNormalization(epsilon=1e-12)\n",
+    "        self.norm_1 = LayerNormalization(epsilon=1e-12)\n",
+    "        self.norm_2 = LayerNormalization(epsilon=1e-12)\n",
+    "\n",
+    "        self.add_0 = Add()\n",
+    "        self.add_1 = Add()\n",
+    "        self.add_2 = Add()\n",
+    "\n",
+    "    def call(self, inputs, enc_output, training):\n",
+    "        \"\"\"Run the three sub-blocks on `inputs`, attending to `enc_output`.\"\"\"\n",
+    "        # Self-attention sub-block with residual connection.\n",
+    "        normed = self.norm_0(inputs)\n",
+    "        self_attended = self.mha_0(\n",
+    "            query=normed, key=normed, value=normed, training=training\n",
+    "        )\n",
+    "        self_attended = self.dropout_0(self_attended, training=training)\n",
+    "        after_self = self.add_0([self_attended, inputs])\n",
+    "\n",
+    "        # Encoder-decoder attention sub-block with residual connection.\n",
+    "        cross_query = self.norm_1(after_self)\n",
+    "        cross_attended = self.mha_1(\n",
+    "            query=cross_query, key=enc_output, value=enc_output, training=training\n",
+    "        )\n",
+    "        cross_attended = self.dropout_1(cross_attended, training=training)\n",
+    "        after_cross = self.add_1([after_self, cross_attended])\n",
+    "\n",
+    "        # Feed-forward sub-block with residual connection.\n",
+    "        hidden = self.dense_0(self.norm_2(after_cross))\n",
+    "        hidden = self.dense_1(hidden)\n",
+    "        hidden = self.dropout_2(hidden, training=training)\n",
+    "\n",
+    "        return self.add_2([after_cross, hidden])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Transformer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class DailyTransformer(Model):\n",
+    "    \"\"\"Encoder-decoder Transformer fed with an `[inputs, targets]` pair.\n",
+    "\n",
+    "    `inputs` goes through the encoder stack and `targets` through the decoder\n",
+    "    stack, which also receives the encoder output. `train_step` and `test_step`\n",
+    "    apply teacher forcing: the decoder is fed `targets[:, :-1]` and scored\n",
+    "    against the last feature of `targets[:, 1:]` (the sequence shifted one step).\n",
+    "\n",
+    "    Args:\n",
+    "        num_encoder_layers: Number of stacked `Encoder` layers.\n",
+    "        num_decoder_layers: Number of stacked `Decoder` layers.\n",
+    "        embed_dim: Passed to the positional embeddings and to every layer.\n",
+    "        mlp_dim: Passed to every `Encoder`/`Decoder` layer.\n",
+    "        num_heads: Passed to every `Encoder`/`Decoder` layer.\n",
+    "        num_outputs: Units of the final `Dense` layer.\n",
+    "        dropout_rate: Passed to the positional embeddings and to every layer.\n",
+    "        attention_dropout_rate: Passed to every `Encoder`/`Decoder` layer.\n",
+    "        **kwargs: Forwarded to `Model.__init__`.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        num_encoder_layers,\n",
+    "        num_decoder_layers,\n",
+    "        embed_dim,\n",
+    "        mlp_dim,\n",
+    "        num_heads,\n",
+    "        num_outputs,\n",
+    "        dropout_rate,\n",
+    "        attention_dropout_rate,\n",
+    "        **kwargs\n",
+    "    ):\n",
+    "        super(DailyTransformer, self).__init__(**kwargs)\n",
+    "\n",
+    "        # Input (normalization of RAW measurements).\n",
+    "        # `input_norm_dec2` is created with invert=True and is applied to the model output.\n",
+    "        self.input_norm_enc = Normalization(invert=False)\n",
+    "        self.input_norm_dec1 = Normalization(invert=False)\n",
+    "        self.input_norm_dec2 = Normalization(invert=True)\n",
+    "\n",
+    "        # Input embeddings: index 0 is used for the encoder, index 1 for the decoder\n",
+    "        self.pos_embs_0 = PositionalEmbedding(embed_dim, dropout_rate)\n",
+    "        self.pos_embs_1 = PositionalEmbedding(embed_dim, dropout_rate)\n",
+    "\n",
+    "        # Encoder\n",
+    "        self.enc_layers = [\n",
+    "            Encoder(embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate)\n",
+    "            for _ in range(num_encoder_layers)\n",
+    "        ]\n",
+    "        self.norm_0 = LayerNormalization(epsilon=1e-12)\n",
+    "\n",
+    "        # Decoder\n",
+    "        self.dec_layers = [\n",
+    "            Decoder(embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate)\n",
+    "            for _ in range(num_decoder_layers)\n",
+    "        ]\n",
+    "        self.norm_1 = LayerNormalization(epsilon=1e-12)\n",
+    "\n",
+    "        # Output\n",
+    "        self.final_layer = Dense(\n",
+    "            units=num_outputs,\n",
+    "            kernel_initializer=TruncatedNormal(stddev=0.02),\n",
+    "        )\n",
+    "\n",
+    "    def call(self, inputs, training):\n",
+    "        \"\"\"Run the encoder on `inputs[0]` and the decoder on `inputs[1]`.\n",
+    "\n",
+    "        Args:\n",
+    "            inputs: Pair `[encoder_inputs, decoder_inputs]` of raw measurements.\n",
+    "            training: Forwarded to the embedding, encoder and decoder layers.\n",
+    "\n",
+    "        Returns:\n",
+    "            Output of the final `Dense` layer after `input_norm_dec2`.\n",
+    "        \"\"\"\n",
+    "        inputs, targets = inputs\n",
+    "\n",
+    "        # Encoder input\n",
+    "        x_e = self.input_norm_enc(inputs)\n",
+    "        x_e = self.pos_embs_0(x_e, training=training)\n",
+    "\n",
+    "        # Encoder\n",
+    "        for layer in self.enc_layers:\n",
+    "            x_e = layer(x_e, training=training)\n",
+    "        x_e = self.norm_0(x_e)\n",
+    "\n",
+    "        # Decoder input\n",
+    "        x_d = self.input_norm_dec1(targets)\n",
+    "        x_d = self.pos_embs_1(x_d, training=training)\n",
+    "\n",
+    "        # Decoder (every layer receives the final encoder output)\n",
+    "        for layer in self.dec_layers:\n",
+    "            x_d = layer(x_d, x_e, training=training)\n",
+    "        x_d = self.norm_1(x_d)\n",
+    "\n",
+    "        # Output\n",
+    "        final_output = self.final_layer(x_d)\n",
+    "        final_output = self.input_norm_dec2(final_output)\n",
+    "\n",
+    "        return final_output\n",
+    "\n",
+    "    def train_step(self, inputs):\n",
+    "        \"\"\"One optimization step on a batch `(inputs, targets)`.\n",
+    "\n",
+    "        The last timestep of `inputs` and `targets` is dropped for the forward\n",
+    "        pass; the ground truth is the last feature of `targets` from timestep 1 on.\n",
+    "\n",
+    "        Returns:\n",
+    "            Dict mapping metric names to their current values.\n",
+    "        \"\"\"\n",
+    "        inputs, targets = inputs\n",
+    "        inputs = inputs[:, :-1]\n",
+    "        targets_inputs = targets[:, :-1]\n",
+    "        targets_real = targets[:, 1:, :, -1:]\n",
+    "\n",
+    "        with tf.GradientTape() as tape:\n",
+    "            y_pred = self([inputs, targets_inputs], training=True)\n",
+    "            loss = self.compiled_loss(targets_real, y_pred, regularization_losses=self.losses)\n",
+    "\n",
+    "        # Compute gradients\n",
+    "        trainable_vars = self.trainable_variables\n",
+    "        gradients = tape.gradient(loss, trainable_vars)\n",
+    "\n",
+    "        # Update weights\n",
+    "        self.optimizer.apply_gradients(zip(gradients, trainable_vars))\n",
+    "\n",
+    "        # Update metrics (includes the metric that tracks the loss).\n",
+    "        # The loss covers every timestep; the metrics only see the last one.\n",
+    "        self.compiled_metrics.update_state(targets_real[:, -1], y_pred[:, -1])\n",
+    "\n",
+    "        # Return a dict mapping metric names to current value\n",
+    "        return {m.name: m.result() for m in self.metrics}\n",
+    "\n",
+    "    def test_step(self, inputs):\n",
+    "        \"\"\"Evaluate a batch `(inputs, targets)` with the same slicing as `train_step`.\n",
+    "\n",
+    "        Returns:\n",
+    "            Dict mapping metric names to their current values.\n",
+    "        \"\"\"\n",
+    "        inputs, targets = inputs\n",
+    "        inputs = inputs[:, :-1]\n",
+    "        targets_inputs = targets[:, :-1]\n",
+    "        targets_real = targets[:, 1:, :, -1:]\n",
+    "\n",
+    "        # Compute predictions\n",
+    "        y_pred = self([inputs, targets_inputs], training=False)\n",
+    "\n",
+    "        # Updates the metrics tracking the loss\n",
+    "        self.compiled_loss(targets_real, y_pred, regularization_losses=self.losses)\n",
+    "\n",
+    "        # Update the metrics (last timestep only, as in train_step)\n",
+    "        self.compiled_metrics.update_state(targets_real[:, -1], y_pred[:, -1])\n",
+    "\n",
+    "        # Return a dict mapping metric names to current value\n",
+    "        # Note that it will include the loss (tracked in self.metrics)\n",
+    "        return {m.name: m.result() for m in self.metrics}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Simulator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Simulator(tf.Module):\n",
+    "  def __init__(self, transformer):\n",
+    "    self.transformer = transformer\n",
+    "    self.pi = tf.constant(np.pi)\n",
+    "\n",
+    "  def __call__(self, inputs, horizon_length):\n",
+    "    inputs, targets = inputs\n",
+    "    output_array = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)\n",
+    "\n",
+    "    for i in tf.range(horizon_length):\n",
+    "      tar = targets[:, i:]\n",
+    "      #print(\"target_old:\", tar[0])\n",
+    "      \n",
+    "      # Concatenate history with the predicted future\n",
+    "      if i > 0:\n",
+    "        output = tf.transpose(output_array.stack(), perm=[1, 0, 2, 3])\n",
+    "        if i > tf.shape(inputs)[1]:\n",
+    "          tar = tf.concat([tar, output[:, (i - tf.shape(inputs)[1]):]], axis=1)\n",
+    "        else:\n",
+    "          tar = tf.concat([tar, output], axis=1)\n",
+    "        #print(\"target_new[\", i, \"]:\", tar[0])\n",
+    "\n",
+    "      #print(\"day sin/cos_OLD:\", tar[0, -1, 0, :-1])\n",
+    "\n",
+    "      day = (tf.atan2(tar[:, -1, :, 0], tar[:, -1, :, 1]) * 183.0) / self.pi\n",
+    "      day = tf.round(tf.where(day > 0, day, day + 366))\n",
+    "      \n",
+    "      day_sin = tf.expand_dims(tf.sin(2.0 * self.pi * (day + 1) / 366.0), axis=-1)\n",
+    "      day_cos = tf.expand_dims(tf.cos(2.0 * self.pi * (day + 1) / 366.0), axis=-1)\n",
+    "\n",
+    "      #print(\"day: \", day)\n",
+    "      #print(\"day sin/cos_NEW:\", day_sin[0], day_cos[0])\n",
+    "\n",
+    "      predictions = self.transformer([inputs, tar], training=False)\n",
+    "      #print(\"predictions: \", predictions[0])\n",
+    "\n",
+    "      if i == 0:\n",
+    "        zero_predictions = predictions[:, :-1]\n",
+    "\n",
+    "      # concatentate the prediction to the output which is given to the decoder as its input\n",
+    "      output_array = output_array.write(i, tf.concat([day_sin, day_cos, predictions[:, -1]], axis=-1))\n",
+    "\n",
+    "    output = tf.transpose(output_array.stack(), perm=[1, 0, 2, 3])\n",
+    "    #print(output.shape)\n",
+    "\n",
+    "    return tf.concat([zero_predictions, output[:, :, :, -1:]], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_X = pd.read_csv(\"./dataset/1984_2022/X_all_daily.csv\")\n",
+    "df_y_daily = pd.read_csv(\"./dataset/1984_2022/y_all_daily.csv\")\n",
+    "\n",
+    "# Number of distinct locations (\"patches\"); must be read before `Name` is dropped.\n",
+    "num_of_patches = df_X['Name'].nunique()\n",
+    "\n",
+    "# Identifier columns and every wind feature are removed from both frames.\n",
+    "# The 'WindSpeed' prefix already covers the 'WindSpeedMin*' / 'WindSpeedMax*' columns.\n",
+    "ID_COLUMNS = ['DateTime', 'Name', 'Latitude', 'Longitude']\n",
+    "WIND_PREFIXES = ('WindSpeed', 'WindDirection')\n",
+    "\n",
+    "df_X = df_X.drop(\n",
+    "    columns=ID_COLUMNS + [c for c in df_X.columns if c.startswith(WIND_PREFIXES)]\n",
+    ")\n",
+    "df_y_daily = df_y_daily.drop(\n",
+    "    columns=ID_COLUMNS + [c for c in df_y_daily.columns if c.startswith(WIND_PREFIXES)]\n",
+    ")\n",
+    "\n",
+    "# Display names indexed by patch number when reporting per-location results.\n",
+    "# NOTE(review): assumes this order matches the patch order inside the CSV files - confirm.\n",
+    "loc_names = [\n",
+    "    \"54 MW PV SOLAR POWER PLANT\",\n",
+    "    \"5MW Solar Power Plant Varroc\",\n",
+    "    \"Adani Green Energy Tamilnadu Limited\",\n",
+    "    \"Arete Elena Energy Pvt Ltd\",\n",
+    "    \"Bitta Solar Power Plant\",\n",
+    "    \"Charanka Solar Park\",\n",
+    "    \"Chennai Metropolitan Area\",\n",
+    "    \"Ctrls Data Center Mumbai\",\n",
+    "    \"Indira Paryavaran Bhawan\",\n",
+    "    \"Kurnool Ultra Mega Solar Park\",\n",
+    "    \"Pavagada Solar Park\",\n",
+    "    \"Rewa Ultra Mega Solar\",\n",
+    "    \"Solar Power Plant Chandasar\",\n",
+    "    \"Solar Power Plant Khera Silajit\",\n",
+    "    \"Solar power plant Koppal\",\n",
+    "    \"Target 1\",\n",
+    "    \"Target 2\",\n",
+    "    \"Welspun Solar MP project\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df_X.head())\n",
+    "print(df_y_daily.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_dataset(data, sequence_length, sequence_stride, sampling_rate):\n",
+    "    \"\"\"Split `data` chronologically (80% / 10% / 10%) and cut each part into windows.\n",
+    "\n",
+    "    `data` is converted to float32 and reshaped to\n",
+    "    (-1, num_of_patches, features) using the global `num_of_patches`.\n",
+    "\n",
+    "    Args:\n",
+    "        data: Array-like whose last axis holds the features.\n",
+    "        sequence_length: Number of elements per window.\n",
+    "        sequence_stride: Shift between the starts of consecutive windows.\n",
+    "        sampling_rate: Stride between elements inside one window.\n",
+    "\n",
+    "    Returns:\n",
+    "        `((n_train, train_windows), (n_val, val_windows), test_windows)` where the\n",
+    "        counts are the number of rows in each split before windowing.\n",
+    "    \"\"\"\n",
+    "    def to_windows(split):\n",
+    "        # window() yields nested datasets; batching each one turns it into a single tensor\n",
+    "        windows = tf.data.Dataset.from_tensor_slices(split).window(\n",
+    "            sequence_length, shift=sequence_stride, stride=sampling_rate, drop_remainder=True\n",
+    "        )\n",
+    "        return windows.flat_map(\n",
+    "            lambda window: window.batch(sequence_length, drop_remainder=True)\n",
+    "        )\n",
+    "\n",
+    "    values = np.array(data, dtype=np.float32)\n",
+    "    samples = values.reshape((-1, num_of_patches, values.shape[-1]))\n",
+    "\n",
+    "    # Chronological split points at 80% and 90% of the rows\n",
+    "    total = samples.shape[0]\n",
+    "    train_end = int(total*0.8)\n",
+    "    val_end = int(total*0.9)\n",
+    "    train_split, val_split, test_split = np.split(samples, [train_end, val_end])\n",
+    "\n",
+    "    train_part = (train_end, to_windows(train_split))\n",
+    "    val_part = (val_end - train_end, to_windows(val_split))\n",
+    "    return train_part, val_part, to_windows(test_split)\n",
+    "\n",
+    "def merge_dataset(datasets, batch_size, shuffle):\n",
+    "    \"\"\"Zip several datasets element-wise and batch the result.\n",
+    "\n",
+    "    Args:\n",
+    "        datasets: Nested structure of `tf.data.Dataset` objects to zip.\n",
+    "        batch_size: Number of zipped elements per batch.\n",
+    "        shuffle: If true, shuffle elements (buffer of 1000) before batching.\n",
+    "\n",
+    "    Returns:\n",
+    "        The zipped, optionally shuffled, batched and prefetched dataset.\n",
+    "    \"\"\"\n",
+    "    dataset = tf.data.Dataset.zip(datasets)\n",
+    "\n",
+    "    if shuffle:\n",
+    "        # Shuffle locally at each iteration\n",
+    "        dataset = dataset.shuffle(buffer_size=1000)\n",
+    "    dataset = dataset.batch(batch_size)\n",
+    "\n",
+    "    # Prefetch as the last step so complete batches are prepared while the\n",
+    "    # previous one is being consumed (element order and content are unchanged).\n",
+    "    return dataset.prefetch(tf.data.AUTOTUNE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Simulation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "horizon = 7\n",
+    "window_size = 7\n",
+    "batch_size = 32\n",
+    "\n",
+    "_, _, test_X_ds = make_dataset(df_X, (window_size + horizon), 1, 1)\n",
+    "_, _, test_y_daily_ds = make_dataset(df_y_daily, (window_size + horizon), 1, 1)\n",
+    "\n",
+    "test_ds = merge_dataset(\n",
+    "    (test_X_ds, test_y_daily_ds),\n",
+    "    batch_size,\n",
+    "    shuffle=False,\n",
+    ")\n",
+    "\n",
+    "daily_model = DailyTransformer(\n",
+    "    attention_dropout_rate=0.25,\n",
+    "    dropout_rate=0.15,\n",
+    "    embed_dim=64,\n",
+    "    mlp_dim=256,\n",
+    "    num_decoder_layers=6,\n",
+    "    num_encoder_layers=3,\n",
+    "    num_heads=6,\n",
+    "    num_outputs=1,\n",
+    ")\n",
+    "daily_model.build([(None, window_size, num_of_patches, 302), (None, window_size, num_of_patches, 3)])\n",
+    "daily_model.load_weights(\"./models/model-best.h5\")\n",
+    "simulator = Simulator(daily_model)\n",
+    "\n",
+    "print(daily_model.input_norm_enc.variables)\n",
+    "print(daily_model.input_norm_dec1.variables)\n",
+    "print(daily_model.input_norm_dec2.variables)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "patch_similarity_plot(daily_model.pos_embs_0.spatial_position[0, 0])\n",
+    "patch_similarity_plot(daily_model.pos_embs_1.spatial_position[0, 0])\n",
+    "\n",
+    "timestep_similarity_plot(daily_model.pos_embs_0.temporal_position[0, :, 0])\n",
+    "timestep_similarity_plot(daily_model.pos_embs_1.temporal_position[0, :, 0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "metrics = [MeanSquaredError(), RootMeanSquaredError(), MeanAbsoluteError(), MeanAbsolutePercentageError(), RSquare()]\n",
+    "\n",
+    "# Location 1 = 15 (64.67 % na 4 dni), (80.6 % na 1 den)\n",
+    "# Location 2 = 16 (69.8 % na 4 dni), (83.67 % na 1 den)\n",
+    "\n",
+    "# Chennai = 6 (69.8 % na 4 dni), (83.67 % na 1 den)\n",
+    "# Mumbai = 7 (69.8 % na 4 dni), (83.67 % na 1 den)\n",
+    "\n",
+    "for loc in range(num_of_patches):\n",
+    "    print(\"Location: \", loc_names[loc])\n",
+    "    print(\"-----------------------------------------------------\")\n",
+    "    for inputs in test_ds:\n",
+    "        inputs, targets = inputs\n",
+    "        inputs = inputs[:, :-horizon]\n",
+    "        targets_inputs = targets[:, :-horizon]\n",
+    "        targets_real = targets[:, 1:, loc, -1:]\n",
+    "\n",
+    "        #y_pred = daily_model([inputs, targets_inputs], training=False)\n",
+    "        y_pred = simulator([inputs, targets_inputs], horizon_length=horizon)\n",
+    "\n",
+    "        # Update the metrics\n",
+    "        for m in metrics:\n",
+    "            m.update_state(targets_real, y_pred[:, :, loc, -1:])\n",
+    "\n",
+    "    # visualize the last results\n",
+    "    plot_prediction(targets, y_pred)\n",
+    "\n",
+    "    print({m.name: m.result() for m in metrics}, \"\\n\")\n",
+    "    for m in metrics:\n",
+    "        m.reset_states()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.10 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.10"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "9185113d2128201d66faecd4f34fb34e89a635073a034991399523e584519355"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

Training.ipynb ADDED Viewed

	@@ -0,0 +1,1433 @@

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "AMmP-w4YRzBU"
+      },
+      "source": [
+        "# Solar Transformer"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "7Z0GTvvpRzBx",
+        "outputId": "33adb728-76ee-48e4-b388-89717bca8482"
+      },
+      "outputs": [],
+      "source": [
+        "# %pip installs into the environment of the running kernel (unlike a plain shell `pip`).\n",
+        "%pip install wandb tensorflow_probability tensorflow_addons"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "EMUn36ELRzCJ"
+      },
+      "outputs": [],
+      "source": [
+        "from tensorflow.keras.layers import Add, Dense, Dropout, Layer, LayerNormalization, MultiHeadAttention\n",
+        "from tensorflow.keras.models import Model\n",
+        "from tensorflow.keras.initializers import TruncatedNormal\n",
+        "from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, Callback\n",
+        "from tensorflow.keras.optimizers import Adam\n",
+        "from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError, MeanAbsoluteError\n",
+        "from tensorflow_addons.metrics import RSquare\n",
+        "from wandb.keras import WandbCallback\n",
+        "\n",
+        "import math\n",
+        "import wandb\n",
+        "import pandas as pd\n",
+        "import tensorflow as tf\n",
+        "import tensorflow_probability as tfp\n",
+        "import tensorflow_addons as tfa\n",
+        "import numpy as np\n",
+        "import matplotlib.pyplot as plt\n",
+        "import seaborn as sns"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "VWPvxWIdKfRM",
+        "outputId": "3e04ac70-9036-4b16-daca-28b2ef0707cd"
+      },
+      "outputs": [],
+      "source": [
+        "from google.colab import drive\n",
+        "drive.mount('/content/drive')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "XgDBs9_3l4uD"
+      },
+      "source": [
+        "## Plotting"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "TDdF5YM4l4Au"
+      },
+      "outputs": [],
+      "source": [
+        "def plot_4d(matrix):\n",
+        "    \"\"\"Draw a 4-D array as one 3-D scatter plot per index of its last axis.\n",
+        "\n",
+        "    The first three axes span the x/y/z grid of every subplot (labelled\n",
+        "    timestep / patch / timestep) and the values are shown as colors.\n",
+        "\n",
+        "    Args:\n",
+        "        matrix: 4-D array-like supporting `.shape` and NumPy-style indexing.\n",
+        "    \"\"\"\n",
+        "    fig = plt.figure(figsize=(10, 20), dpi=300)\n",
+        "    # Figure-level title: plt.title() here would create an extra, empty axes\n",
+        "    # underneath the subplots.\n",
+        "    fig.suptitle(\"Attention heatmap\")\n",
+        "\n",
+        "    # create grid; indexing=\"ij\" keeps the axis order of `matrix`\n",
+        "    x = np.arange(matrix.shape[0], dtype=np.int32)  # timesteps\n",
+        "    y = np.arange(matrix.shape[1], dtype=np.int32)  # patches\n",
+        "    z = np.arange(matrix.shape[2], dtype=np.int32)  # timesteps\n",
+        "    X, Y, Z = np.meshgrid(x, y, z, indexing=\"ij\")\n",
+        "\n",
+        "    # Keep the 5x5 layout and add rows only when more than 25 subplots are needed\n",
+        "    num_plots = matrix.shape[3]\n",
+        "    num_cols = 5\n",
+        "    num_rows = max(5, math.ceil(num_plots / num_cols))\n",
+        "\n",
+        "    for I in range(num_plots):\n",
+        "        # Plot\n",
+        "        ax = fig.add_subplot(num_rows, num_cols, I + 1, projection=\"3d\")\n",
+        "        ax.scatter3D(X, Y, Z, c=matrix[:, :, :, I], marker='s', s=99, cmap='jet')\n",
+        "        ax.set_title(f\"{I}-th patch\")\n",
+        "        ax.set_xlabel(\"Timestep\")\n",
+        "        ax.set_ylabel(\"Patch\")\n",
+        "        ax.set_zlabel(\"Timestep\")\n",
+        "\n",
+        "    plt.show()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "hdxToaPDOXPg"
+      },
+      "outputs": [],
+      "source": [
+        "def _cosine_similarity(pos):\n",
+        "  \"\"\"Return the pairwise cosine similarity between the rows of a 2-D array.\"\"\"\n",
+        "  pos = np.asarray(pos)\n",
+        "  norms = np.linalg.norm(pos, axis=-1, keepdims=True)\n",
+        "  # Entry (i, j) must be divided by |pos_i| * |pos_j|. A 1-D vector of norms would\n",
+        "  # broadcast over the columns only and divide by |pos_j|^2 instead.\n",
+        "  return np.dot(pos, np.transpose(pos)) / (norms * np.transpose(norms))\n",
+        "\n",
+        "def _similarity_heatmap(pos, title, axis_label):\n",
+        "  \"\"\"Show the cosine-similarity matrix of `pos` as a heatmap centered at 0.\"\"\"\n",
+        "  similarity_scores = _cosine_similarity(pos)\n",
+        "\n",
+        "  plt.figure(figsize=(7, 7), dpi=300)\n",
+        "  ax = sns.heatmap(similarity_scores, center=0)\n",
+        "  ax.set_title(title)\n",
+        "  ax.set_xlabel(axis_label)\n",
+        "  ax.set_ylabel(axis_label)\n",
+        "  plt.show()\n",
+        "\n",
+        "def patch_similarity_plot(pos):\n",
+        "  \"\"\"Plot the similarity between spatial position embeddings.\n",
+        "\n",
+        "  Args:\n",
+        "    pos: 2-D array-like with one embedding vector per patch (rows).\n",
+        "  \"\"\"\n",
+        "  _similarity_heatmap(pos, \"Spatial Positional Embedding\", \"Patch\")\n",
+        "\n",
+        "def timestep_similarity_plot(pos):\n",
+        "  \"\"\"Plot the similarity between temporal position embeddings.\n",
+        "\n",
+        "  Args:\n",
+        "    pos: 2-D array-like with one embedding vector per timestep (rows).\n",
+        "  \"\"\"\n",
+        "  _similarity_heatmap(pos, \"Temporal Positional Embedding\", \"Timestep\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Ky-VvsYT2aSz"
+      },
+      "outputs": [],
+      "source": [
+        "def plot_prediction(inputs, model, max_subplots=3):\n",
+        "  \"\"\"Plot inputs, targets and model predictions for the first samples of a batch.\n",
+        "\n",
+        "  Only patch 0 and the last feature of the targets/predictions are drawn.\n",
+        "\n",
+        "  Args:\n",
+        "    inputs: Pair `(inputs, targets)`; the last timestep of `inputs` is dropped\n",
+        "      before the forward pass.\n",
+        "    model: Callable invoked as `model([inputs, targets[:, :-1]], training=False)`.\n",
+        "    max_subplots: Maximum number of batch samples to draw, one subplot each.\n",
+        "  \"\"\"\n",
+        "  inputs, targets = inputs\n",
+        "  inputs = inputs[:, :-1]\n",
+        "\n",
+        "  # The forward pass does not depend on the subplot index: run it once per batch\n",
+        "  # instead of once per subplot.\n",
+        "  predictions = model([inputs, targets[:, :-1]], training=False)\n",
+        "\n",
+        "  plt.figure(figsize=(12, 15))\n",
+        "  max_n = min(max_subplots, len(targets))\n",
+        "  for n in range(max_n):\n",
+        "    # input\n",
+        "    plt.subplot(max_n, 1, n+1)\n",
+        "    plt.ylabel('Solar irradiance [kW-hr/m^2/day]')\n",
+        "    plt.plot(np.arange(targets.shape[1]-1), targets[n, :-1, 0, -1], label='Inputs', marker='.', zorder=-10)\n",
+        "\n",
+        "    # real (shifted one step ahead of the inputs)\n",
+        "    plt.scatter(np.arange(1, targets.shape[1]), targets[n, 1:, 0, -1], edgecolors='k', label='Targets', c='#2cb01d', s=64)\n",
+        "\n",
+        "    # predicted\n",
+        "    plt.scatter(np.arange(1, targets.shape[1]), predictions[n, :, 0, -1], marker='X', edgecolors='k', label='Predictions', c='#fe7e0f', s=64)\n",
+        "\n",
+        "    if n == 0:\n",
+        "      plt.legend()\n",
+        "\n",
+        "  plt.xlabel('Time [day]')\n",
+        "  plt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "I4i5bIzkRzC2"
+      },
+      "source": [
+        "## Init logger"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 106
+        },
+        "id": "ABI9_YirRzDA",
+        "outputId": "1c9c6c2d-a8dd-4375-fe49-62e59da6969a"
+      },
+      "outputs": [],
+      "source": [
+        "wandb.login()\n",
+        "\n",
+        "sweep_config = {\n",
+        "  'method': 'grid',\n",
+        "  'metric': {\n",
+        "    'goal': 'minimize',\n",
+        "    'name': 'val_mean_squared_error'\n",
+        "  },\n",
+        "  'parameters': {\n",
+        "      'epochs': {\n",
+        "        'value': 1000\n",
+        "      },\n",
+        "      'num_encoder_layers': {\n",
+        "        'value': 3\n",
+        "      },\n",
+        "      'num_decoder_layers': {\n",
+        "        'value': 6\n",
+        "      },\n",
+        "      'embed_layer_size': {\n",
+        "        'value': 64\n",
+        "      },\n",
+        "      'fc_layer_size': {\n",
+        "        'value': 256\n",
+        "      },\n",
+        "      'num_heads': {\n",
+        "        'value': 8\n",
+        "      },\n",
+        "      'dropout': {\n",
+        "        'value': 0.15\n",
+        "      },\n",
+        "      'attention_dropout': {\n",
+        "        'value': 0.25\n",
+        "      },\n",
+        "      'optimizer': {\n",
+        "        'value': 'adamw'\n",
+        "      },\n",
+        "      'global_clipnorm': {\n",
+        "        'value': 2.0\n",
+        "      },\n",
+        "      'learning_rate': {\n",
+        "        'value': 0.005\n",
+        "      },\n",
+        "      'weight_decay': {\n",
+        "        'value': 0.00001\n",
+        "      },\n",
+        "      'warmup_steps': {\n",
+        "        'value': 70\n",
+        "      },\n",
+        "      'window_size': {\n",
+        "        'value': 7           # every 7 days\n",
+        "      },\n",
+        "      'batch_size': {\n",
+        "        'value': 32\n",
+        "      },\n",
+        "    }\n",
+        "}\n",
+        "\n",
+        "sweep_id = wandb.sweep(sweep_config, project=\"solar-transformer\")\n",
+        "\n",
+        "# `!export` would only change a throw-away subshell; `%env` sets the variable in the\n",
+        "# kernel process, where code started from this notebook can read it.\n",
+        "%env WANDB_AGENT_MAX_INITIAL_FAILURES=1024"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "oQyRcTjTRzEE"
+      },
+      "source": [
+        "## Layer"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "pbmmbGbfRzEO"
+      },
+      "outputs": [],
+      "source": [
+        "class PositionalEmbedding(Layer):\n",
+        "    \"\"\"Project the features to `units` and add learned temporal and spatial offsets.\n",
+        "\n",
+        "    Two trainable tables are created in `build`: one vector per index of input\n",
+        "    axis 1 (temporal) and one per index of input axis 2 (spatial). Both are\n",
+        "    broadcast-added to the projected input, followed by dropout.\n",
+        "\n",
+        "    Args:\n",
+        "        units: Output width of the projection and of both position tables.\n",
+        "        dropout_rate: Rate of the dropout applied to the result.\n",
+        "        **kwargs: Forwarded to `Layer.__init__`.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(self, units, dropout_rate, **kwargs):\n",
+        "        super(PositionalEmbedding, self).__init__(**kwargs)\n",
+        "\n",
+        "        self.units = units\n",
+        "\n",
+        "        self.projection = Dense(units, kernel_initializer=TruncatedNormal(stddev=0.02))\n",
+        "        self.dropout = Dropout(rate=dropout_rate)\n",
+        "\n",
+        "    def build(self, input_shape):\n",
+        "        \"\"\"Create the position tables sized from axes 1 and 2 of `input_shape`.\"\"\"\n",
+        "        super(PositionalEmbedding, self).build(input_shape)\n",
+        "\n",
+        "        # Shape (1, T, 1, units): shared across the batch and across axis 2\n",
+        "        self.temporal_position = self.add_weight(\n",
+        "            name=\"temporal_position\",\n",
+        "            shape=(1, input_shape[1], 1, self.units),\n",
+        "            initializer=TruncatedNormal(stddev=0.02),\n",
+        "            trainable=True,\n",
+        "        )\n",
+        "        # Shape (1, 1, P, units): shared across the batch and across axis 1\n",
+        "        self.spatial_position = self.add_weight(\n",
+        "            name=\"spatial_position\",\n",
+        "            shape=(1, 1, input_shape[2], self.units),\n",
+        "            initializer=TruncatedNormal(stddev=0.02),\n",
+        "            trainable=True,\n",
+        "        )\n",
+        "\n",
+        "    def call(self, inputs, training):\n",
+        "        \"\"\"Return `dropout(projection(inputs) + temporal_position + spatial_position)`.\"\"\"\n",
+        "        x = self.projection(inputs)\n",
+        "        x += self.temporal_position\n",
+        "        x += self.spatial_position\n",
+        "\n",
+        "        return self.dropout(x, training=training)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "uswJFdtUVk8Q"
+      },
+      "outputs": [],
+      "source": [
+        "class Normalization(tf.keras.layers.experimental.preprocessing.PreprocessingLayer):\n",
+        "    \"\"\"A preprocessing layer which normalizes continuous features.\n",
+        "    This layer will shift and scale inputs into a distribution centered around\n",
+        "    0 with standard deviation 1. It accomplishes this by precomputing the mean\n",
+        "    and variance of the data, and calling `(input - mean) / sqrt(var)` at\n",
+        "    runtime.\n",
+        "    The mean and variance values for the layer must be either supplied on\n",
+        "    construction or learned via `adapt()`. `adapt()` will compute the mean and\n",
+        "    variance of the data and store them as the layer's weights. `adapt()` should\n",
+        "    be called before `fit()`, `evaluate()`, or `predict()`.\n",
+        "    For an overview and full list of preprocessing layers, see the preprocessing\n",
+        "    [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers).\n",
+        "    Args:\n",
+        "        axis: Integer, tuple of integers, or None. The axis or axes that should\n",
+        "          have a separate mean and variance for each index in the shape. For\n",
+        "          example, if shape is `(None, 5)` and `axis=1`, the layer will track 5\n",
+        "          separate mean and variance values for the last axis. If `axis` is set\n",
+        "          to `None`, the layer will normalize all elements in the input by a\n",
+        "          scalar mean and variance. Defaults to -1, where the last axis of the\n",
+        "          input is assumed to be a feature dimension and is normalized per\n",
+        "          index. Note that in the specific case of batched scalar inputs where\n",
+        "          the only axis is the batch axis, the default will normalize each index\n",
+        "          in the batch separately. In this case, consider passing `axis=None`.\n",
+        "        mean: The mean value(s) to use during normalization. The passed value(s)\n",
+        "          will be broadcast to the shape of the kept axes above; if the value(s)\n",
+        "          cannot be broadcast, an error will be raised when this layer's\n",
+        "          `build()` method is called.\n",
+        "        variance: The variance value(s) to use during normalization. The passed\n",
+        "          value(s) will be broadcast to the shape of the kept axes above; if the\n",
+        "          value(s) cannot be broadcast, an error will be raised when this\n",
+        "          layer's `build()` method is called.\n",
+        "        invert: If True, this layer will apply the inverse transformation\n",
+        "          to its inputs: it would turn a normalized input back into its\n",
+        "          original form.\n",
+        "    Examples:\n",
+        "    Calculate a global mean and variance by analyzing the dataset in `adapt()`.\n",
+        "    >>> adapt_data = np.array([1., 2., 3., 4., 5.], dtype='float32')\n",
+        "    >>> input_data = np.array([1., 2., 3.], dtype='float32')\n",
+        "    >>> layer = tf.keras.layers.Normalization(axis=None)\n",
+        "    >>> layer.adapt(adapt_data)\n",
+        "    >>> layer(input_data)\n",
+        "    <tf.Tensor: shape=(3,), dtype=float32, numpy=\n",
+        "    array([-1.4142135, -0.70710677, 0.], dtype=float32)>\n",
+        "    Calculate a mean and variance for each index on the last axis.\n",
+        "    >>> adapt_data = np.array([[0., 7., 4.],\n",
+        "    ...                        [2., 9., 6.],\n",
+        "    ...                        [0., 7., 4.],\n",
+        "    ...                        [2., 9., 6.]], dtype='float32')\n",
+        "    >>> input_data = np.array([[0., 7., 4.]], dtype='float32')\n",
+        "    >>> layer = tf.keras.layers.Normalization(axis=-1)\n",
+        "    >>> layer.adapt(adapt_data)\n",
+        "    >>> layer(input_data)\n",
+        "    <tf.Tensor: shape=(1, 3), dtype=float32, numpy=\n",
+        "    array([-1., -1., -1.], dtype=float32)>\n",
+        "    Pass the mean and variance directly.\n",
+        "    >>> input_data = np.array([[1.], [2.], [3.]], dtype='float32')\n",
+        "    >>> layer = tf.keras.layers.Normalization(mean=3., variance=2.)\n",
+        "    >>> layer(input_data)\n",
+        "    <tf.Tensor: shape=(3, 1), dtype=float32, numpy=\n",
+        "    array([[-1.4142135 ],\n",
+        "           [-0.70710677],\n",
+        "           [ 0.        ]], dtype=float32)>\n",
+        "    Use the layer to de-normalize inputs (after adapting the layer).\n",
+        "    >>> adapt_data = np.array([[0., 7., 4.],\n",
+        "    ...                        [2., 9., 6.],\n",
+        "    ...                        [0., 7., 4.],\n",
+        "    ...                        [2., 9., 6.]], dtype='float32')\n",
+        "    >>> input_data = np.array([[1., 2., 3.]], dtype='float32')\n",
+        "    >>> layer = tf.keras.layers.Normalization(axis=-1, invert=True)\n",
+        "    >>> layer.adapt(adapt_data)\n",
+        "    >>> layer(input_data)\n",
+        "    <tf.Tensor: shape=(1, 3), dtype=float32, numpy=\n",
+        "    array([2., 10., 8.], dtype=float32)>\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(self, axis=-1, mean=None, variance=None, invert=False, **kwargs):\n",
+        "        \"\"\"Stores the layer configuration; the statistics are materialised in `build`.\n",
+        "\n",
+        "        Args:\n",
+        "          axis: `None`, an int, or an iterable of ints. Kept on `self.axis`\n",
+        "              as a tuple (`None` becomes the empty tuple).\n",
+        "          mean: Optional fixed mean. Must be supplied together with `variance`.\n",
+        "          variance: Optional fixed variance. Must be supplied together with\n",
+        "              `mean`.\n",
+        "          invert: Stored on `self.invert`; selects the de-normalizing branch\n",
+        "              of `call`.\n",
+        "          **kwargs: Forwarded to the base layer constructor.\n",
+        "\n",
+        "        Raises:\n",
+        "          ValueError: If `mean` or `variance` is a `tf.Variable`, or if only\n",
+        "              one of the two is provided.\n",
+        "        \"\"\"\n",
+        "        super().__init__(**kwargs)\n",
+        "\n",
+        "        # `axis` is always kept as a tuple, whatever form the caller used.\n",
+        "        if axis is None:\n",
+        "            self.axis = ()\n",
+        "        else:\n",
+        "            self.axis = (axis,) if isinstance(axis, int) else tuple(axis)\n",
+        "\n",
+        "        # Statically supplied statistics may not be Variables.\n",
+        "        if isinstance(mean, tf.Variable):\n",
+        "            raise ValueError(\n",
+        "                \"Normalization does not support passing a Variable \"\n",
+        "                \"for the `mean` init arg.\"\n",
+        "            )\n",
+        "        if isinstance(variance, tf.Variable):\n",
+        "            raise ValueError(\n",
+        "                \"Normalization does not support passing a Variable \"\n",
+        "                \"for the `variance` init arg.\"\n",
+        "            )\n",
+        "\n",
+        "        # Either both statistics are given, or neither.\n",
+        "        mean_given = mean is not None\n",
+        "        variance_given = variance is not None\n",
+        "        if mean_given != variance_given:\n",
+        "            raise ValueError(\n",
+        "                \"When setting values directly, both `mean` and `variance` \"\n",
+        "                \"must be set. Got mean: {} and variance: {}\".format(\n",
+        "                    mean, variance\n",
+        "                )\n",
+        "            )\n",
+        "\n",
+        "        self.input_mean, self.input_variance = mean, variance\n",
+        "        self.invert = invert\n",
+        "\n",
+        "    def build(self, input_shape):\n",
+        "        \"\"\"Validates `input_shape` and creates the mean/variance state.\n",
+        "\n",
+        "        When no static statistics were passed to the constructor, the\n",
+        "        non-trainable weights `mean`, `variance` and `count` are created (in\n",
+        "        that order) and `finalize_state` is called. Otherwise constant\n",
+        "        `self.mean` / `self.variance` tensors are built from the static values.\n",
+        "\n",
+        "        Args:\n",
+        "          input_shape: Shape of the single input of the layer.\n",
+        "\n",
+        "        Raises:\n",
+        "          ValueError: If several shapes are passed, if an entry of `axis` is\n",
+        "              outside `[-ndim, ndim)`, or if a kept axis has unknown size.\n",
+        "        \"\"\"\n",
+        "        super().build(input_shape)\n",
+        "\n",
+        "        got_multiple_shapes = isinstance(input_shape, (list, tuple)) and all(\n",
+        "            isinstance(shape, tf.TensorShape) for shape in input_shape\n",
+        "        )\n",
+        "        if got_multiple_shapes:\n",
+        "            raise ValueError(\n",
+        "                \"Normalization only accepts a single input. If you are \"\n",
+        "                \"passing a python list or tuple as a single input, \"\n",
+        "                \"please convert to a numpy array or `tf.Tensor`.\"\n",
+        "            )\n",
+        "\n",
+        "        input_shape = tf.TensorShape(input_shape).as_list()\n",
+        "        ndim = len(input_shape)\n",
+        "\n",
+        "        if not all(-ndim <= a < ndim for a in self.axis):\n",
+        "            raise ValueError(\n",
+        "                \"All `axis` values must be in the range [-ndim, ndim). \"\n",
+        "                \"Found ndim: `{}`, axis: {}\".format(ndim, self.axis)\n",
+        "            )\n",
+        "\n",
+        "        # Kept axes as non-negative indices, sorted so no transpose is needed.\n",
+        "        self._keep_axis = sorted(d + ndim if d < 0 else d for d in self.axis)\n",
+        "        # Statistics are stored per position of a kept axis, so its size must\n",
+        "        # be known.\n",
+        "        for d in self._keep_axis:\n",
+        "            if input_shape[d] is None:\n",
+        "                raise ValueError(\n",
+        "                    \"All `axis` values to be kept must have known shape. \"\n",
+        "                    \"Got axis: {}, \"\n",
+        "                    \"input shape: {}, with unknown axis at index: {}\".format(\n",
+        "                        self.axis, input_shape, d\n",
+        "                    )\n",
+        "                )\n",
+        "\n",
+        "        all_axes = range(ndim)\n",
+        "        # Every axis that is not kept is reduced.\n",
+        "        self._reduce_axis = [d for d in all_axes if d not in self._keep_axis]\n",
+        "        # 1 marks a reduced axis, 0 a kept one.\n",
+        "        self._reduce_axis_mask = [\n",
+        "            int(d not in self._keep_axis) for d in all_axes\n",
+        "        ]\n",
+        "        # Shape that broadcasts the statistics against the input: size 1 on\n",
+        "        # reduced axes.\n",
+        "        self._broadcast_shape = [\n",
+        "            input_shape[d] if d in self._keep_axis else 1 for d in all_axes\n",
+        "        ]\n",
+        "        mean_and_var_shape = tuple(input_shape[d] for d in self._keep_axis)\n",
+        "\n",
+        "        if self.input_mean is not None:\n",
+        "            # Static statistics: build constant tensors with the broadcast\n",
+        "            # shape for use during `call`.\n",
+        "            ones = np.ones(mean_and_var_shape)\n",
+        "            mean = tf.reshape(self.input_mean * ones, self._broadcast_shape)\n",
+        "            variance = tf.reshape(\n",
+        "                self.input_variance * ones, self._broadcast_shape\n",
+        "            )\n",
+        "            self.mean = tf.cast(mean, self.compute_dtype)\n",
+        "            self.variance = tf.cast(variance, self.compute_dtype)\n",
+        "            return\n",
+        "\n",
+        "        # Adaptable statistics: running state updated by `update_state`.\n",
+        "        self.adapt_mean = self.add_weight(\n",
+        "            name=\"mean\",\n",
+        "            shape=mean_and_var_shape,\n",
+        "            dtype=self.compute_dtype,\n",
+        "            initializer=\"zeros\",\n",
+        "            trainable=False,\n",
+        "        )\n",
+        "        self.adapt_variance = self.add_weight(\n",
+        "            name=\"variance\",\n",
+        "            shape=mean_and_var_shape,\n",
+        "            dtype=self.compute_dtype,\n",
+        "            initializer=\"ones\",\n",
+        "            trainable=False,\n",
+        "        )\n",
+        "        self.count = self.add_weight(\n",
+        "            name=\"count\",\n",
+        "            shape=(),\n",
+        "            dtype=tf.int64,\n",
+        "            initializer=\"zeros\",\n",
+        "            trainable=False,\n",
+        "        )\n",
+        "        self.finalize_state()\n",
+        "\n",
+        "    # We override this method solely to generate a docstring.\n",
+        "    def adapt(self, data, batch_size=None, steps=None):\n",
+        "        \"\"\"Computes the mean and variance of the values in a dataset.\n",
+        "\n",
+        "        `adapt()` is the alternative to passing `mean` and `variance` to the\n",
+        "        constructor; a `Normalization` layer should always use one of the two.\n",
+        "        A separate `mean` and `variance` is computed for each position of\n",
+        "        every axis listed in `axis`; pass `axis=None` to get a single pair of\n",
+        "        statistics for the whole input.\n",
+        "\n",
+        "        The computed statistics are kept static with respect to any compiled\n",
+        "        `tf.Graph` that calls the layer, so models using the layer should be\n",
+        "        re-compiled if it is adapted a second time. See\n",
+        "        `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`.\n",
+        "\n",
+        "        `adapt()` is a single-machine utility. For data that does not fit on\n",
+        "        one machine see [Tensorflow Transform](\n",
+        "        https://www.tensorflow.org/tfx/transform/get_started).\n",
+        "\n",
+        "        Args:\n",
+        "          data: The data to analyze, as a `tf.data.Dataset` or a numpy array.\n",
+        "          batch_size: Integer or `None`. Number of samples per state update;\n",
+        "              defaults to 32 when unspecified. Leave it unset for datasets,\n",
+        "              generators and `keras.utils.Sequence` instances, which already\n",
+        "              produce batches.\n",
+        "          steps: Integer or `None`. Total number of batches to process. For\n",
+        "              input tensors, `None` means the number of samples divided by\n",
+        "              the batch size (or 1 if that cannot be determined). For a\n",
+        "              `tf.data` dataset, `None` runs until the dataset is exhausted,\n",
+        "              so an infinitely repeating dataset requires `steps`. Not\n",
+        "              supported with array inputs.\n",
+        "        \"\"\"\n",
+        "        super().adapt(data, batch_size=batch_size, steps=steps)\n",
+        "\n",
+        "    def update_state(self, data):\n",
+        "        \"\"\"Folds one batch into the running mean, variance and element count.\n",
+        "\n",
+        "        Args:\n",
+        "          data: Batch of values; converted to a tensor and cast to the dtype\n",
+        "              of the running mean.\n",
+        "\n",
+        "        Raises:\n",
+        "          ValueError: If the layer was constructed with static `mean` and\n",
+        "              `variance`.\n",
+        "          RuntimeError: If the layer has not been built yet.\n",
+        "        \"\"\"\n",
+        "        if self.input_mean is not None:\n",
+        "            raise ValueError(\n",
+        "                \"Cannot `adapt` a Normalization layer that is initialized with \"\n",
+        "                \"static `mean` and `variance`, \"\n",
+        "                \"you passed mean {} and variance {}.\".format(\n",
+        "                    self.input_mean, self.input_variance\n",
+        "                )\n",
+        "            )\n",
+        "\n",
+        "        if not self.built:\n",
+        "            raise RuntimeError(\"`build` must be called before `update_state`.\")\n",
+        "\n",
+        "        batch = tf.cast(self._standardize_inputs(data), self.adapt_mean.dtype)\n",
+        "        batch_mean, batch_variance = tf.nn.moments(batch, axes=self._reduce_axis)\n",
+        "\n",
+        "        # Number of elements that were reduced into each statistic.\n",
+        "        if self._reduce_axis:\n",
+        "            dims = tf.shape(batch, out_type=self.count.dtype)\n",
+        "            batch_count = tf.reduce_prod(tf.gather(dims, self._reduce_axis))\n",
+        "        else:\n",
+        "            batch_count = 1\n",
+        "\n",
+        "        new_count = batch_count + self.count\n",
+        "        dtype = self.compute_dtype\n",
+        "        # Share of the combined sample contributed by this batch / by the\n",
+        "        # previously accumulated state.\n",
+        "        w_batch = tf.cast(batch_count, dtype=dtype) / tf.cast(\n",
+        "            new_count, dtype=dtype\n",
+        "        )\n",
+        "        w_prev = 1.0 - w_batch\n",
+        "\n",
+        "        new_mean = self.adapt_mean * w_prev + batch_mean * w_batch\n",
+        "        # The variance is combined using the lack-of-fit sum of squares\n",
+        "        # formula (see\n",
+        "        # https://en.wikipedia.org/wiki/Lack-of-fit_sum_of_squares).\n",
+        "        prev_term = (\n",
+        "            self.adapt_variance + (self.adapt_mean - new_mean) ** 2\n",
+        "        ) * w_prev\n",
+        "        batch_term = (\n",
+        "            batch_variance + (batch_mean - new_mean) ** 2\n",
+        "        ) * w_batch\n",
+        "        new_variance = prev_term + batch_term\n",
+        "\n",
+        "        self.adapt_mean.assign(new_mean)\n",
+        "        self.adapt_variance.assign(new_variance)\n",
+        "        self.count.assign(new_count)\n",
+        "\n",
+        "    def reset_state(self):\n",
+        "        \"\"\"Restores the running statistics to their initial values.\n",
+        "\n",
+        "        Does nothing when static statistics were supplied or the layer is not\n",
+        "        built yet.\n",
+        "        \"\"\"\n",
+        "        has_adaptable_state = self.input_mean is None and self.built\n",
+        "        if not has_adaptable_state:\n",
+        "            return\n",
+        "\n",
+        "        # Same values the weights are created with: mean 0, variance 1, count 0.\n",
+        "        self.adapt_mean.assign(tf.zeros_like(self.adapt_mean))\n",
+        "        self.adapt_variance.assign(tf.ones_like(self.adapt_variance))\n",
+        "        self.count.assign(tf.zeros_like(self.count))\n",
+        "\n",
+        "    def finalize_state(self):\n",
+        "        \"\"\"Publishes the adapted statistics as `self.mean` and `self.variance`.\n",
+        "\n",
+        "        Does nothing when static statistics were supplied or the layer is not\n",
+        "        built yet.\n",
+        "        \"\"\"\n",
+        "        has_adaptable_state = self.input_mean is None and self.built\n",
+        "        if not has_adaptable_state:\n",
+        "            return\n",
+        "\n",
+        "        # Rebuilt on every call: tensors with the broadcast shape and the\n",
+        "        # compute dtype, derived from the running weights.\n",
+        "        target_shape = self._broadcast_shape\n",
+        "        self.mean = tf.cast(\n",
+        "            tf.reshape(self.adapt_mean, target_shape), self.compute_dtype\n",
+        "        )\n",
+        "        self.variance = tf.cast(\n",
+        "            tf.reshape(self.adapt_variance, target_shape), self.compute_dtype\n",
+        "        )\n",
+        "\n",
+        "    def call(self, inputs):\n",
+        "        \"\"\"Normalizes `inputs`, or maps normalized values back when `invert` is set.\n",
+        "\n",
+        "        Forward:  `(inputs - mean) / max(sqrt(variance), epsilon)`\n",
+        "        Inverted: `inputs * max(sqrt(variance), epsilon) + mean`\n",
+        "\n",
+        "        Args:\n",
+        "          inputs: Tensor-like values; integer inputs are accepted and cast to\n",
+        "              `self.compute_dtype`.\n",
+        "\n",
+        "        Returns:\n",
+        "          A tensor of dtype `self.compute_dtype`.\n",
+        "        \"\"\"\n",
+        "        inputs = self._standardize_inputs(inputs)\n",
+        "        # The base layer automatically casts floating-point inputs, but we\n",
+        "        # explicitly cast here to also allow integer inputs to be passed\n",
+        "        inputs = tf.cast(inputs, self.compute_dtype)\n",
+        "        # Floor the standard deviation so a zero variance cannot divide by zero.\n",
+        "        std = tf.maximum(tf.sqrt(self.variance), tf.keras.backend.epsilon())\n",
+        "        if self.invert:\n",
+        "            # Exact inverse of the forward branch: scale first, then shift.\n",
+        "            # The earlier form `(inputs + mean) * std` only matched the inverse\n",
+        "            # when std == 1. NOTE(review): weights trained against the earlier\n",
+        "            # form have compensated for it and need retraining.\n",
+        "            return inputs * std + self.mean\n",
+        "        return (inputs - self.mean) / std\n",
+        "\n",
+        "    def compute_output_shape(self, input_shape):\n",
+        "        \"\"\"Returns `input_shape` unchanged.\"\"\"\n",
+        "        return input_shape\n",
+        "\n",
+        "    def compute_output_signature(self, input_spec):\n",
+        "        \"\"\"Returns `input_spec` unchanged.\"\"\"\n",
+        "        return input_spec\n",
+        "\n",
+        "    def get_config(self):\n",
+        "        \"\"\"Returns the constructor arguments needed to rebuild this layer.\n",
+        "\n",
+        "        Returns:\n",
+        "          The base layer config extended with `axis`, `invert`, `mean` and\n",
+        "          `variance`; the last two are the static values passed to the\n",
+        "          constructor, run through `listify_tensors`.\n",
+        "        \"\"\"\n",
+        "        config = super().get_config()\n",
+        "        config.update(\n",
+        "            {\n",
+        "                \"axis\": self.axis,\n",
+        "                # Without `invert`, a layer restored from its config would\n",
+        "                # fall back to the constructor default (`invert=False`).\n",
+        "                \"invert\": self.invert,\n",
+        "                \"mean\": tf.keras.layers.experimental.preprocessing.preprocessing_utils.utils.listify_tensors(self.input_mean),\n",
+        "                \"variance\": tf.keras.layers.experimental.preprocessing.preprocessing_utils.utils.listify_tensors(self.input_variance),\n",
+        "            }\n",
+        "        )\n",
+        "        return config\n",
+        "\n",
+        "    def _standardize_inputs(self, inputs):\n",
+        "        \"\"\"Converts `inputs` to a tensor of dtype `self.compute_dtype`.\"\"\"\n",
+        "        tensor = tf.convert_to_tensor(inputs)\n",
+        "        if tensor.dtype == self.compute_dtype:\n",
+        "            return tensor\n",
+        "        return tf.cast(tensor, self.compute_dtype)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rvpW2SbnRzEc"
+      },
+      "outputs": [],
+      "source": [
+        "class Encoder(Layer):\n",
+        "    \"\"\"Transformer encoder block: self-attention followed by a two-layer MLP.\n",
+        "\n",
+        "    Each sub-block applies layer normalization first, then dropout on its\n",
+        "    output, then a residual addition.\n",
+        "\n",
+        "    Args:\n",
+        "      embed_dim: Passed as `key_dim` to the attention layer and used as the\n",
+        "          number of units of the second MLP layer.\n",
+        "      mlp_dim: Number of units of the first (GELU) MLP layer.\n",
+        "      num_heads: Number of attention heads.\n",
+        "      dropout_rate: Rate of the dropout applied after attention and after\n",
+        "          the MLP.\n",
+        "      attention_dropout_rate: Dropout rate passed to the attention layer.\n",
+        "      **kwargs: Forwarded to the base `Layer` constructor.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(\n",
+        "        self, embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate, **kwargs\n",
+        "    ):\n",
+        "        super().__init__(**kwargs)\n",
+        "\n",
+        "        def make_init():\n",
+        "            # A fresh initializer object for every kernel.\n",
+        "            return TruncatedNormal(stddev=0.02)\n",
+        "\n",
+        "        # Self-attention over axes 1 and 2 -> 2D attention (timestep, patch)\n",
+        "        self.mha = MultiHeadAttention(\n",
+        "            num_heads=num_heads,\n",
+        "            key_dim=embed_dim,\n",
+        "            dropout=attention_dropout_rate,\n",
+        "            kernel_initializer=make_init(),\n",
+        "            attention_axes=(1, 2),\n",
+        "        )\n",
+        "\n",
+        "        # Point-wise feed-forward network\n",
+        "        self.dense_0 = Dense(\n",
+        "            units=mlp_dim, activation=\"gelu\", kernel_initializer=make_init()\n",
+        "        )\n",
+        "        self.dense_1 = Dense(units=embed_dim, kernel_initializer=make_init())\n",
+        "\n",
+        "        self.dropout_0 = Dropout(rate=dropout_rate)\n",
+        "        self.dropout_1 = Dropout(rate=dropout_rate)\n",
+        "\n",
+        "        self.norm_0 = LayerNormalization(epsilon=1e-12)\n",
+        "        self.norm_1 = LayerNormalization(epsilon=1e-12)\n",
+        "\n",
+        "        self.add_0 = Add()\n",
+        "        self.add_1 = Add()\n",
+        "\n",
+        "    def call(self, inputs, training):\n",
+        "        \"\"\"Applies the attention sub-block and then the MLP sub-block.\"\"\"\n",
+        "        # Attention sub-block with a residual connection to `inputs`\n",
+        "        normed = self.norm_0(inputs)\n",
+        "        attended = self.mha(\n",
+        "            query=normed, key=normed, value=normed, training=training\n",
+        "        )\n",
+        "        attended = self.dropout_0(attended, training=training)\n",
+        "        attn_out = self.add_0([attended, inputs])\n",
+        "\n",
+        "        # MLP sub-block with a residual connection to the attention output\n",
+        "        hidden = self.dense_0(self.norm_1(attn_out))\n",
+        "        hidden = self.dense_1(hidden)\n",
+        "        hidden = self.dropout_1(hidden, training=training)\n",
+        "\n",
+        "        return self.add_1([attn_out, hidden])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "V3n2jEdBRzEo"
+      },
+      "outputs": [],
+      "source": [
+        "class Decoder(Layer):\n",
+        "    \"\"\"Transformer decoder block: self-attention, cross-attention, then an MLP.\n",
+        "\n",
+        "    Each sub-block applies layer normalization first, then dropout on its\n",
+        "    output, then a residual addition. No attention mask is passed to either\n",
+        "    attention layer.\n",
+        "\n",
+        "    Args:\n",
+        "      embed_dim: Passed as `key_dim` to both attention layers and used as the\n",
+        "          number of units of the second MLP layer.\n",
+        "      mlp_dim: Number of units of the first (GELU) MLP layer.\n",
+        "      num_heads: Number of attention heads.\n",
+        "      dropout_rate: Rate of the dropout applied after each sub-block.\n",
+        "      attention_dropout_rate: Dropout rate passed to the attention layers.\n",
+        "      **kwargs: Forwarded to the base `Layer` constructor.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(\n",
+        "        self, embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate, **kwargs\n",
+        "    ):\n",
+        "        super().__init__(**kwargs)\n",
+        "\n",
+        "        def make_init():\n",
+        "            # A fresh initializer object for every kernel.\n",
+        "            return TruncatedNormal(stddev=0.02)\n",
+        "\n",
+        "        # Self-attention over axes 1 and 2 -> 2D attention (timestep, patch)\n",
+        "        self.mha_0 = MultiHeadAttention(\n",
+        "            num_heads=num_heads,\n",
+        "            key_dim=embed_dim,\n",
+        "            dropout=attention_dropout_rate,\n",
+        "            kernel_initializer=make_init(),\n",
+        "            attention_axes=(1, 2),\n",
+        "        )\n",
+        "        # Cross-attention over the encoder output, same 2D attention axes\n",
+        "        self.mha_1 = MultiHeadAttention(\n",
+        "            num_heads=num_heads,\n",
+        "            key_dim=embed_dim,\n",
+        "            dropout=attention_dropout_rate,\n",
+        "            kernel_initializer=make_init(),\n",
+        "            attention_axes=(1, 2),\n",
+        "        )\n",
+        "\n",
+        "        # Point-wise feed-forward network\n",
+        "        self.dense_0 = Dense(\n",
+        "            units=mlp_dim, activation=\"gelu\", kernel_initializer=make_init()\n",
+        "        )\n",
+        "        self.dense_1 = Dense(units=embed_dim, kernel_initializer=make_init())\n",
+        "\n",
+        "        self.dropout_0 = Dropout(rate=dropout_rate)\n",
+        "        self.dropout_1 = Dropout(rate=dropout_rate)\n",
+        "        self.dropout_2 = Dropout(rate=dropout_rate)\n",
+        "\n",
+        "        self.norm_0 = LayerNormalization(epsilon=1e-12)\n",
+        "        self.norm_1 = LayerNormalization(epsilon=1e-12)\n",
+        "        self.norm_2 = LayerNormalization(epsilon=1e-12)\n",
+        "\n",
+        "        self.add_0 = Add()\n",
+        "        self.add_1 = Add()\n",
+        "        self.add_2 = Add()\n",
+        "\n",
+        "    def call(self, inputs, enc_output, training):\n",
+        "        \"\"\"Runs self-attention, cross-attention over `enc_output`, then the MLP.\"\"\"\n",
+        "        # Self-attention sub-block with a residual connection to `inputs`\n",
+        "        normed = self.norm_0(inputs)\n",
+        "        self_attended = self.mha_0(\n",
+        "            query=normed, key=normed, value=normed, training=training\n",
+        "        )\n",
+        "        self_attended = self.dropout_0(self_attended, training=training)\n",
+        "        self_out = self.add_0([self_attended, inputs])\n",
+        "\n",
+        "        # Cross-attention sub-block: queries from the decoder stream, keys and\n",
+        "        # values from the encoder output\n",
+        "        cross_attended = self.mha_1(\n",
+        "            query=self.norm_1(self_out),\n",
+        "            key=enc_output,\n",
+        "            value=enc_output,\n",
+        "            training=training,\n",
+        "        )\n",
+        "        cross_attended = self.dropout_1(cross_attended, training=training)\n",
+        "        cross_out = self.add_1([self_out, cross_attended])\n",
+        "\n",
+        "        # MLP sub-block with a residual connection to the cross-attention output\n",
+        "        hidden = self.dense_0(self.norm_2(cross_out))\n",
+        "        hidden = self.dense_1(hidden)\n",
+        "        hidden = self.dropout_2(hidden, training=training)\n",
+        "\n",
+        "        return self.add_2([cross_out, hidden])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "7O_O6FKlRzE1"
+      },
+      "source": [
+        "## Model"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "jiO-dcXURzE8"
+      },
+      "outputs": [],
+      "source": [
+        "class DailyTransformer(Model):\n",
+        "    \"\"\"Encoder-decoder Transformer operating on raw (un-normalized) measurements.\n",
+        "\n",
+        "    Inputs are normalized inside the model and the output of the final dense\n",
+        "    layer is passed through an inverting `Normalization` layer.\n",
+        "\n",
+        "    Args:\n",
+        "      num_encoder_layers: Number of stacked `Encoder` blocks.\n",
+        "      num_decoder_layers: Number of stacked `Decoder` blocks.\n",
+        "      embed_dim: Embedding size passed to the embeddings and all blocks.\n",
+        "      mlp_dim: Hidden size of the MLP inside each block.\n",
+        "      num_heads: Number of attention heads in each block.\n",
+        "      num_outputs: Number of units of the final dense layer.\n",
+        "      dropout_rate: Dropout rate of the embeddings and blocks.\n",
+        "      attention_dropout_rate: Dropout rate inside the attention layers.\n",
+        "      **kwargs: Forwarded to the base `Model` constructor.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(\n",
+        "        self,\n",
+        "        num_encoder_layers,\n",
+        "        num_decoder_layers,\n",
+        "        embed_dim,\n",
+        "        mlp_dim,\n",
+        "        num_heads,\n",
+        "        num_outputs,\n",
+        "        dropout_rate,\n",
+        "        attention_dropout_rate,\n",
+        "        **kwargs\n",
+        "    ):\n",
+        "        super(DailyTransformer, self).__init__(**kwargs)\n",
+        "\n",
+        "        # Input (normalization of RAW measurements)\n",
+        "        self.input_norm_enc = Normalization(invert=False)\n",
+        "        self.input_norm_dec1 = Normalization(invert=False)\n",
+        "        # Inverting layer applied to the model output (see `call`)\n",
+        "        self.input_norm_dec2 = Normalization(invert=True)\n",
+        "\n",
+        "        # Input\n",
+        "        self.pos_embs_0 = PositionalEmbedding(embed_dim, dropout_rate)\n",
+        "        self.pos_embs_1 = PositionalEmbedding(embed_dim, dropout_rate)\n",
+        "\n",
+        "        # Encoder\n",
+        "        self.enc_layers = [\n",
+        "            Encoder(embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate)\n",
+        "            for _ in range(num_encoder_layers)\n",
+        "        ]\n",
+        "        self.norm_0 = LayerNormalization(epsilon=1e-12)\n",
+        "\n",
+        "        # Decoder\n",
+        "        self.dec_layers = [\n",
+        "            Decoder(embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate)\n",
+        "            for _ in range(num_decoder_layers)\n",
+        "        ]\n",
+        "        self.norm_1 = LayerNormalization(epsilon=1e-12)\n",
+        "\n",
+        "        # Output\n",
+        "        self.final_layer = Dense(\n",
+        "            units=num_outputs,\n",
+        "            kernel_initializer=TruncatedNormal(stddev=0.02),\n",
+        "        )\n",
+        "\n",
+        "    def call(self, inputs, training):\n",
+        "        \"\"\"Runs the encoder on `inputs[0]` and the decoder on `inputs[1]`.\n",
+        "\n",
+        "        Args:\n",
+        "          inputs: Pair `(encoder_inputs, decoder_inputs)`.\n",
+        "          training: Forwarded to the embeddings and to every block.\n",
+        "\n",
+        "        Returns:\n",
+        "          Output of the final dense layer after the inverting normalization.\n",
+        "        \"\"\"\n",
+        "        inputs, targets = inputs\n",
+        "\n",
+        "        # Encoder input\n",
+        "        x_e = self.input_norm_enc(inputs)\n",
+        "        x_e = self.pos_embs_0(x_e, training=training)\n",
+        "\n",
+        "        # Encoder\n",
+        "        for layer in self.enc_layers:\n",
+        "            x_e = layer(x_e, training=training)\n",
+        "        x_e = self.norm_0(x_e)\n",
+        "\n",
+        "        # Decoder input\n",
+        "        x_d = self.input_norm_dec1(targets)\n",
+        "        x_d = self.pos_embs_1(x_d, training=training)\n",
+        "\n",
+        "        # Decoder\n",
+        "        for layer in self.dec_layers:\n",
+        "            x_d = layer(x_d, x_e, training=training)\n",
+        "        x_d = self.norm_1(x_d)\n",
+        "\n",
+        "        # Output\n",
+        "        final_output = self.final_layer(x_d)\n",
+        "        final_output = self.input_norm_dec2(final_output)\n",
+        "\n",
+        "        return final_output\n",
+        "\n",
+        "    @staticmethod\n",
+        "    def _split_teacher_forcing(data):\n",
+        "        \"\"\"Splits an `(inputs, targets)` batch for a shifted-by-one prediction.\n",
+        "\n",
+        "        Returns:\n",
+        "          Tuple `(inputs, targets_inputs, targets_real)`: `inputs` and\n",
+        "          `targets_inputs` drop the last step along axis 1; `targets_real` is\n",
+        "          `targets` from step 1 onwards along axis 1, restricted to the last\n",
+        "          entry of the final axis.\n",
+        "        \"\"\"\n",
+        "        inputs, targets = data\n",
+        "        return inputs[:, :-1], targets[:, :-1], targets[:, 1:, :, -1:]\n",
+        "\n",
+        "    def train_step(self, inputs):\n",
+        "        \"\"\"One optimization step.\n",
+        "\n",
+        "        The loss is computed over all predicted steps; the compiled metrics\n",
+        "        are updated with the last step along axis 1 only.\n",
+        "        \"\"\"\n",
+        "        inputs, targets_inputs, targets_real = self._split_teacher_forcing(inputs)\n",
+        "\n",
+        "        with tf.GradientTape() as tape:\n",
+        "            y_pred = self([inputs, targets_inputs], training=True)\n",
+        "            loss = self.compiled_loss(targets_real, y_pred, regularization_losses=self.losses)\n",
+        "\n",
+        "        # Compute gradients\n",
+        "        trainable_vars = self.trainable_variables\n",
+        "        gradients = tape.gradient(loss, trainable_vars)\n",
+        "\n",
+        "        # Update weights\n",
+        "        self.optimizer.apply_gradients(zip(gradients, trainable_vars))\n",
+        "\n",
+        "        # Update metrics (includes the metric that tracks the loss)\n",
+        "        self.compiled_metrics.update_state(targets_real[:, -1], y_pred[:, -1])\n",
+        "\n",
+        "        # Return a dict mapping metric names to current value\n",
+        "        return {m.name: m.result() for m in self.metrics}\n",
+        "    \n",
+        "    def test_step(self, inputs):\n",
+        "        \"\"\"One evaluation step; uses the same batch split and metric slice as `train_step`.\"\"\"\n",
+        "        inputs, targets_inputs, targets_real = self._split_teacher_forcing(inputs)\n",
+        "\n",
+        "        # Compute predictions\n",
+        "        y_pred = self([inputs, targets_inputs], training=False)\n",
+        "\n",
+        "        # Updates the metrics tracking the loss\n",
+        "        self.compiled_loss(targets_real, y_pred, regularization_losses=self.losses)\n",
+        "\n",
+        "        # Update the metrics\n",
+        "        self.compiled_metrics.update_state(targets_real[:, -1], y_pred[:, -1])\n",
+        "\n",
+        "        # Return a dict mapping metric names to current value\n",
+        "        # Note that it will include the loss (tracked in self.metrics)\n",
+        "        return {m.name: m.result() for m in self.metrics}"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "LwEwVCXTRzFx"
+      },
+      "source": [
+        "## LR scheduler"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "6U3PZiLzRzF1"
+      },
+      "outputs": [],
+      "source": [
+        "def cosine_schedule(base_lr, total_steps, warmup_steps):\n",
+        "    \"\"\"Builds a learning-rate function with linear warmup and cosine decay.\n",
+        "\n",
+        "    Args:\n",
+        "      base_lr: Learning rate reached at the end of the warmup.\n",
+        "      total_steps: Step at which the cosine factor reaches zero.\n",
+        "      warmup_steps: Number of warmup steps; a falsy value disables the warmup.\n",
+        "\n",
+        "    Returns:\n",
+        "      A function `step_fn(epoch)` returning the learning rate for that index.\n",
+        "      The index is incremented by one before it is used.\n",
+        "    \"\"\"\n",
+        "    # Length of the decay phase. A zero length (total_steps == warmup_steps)\n",
+        "    # used to raise ZeroDivisionError; treat it as a one-step decay instead.\n",
+        "    decay_steps = total_steps - warmup_steps\n",
+        "    if decay_steps == 0:\n",
+        "        decay_steps = 1\n",
+        "\n",
+        "    def step_fn(epoch):\n",
+        "        step = epoch + 1\n",
+        "\n",
+        "        # Fraction of the decay phase completed, clipped so the rate stays at\n",
+        "        # its peak during warmup and at zero after `total_steps`.\n",
+        "        progress = (step - warmup_steps) / float(decay_steps)\n",
+        "        progress = tf.clip_by_value(progress, 0.0, 1.0)\n",
+        "\n",
+        "        lr = base_lr * 0.5 * (1.0 + tf.cos(math.pi * progress))\n",
+        "\n",
+        "        if warmup_steps:\n",
+        "            # Linear ramp from step / warmup_steps up to 1.\n",
+        "            lr = lr * tf.minimum(1.0, step / warmup_steps)\n",
+        "\n",
+        "        return lr\n",
+        "\n",
+        "    return step_fn"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "RkGDtc9sRzF6"
+      },
+      "outputs": [],
+      "source": [
+        "class PrintLR(Callback):\n",
+        "    \"\"\"Callback that logs the optimizer's current learning rate to wandb.\"\"\"\n",
+        "\n",
+        "    def on_epoch_end(self, epoch, logs=None):\n",
+        "        # Read the value at the end of the epoch and log it under \"lr\"\n",
+        "        # without committing the wandb step.\n",
+        "        current_lr = self.model.optimizer.lr.numpy()\n",
+        "        wandb.log({\"lr\": current_lr}, commit=False)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "GFi9Y2pORzGA"
+      },
+      "source": [
+        "## Daily Dataset"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Dn615CW3I1nL"
+      },
+      "outputs": [],
+      "source": [
+        "# Folder holding the daily CSV files; change it here to relocate the dataset.\n",
+        "DATA_DIR = \"/content/drive/MyDrive/Solar-Transformer/1984_2022\"\n",
+        "\n",
+        "df_X = pd.read_csv(DATA_DIR + \"/X_all_daily.csv\")\n",
+        "df_y_daily = pd.read_csv(DATA_DIR + \"/y_all_daily.csv\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 89
+        },
+        "id": "1gCRYg_Gzo9U",
+        "outputId": "c73a6bf2-40d0-48de-8356-5be6207c16c3"
+      },
+      "outputs": [],
+      "source": [
+        "# 2D histograms of wind direction against each wind-speed column.\n",
+        "for speed_column, y_label, plot_title in [\n",
+        "    ('WindSpeed1', 'Wind Velocity [m/s]', \"Wind\"),\n",
+        "    ('WindSpeedMin1', 'Min Wind Velocity [m/s]', \"Min Wind\"),\n",
+        "    ('WindSpeedMax1', 'Max Wind Velocity [m/s]', \"Max Wind\"),\n",
+        "]:\n",
+        "    plt.hist2d(df_X['WindDirection1'], df_X[speed_column], bins=(50, 50))\n",
+        "    plt.colorbar()\n",
+        "    plt.xlabel('Wind Direction [deg]')\n",
+        "    plt.ylabel(y_label)\n",
+        "    plt.title(plot_title)\n",
+        "    plt.show()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "TfiNz_9LkNF7"
+      },
+      "outputs": [],
+      "source": [
+        "# NOTE: `pop` removes 'DateTime' from df_X in place and df_X is reassigned\n",
+        "# below, so this cell can only run once per load of the CSV files.\n",
+        "date_time = pd.to_datetime(df_X.pop('DateTime'), format='%Y-%m-%d')\n",
+        "num_of_patches = df_X['Name'].nunique()\n",
+        "\n",
+        "# Columns with these prefixes are dropped. 'WindSpeed' also matches the\n",
+        "# 'WindSpeedMin*' and 'WindSpeedMax*' columns, so they need no separate entry.\n",
+        "wind_prefixes = ('WindSpeed', 'WindDirection')\n",
+        "\n",
+        "df_X = df_X.drop(\n",
+        "    columns=['Name', 'Latitude', 'Longitude'] +\n",
+        "            [c for c in df_X.columns if c.startswith(wind_prefixes)]\n",
+        ")\n",
+        "df_y_daily = df_y_daily.drop(\n",
+        "    columns=['DateTime', 'Name', 'Latitude', 'Longitude'] +\n",
+        "            [c for c in df_y_daily.columns if c.startswith(wind_prefixes)]\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "SYWfWTrx-WQy",
+        "outputId": "89eb1644-7546-4a57-829a-304db5954445"
+      },
+      "outputs": [],
+      "source": [
+        "# Rich table preview of the first rows of both frames.\n",
+        "display(df_X.head(), df_y_daily.head())"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 89
+        },
+        "id": "OXZLIkjIW1Nn",
+        "outputId": "e8a07e48-ad19-4c85-cfd6-4040a00f95fe"
+      },
+      "outputs": [],
+      "source": [
+        "# One 2D histogram per row: (x column, y column, x label, y label, title)\n",
+        "wind_vector_panels = (\n",
+        "    ('WindX1', 'WindY1', 'Wind X [m/s]', 'Wind Y [m/s]', \"Wind vector\"),\n",
+        "    ('WindXMin1', 'WindYMin1', 'Min Wind X [m/s]', 'Min Wind Y [m/s]', \"Min Wind vector\"),\n",
+        "    ('WindXMax1', 'WindYMax1', 'Max Wind X [m/s]', 'Max Wind Y [m/s]', \"Max Wind vector\"),\n",
+        ")\n",
+        "\n",
+        "for x_col, y_col, x_label, y_label, panel_title in wind_vector_panels:\n",
+        "    plt.hist2d(df_X[x_col], df_X[y_col], bins=(50, 50))\n",
+        "    plt.colorbar()\n",
+        "    plt.xlabel(x_label)\n",
+        "    plt.ylabel(y_label)\n",
+        "    plt.title(panel_title)\n",
+        "    ax = plt.gca()\n",
+        "    ax.axis('tight')\n",
+        "    plt.show()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 124
+        },
+        "id": "oXGHnjU62ooH",
+        "outputId": "d0b13b5f-24a6-432e-847d-34edfcb7644f"
+      },
+      "outputs": [],
+      "source": [
+        "# Rows shown in the plots: the first (5856 + 5840 + 5840 + 5840) rows, keeping every\n",
+        "# num_of_patches-th row (presumably one location over four years - TODO confirm)\n",
+        "plot_rows = slice(None, 5856 + 5840 + 5840 + 5840, num_of_patches)\n",
+        "x_data = date_time[plot_rows]\n",
+        "\n",
+        "\n",
+        "def _plot_time_series(columns, title, ylabel=None, legend=None):\n",
+        "    \"\"\"Plot the selected rows of the given df_X columns against their dates.\"\"\"\n",
+        "    plt.figure(figsize=(16, 4))\n",
+        "    for column in columns:\n",
+        "        plt.plot(x_data, df_X[column][plot_rows])\n",
+        "    if ylabel is not None:\n",
+        "        plt.ylabel(ylabel)\n",
+        "    plt.xlabel(\"Date\")\n",
+        "    plt.title(title)\n",
+        "    if legend is not None:\n",
+        "        plt.legend(legend)\n",
+        "    plt.show()\n",
+        "\n",
+        "\n",
+        "_plot_time_series([\"Irradiance1\"], \"Solar irradiance\", ylabel='kW-hr/m^2/day')\n",
+        "_plot_time_series(\n",
+        "    [\"Temp1\", \"TempMin1\", \"TempMax1\"],\n",
+        "    \"Temperature\",\n",
+        "    ylabel='°C',\n",
+        "    legend=[\"Mean\", \"Min\", \"Max\"],\n",
+        ")\n",
+        "_plot_time_series([\"Humidity1\"], \"Humidity\", ylabel='%')\n",
+        "_plot_time_series([\"Pressure1\"], \"Pressure\", ylabel='kPa')\n",
+        "_plot_time_series([\"DaySin\", \"DayCos\"], \"Time of year signal\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Zx4fxfnTSdDE"
+      },
+      "source": [
+        "## Dataset"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "pg3PA-58zvaW"
+      },
+      "outputs": [],
+      "source": [
+        "def make_dataset(data, sequence_length, sequence_stride, sampling_rate):\n",
+        "    \"\"\"Split `data` along its first axis and turn every split into sliding windows.\n",
+        "\n",
+        "    `data` is converted to float32 and reshaped to\n",
+        "    (-1, num_of_patches, n_features) using the global `num_of_patches`; the\n",
+        "    first axis is then cut, in order, into 80% / 10% / 10% train / validation /\n",
+        "    test parts.\n",
+        "\n",
+        "    Args:\n",
+        "        data: array-like whose last axis holds the features.\n",
+        "        sequence_length: number of elements in every window.\n",
+        "        sequence_stride: shift between the starts of consecutive windows.\n",
+        "        sampling_rate: stride between the elements inside one window.\n",
+        "\n",
+        "    Returns:\n",
+        "        ((n_train, train_ds), (n_val, val_ds), test_ds), where the counts are the\n",
+        "        numbers of first-axis rows of the split before windowing and every\n",
+        "        dataset yields windows of `sequence_length` elements.\n",
+        "    \"\"\"\n",
+        "    def to_windows(split):\n",
+        "        # Incomplete windows at the end of a split are discarded\n",
+        "        return (\n",
+        "            tf.data.Dataset.from_tensor_slices(split)\n",
+        "            .window(sequence_length, shift=sequence_stride, stride=sampling_rate, drop_remainder=True)\n",
+        "            .flat_map(lambda window: window.batch(sequence_length, drop_remainder=True))\n",
+        "        )\n",
+        "\n",
+        "    samples = np.array(data, dtype=np.float32)\n",
+        "    samples = np.reshape(samples, (-1, num_of_patches, samples.shape[-1]))\n",
+        "\n",
+        "    # Split boundaries on the first axis: 80% train, 10% validation, 10% test\n",
+        "    total = samples.shape[0]\n",
+        "    train_end = int(total*0.8)\n",
+        "    val_end = int(total*0.9)\n",
+        "    train_split, val_split, test_split = np.split(samples, [train_end, val_end])\n",
+        "\n",
+        "    return (\n",
+        "        (train_end, to_windows(train_split)),\n",
+        "        (val_end - train_end, to_windows(val_split)),\n",
+        "        to_windows(test_split),\n",
+        "    )\n",
+        "\n",
+        "def merge_dataset(datasets, batch_size, shuffle):\n",
+        "    \"\"\"Zip several datasets element-wise, optionally shuffle, then batch.\n",
+        "\n",
+        "    Args:\n",
+        "        datasets: tuple of tf.data.Dataset objects to zip together.\n",
+        "        batch_size: number of zipped elements per batch.\n",
+        "        shuffle: if True, shuffle the zipped elements with a 1000-element buffer.\n",
+        "\n",
+        "    Returns:\n",
+        "        The batched tf.data.Dataset, with prefetching applied to the batches.\n",
+        "    \"\"\"\n",
+        "    dataset = tf.data.Dataset.zip(datasets)\n",
+        "\n",
+        "    if shuffle:\n",
+        "        # Shuffle locally at each iteration\n",
+        "        dataset = dataset.shuffle(buffer_size=1000)\n",
+        "    dataset = dataset.batch(batch_size)\n",
+        "\n",
+        "    # Prefetch as the last step so that complete batches are prepared ahead of\n",
+        "    # the consumer (previously only single, unbatched elements were prefetched)\n",
+        "    return dataset.prefetch(tf.data.AUTOTUNE)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "s0-fDcMDRzGS"
+      },
+      "source": [
+        "## Training loop"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Ogf0A_urhRCh"
+      },
+      "outputs": [],
+      "source": [
+        "def training_loop(cfg):\n",
+        "    # load dataset\n",
+        "    (n_train_X, train_X_ds), (n_val_X, val_X_ds), _ = make_dataset(df_X, (cfg.window_size + 1), 1, 1)\n",
+        "    (n_train_y, train_y_daily_ds), (n_val_y, val_y_daily_ds), _ = make_dataset(df_y_daily, (cfg.window_size + 1), 1, 1)\n",
+        "    assert n_train_X == n_train_y\n",
+        "    assert n_val_X == n_val_y\n",
+        "\n",
+        "    train_ds = merge_dataset(\n",
+        "        (train_X_ds, train_y_daily_ds),\n",
+        "        cfg.batch_size,\n",
+        "        shuffle=True,\n",
+        "    )\n",
+        "    val_ds = merge_dataset(\n",
+        "        (val_X_ds, val_y_daily_ds),\n",
+        "        cfg.batch_size,\n",
+        "        shuffle=False,\n",
+        "    )\n",
+        "\n",
+        "    # Generate new model\n",
+        "    daily_model = DailyTransformer(\n",
+        "      num_encoder_layers=cfg.num_encoder_layers,\n",
+        "      num_decoder_layers=cfg.num_decoder_layers,\n",
+        "      embed_dim=cfg.embed_layer_size,\n",
+        "      mlp_dim=cfg.fc_layer_size,\n",
+        "      num_heads=cfg.num_heads,\n",
+        "      num_outputs=1,\n",
+        "      dropout_rate=cfg.dropout,\n",
+        "      attention_dropout_rate=cfg.attention_dropout,\n",
+        "    )\n",
+        "\n",
+        "    # adapt on inputs of training dataset - must be before model.compile !!!\n",
+        "    daily_model.input_norm_enc.adapt(train_X_ds)\n",
+        "    print(daily_model.input_norm_enc.variables)\n",
+        "\n",
+        "    # adapt on targets of training dataset - must be before model.compile !!!\n",
+        "    daily_model.input_norm_dec1.adapt(train_y_daily_ds)\n",
+        "    print(daily_model.input_norm_dec1.variables)\n",
+        "    daily_model.input_norm_dec2.adapt(train_y_daily_ds.map(lambda x: x[:, :, -1:]))\n",
+        "    print(daily_model.input_norm_dec2.variables)\n",
+        "\n",
+        "    # Select optimizer\n",
+        "    if cfg.optimizer == \"adam\":\n",
+        "      optim = Adam(\n",
+        "          beta_1=0.9,\n",
+        "          beta_2=0.999,\n",
+        "          epsilon=1e-08,\n",
+        "          global_clipnorm=cfg.global_clipnorm,\n",
+        "      )\n",
+        "    elif cfg.optimizer == \"adamw\":\n",
+        "      optim = tfa.optimizers.AdamW(\n",
+        "          weight_decay=cfg.weight_decay,\n",
+        "          beta_1=0.9,\n",
+        "          beta_2=0.999,\n",
+        "          epsilon=1e-08,\n",
+        "          global_clipnorm=cfg.global_clipnorm,\n",
+        "          exclude_from_weight_decay=[\"layer_normalization\", \"bias\", \"temporal_position\", \"spatial_position\"],\n",
+        "      )\n",
+        "    else:\n",
+        "      raise ValueError(\"The used optimizer is not in list of available\")\n",
+        "\n",
+        "    daily_model.compile(\n",
+        "        optimizer=optim,\n",
+        "        loss=\"log_cosh\",\n",
+        "        metrics=[MeanSquaredError(), RootMeanSquaredError(), MeanAbsoluteError(), RSquare()]   \n",
+        "    )\n",
+        "\n",
+        "    # Train model\n",
+        "    daily_model.fit(\n",
+        "        train_ds,\n",
+        "        epochs=cfg.epochs,\n",
+        "        validation_data=val_ds,\n",
+        "        callbacks=[\n",
+        "            LearningRateScheduler(cosine_schedule(base_lr=cfg.learning_rate, total_steps=cfg.epochs, warmup_steps=cfg.warmup_steps)),\n",
+        "            PrintLR(),\n",
+        "            WandbCallback(monitor=\"val_mean_squared_error\", mode='min', save_weights_only=True),\n",
+        "            EarlyStopping(monitor=\"val_mean_squared_error\", mode='min', min_delta=1e-4, patience=10, restore_best_weights=True, verbose=1),\n",
+        "        ],\n",
+        "        verbose=1\n",
+        "    )\n",
+        "\n",
+        "    daily_model.summary()\n",
+        "\n",
+        "    patch_similarity_plot(daily_model.pos_embs_0.spatial_position[0, 0])\n",
+        "    patch_similarity_plot(daily_model.pos_embs_1.spatial_position[0, 0])\n",
+        "    \n",
+        "    timestep_similarity_plot(daily_model.pos_embs_0.temporal_position[0, :, 0])\n",
+        "    timestep_similarity_plot(daily_model.pos_embs_1.temporal_position[0, :, 0])\n",
+        "\n",
+        "    for inputs in val_ds.take(1):\n",
+        "        plot_prediction(inputs, daily_model)\n",
+        "\n",
+        "    # Resets all state generated by Keras\n",
+        "    tf.keras.backend.clear_session()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "RZLpSuL-RzGb"
+      },
+      "outputs": [],
+      "source": [
+        "def run(config=None):\n",
+        "    \"\"\"Run one sweep trial: start a W&B run and train with its configuration.\n",
+        "\n",
+        "    The trial ends without training when the sampled configuration has\n",
+        "    fc_layer_size < embed_layer_size or warmup_steps >= epochs.\n",
+        "\n",
+        "    Args:\n",
+        "        config: optional configuration forwarded to wandb.init.\n",
+        "    \"\"\"\n",
+        "    with wandb.init(config=config):\n",
+        "        sweep_cfg = wandb.config\n",
+        "\n",
+        "        # check rules - configurations that break either rule are skipped\n",
+        "        if (sweep_cfg.fc_layer_size < sweep_cfg.embed_layer_size\n",
+        "                or sweep_cfg.warmup_steps >= sweep_cfg.epochs):\n",
+        "            return\n",
+        "\n",
+        "        training_loop(sweep_cfg)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "AXDG0ODuRzGj",
+        "outputId": "aaf73d8c-9c01-4dc7-b3da-df7f9d07a78a"
+      },
+      "outputs": [],
+      "source": [
+        "wandb.agent(sweep_id, run, count=1024)"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [
+        "XgDBs9_3l4uD",
+        "oQyRcTjTRzEE",
+        "7O_O6FKlRzE1",
+        "LwEwVCXTRzFx",
+        "GFi9Y2pORzGA",
+        "Zx4fxfnTSdDE"
+      ],
+      "machine_shape": "hm",
+      "name": "Solar_Transformer_2.ipynb",
+      "provenance": []
+    },
+    "gpuClass": "standard",
+    "kernelspec": {
+      "display_name": "Python 3.9.10 ('base')",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.9.10"
+    },
+    "vscode": {
+      "interpreter": {
+        "hash": "9185113d2128201d66faecd4f34fb34e89a635073a034991399523e584519355"
+      }
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}

img/Solar_Transformer.png ADDED Viewed

img/output.png ADDED Viewed

models/model-best.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d6846b5ec551c96968b7154d6dba026320eaabcbd36457e9bc555896ff22b21
+size 7534528