{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install wandb tensorflow_probability tensorflow_addons" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from tensorflow.keras.layers import Add, Dense, Dropout, Layer, LayerNormalization, MultiHeadAttention\n", "from tensorflow.keras.models import Model\n", "from tensorflow.keras.initializers import TruncatedNormal\n", "from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError, MeanAbsoluteError, MeanAbsolutePercentageError\n", "from tensorflow_addons.metrics import RSquare\n", "\n", "import pandas as pd\n", "import tensorflow as tf\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plotting" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_prediction(targets, predictions, max_subplots=3):\n", " plt.figure(figsize=(12, 15))\n", " max_n = min(max_subplots, len(targets))\n", " for n in range(max_n):\n", " # input\n", " plt.subplot(max_n, 1, n+1)\n", " plt.ylabel('Solar irradiance [kW-hr/m^2/day]', fontfamily=\"Arial\", fontsize=16)\n", " plt.plot(np.arange(targets.shape[1]-horizon), targets[n, :-horizon, 0, -1], label='Inputs', marker='.', zorder=-10)\n", "\n", " # real\n", " plt.scatter(np.arange(1, targets.shape[1]), targets[n, 1:, 0, -1], edgecolors='k', label='Targets', c='#2cb01d', s=64)\n", " \n", " # predicted\n", " plt.scatter(np.arange(1, targets.shape[1]), predictions[n, :, 0, -1], marker='X', edgecolors='k', label='Predictions', c='#fe7e0f', s=64)\n", "\n", " if n == 0:\n", " plt.legend()\n", "\n", " plt.xlabel('Time [day]', fontfamily=\"Arial\", fontsize=16)\n", " plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def patch_similarity_plot(pos):\n", " similarity_scores = np.dot(\n", " pos, np.transpose(pos)\n", " ) / (\n", " np.linalg.norm(pos, axis=-1)\n", " * np.linalg.norm(pos, axis=-1)\n", " )\n", "\n", " plt.figure(figsize=(7, 7), dpi=300)\n", " ax = sns.heatmap(similarity_scores, center=0)\n", " ax.set_title(\"Spatial Positional Embedding\", fontfamily=\"Arial\", fontsize=16)\n", " ax.set_xlabel(\"Patch\", fontfamily=\"Arial\", fontsize=16)\n", " ax.set_ylabel(\"Patch\", fontfamily=\"Arial\", fontsize=16)\n", " plt.show()\n", "\n", "def timestep_similarity_plot(pos):\n", " similarity_scores = np.dot(\n", " pos, np.transpose(pos)\n", " ) / (\n", " np.linalg.norm(pos, axis=-1)\n", " * np.linalg.norm(pos, axis=-1)\n", " )\n", "\n", " plt.figure(figsize=(7, 7), dpi=300)\n", " ax = sns.heatmap(similarity_scores, center=0)\n", " ax.set_title(\"Temporal Positional Embedding\", fontfamily=\"Arial\", fontsize=16)\n", " ax.set_xlabel(\"Timestep\", fontfamily=\"Arial\", fontsize=16)\n", " ax.set_ylabel(\"Timestep\", fontfamily=\"Arial\", fontsize=16)\n", " plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Layer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class Normalization(tf.keras.layers.experimental.preprocessing.PreprocessingLayer):\n", " \"\"\"A preprocessing layer which normalizes continuous features.\n", " This layer will shift and scale inputs into a distribution centered around\n", " 0 with standard deviation 1. 
It accomplishes this by precomputing the mean\n", " and variance of the data, and calling `(input - mean) / sqrt(var)` at\n", " runtime.\n", " The mean and variance values for the layer must be either supplied on\n", " construction or learned via `adapt()`. `adapt()` will compute the mean and\n", " variance of the data and store them as the layer's weights. `adapt()` should\n", " be called before `fit()`, `evaluate()`, or `predict()`.\n", " For an overview and full list of preprocessing layers, see the preprocessing\n", " [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers).\n", " Args:\n", " axis: Integer, tuple of integers, or None. The axis or axes that should\n", " have a separate mean and variance for each index in the shape. For\n", " example, if shape is `(None, 5)` and `axis=1`, the layer will track 5\n", " separate mean and variance values for the last axis. If `axis` is set\n", " to `None`, the layer will normalize all elements in the input by a\n", " scalar mean and variance. Defaults to -1, where the last axis of the\n", " input is assumed to be a feature dimension and is normalized per\n", " index. Note that in the specific case of batched scalar inputs where\n", " the only axis is the batch axis, the default will normalize each index\n", " in the batch separately. In this case, consider passing `axis=None`.\n", " mean: The mean value(s) to use during normalization. The passed value(s)\n", " will be broadcast to the shape of the kept axes above; if the value(s)\n", " cannot be broadcast, an error will be raised when this layer's\n", " `build()` method is called.\n", " variance: The variance value(s) to use during normalization. The passed\n", " value(s) will be broadcast to the shape of the kept axes above; if the\n", " value(s) cannot be broadcast, an error will be raised when this\n", " layer's `build()` method is called.\n", " invert: If True, this layer will apply the inverse transformation\n", " to its inputs: it would turn a normalized input back into its\n", " original form.\n", " Examples:\n", " Calculate a global mean and variance by analyzing the dataset in `adapt()`.\n", " >>> adapt_data = np.array([1., 2., 3., 4., 5.], dtype='float32')\n", " >>> input_data = np.array([1., 2., 3.], dtype='float32')\n", " >>> layer = tf.keras.layers.Normalization(axis=None)\n", " >>> layer.adapt(adapt_data)\n", " >>> layer(input_data)\n", " \n", " Calculate a mean and variance for each index on the last axis.\n", " >>> adapt_data = np.array([[0., 7., 4.],\n", " ... [2., 9., 6.],\n", " ... [0., 7., 4.],\n", " ... [2., 9., 6.]], dtype='float32')\n", " >>> input_data = np.array([[0., 7., 4.]], dtype='float32')\n", " >>> layer = tf.keras.layers.Normalization(axis=-1)\n", " >>> layer.adapt(adapt_data)\n", " >>> layer(input_data)\n", " \n", " Pass the mean and variance directly.\n", " >>> input_data = np.array([[1.], [2.], [3.]], dtype='float32')\n", " >>> layer = tf.keras.layers.Normalization(mean=3., variance=2.)\n", " >>> layer(input_data)\n", " \n", " Use the layer to de-normalize inputs (after adapting the layer).\n", " >>> adapt_data = np.array([[0., 7., 4.],\n", " ... [2., 9., 6.],\n", " ... [0., 7., 4.],\n", " ... 
[2., 9., 6.]], dtype='float32')\n", " >>> input_data = np.array([[1., 2., 3.]], dtype='float32')\n", " >>> layer = tf.keras.layers.Normalization(axis=-1, invert=True)\n", " >>> layer.adapt(adapt_data)\n", " >>> layer(input_data)\n", " \n", " \"\"\"\n", "\n", " def __init__(\n", " self, axis=-1, mean=None, variance=None, invert=False, **kwargs\n", " ):\n", " super().__init__(**kwargs)\n", "\n", " # Standardize `axis` to a tuple.\n", " if axis is None:\n", " axis = ()\n", " elif isinstance(axis, int):\n", " axis = (axis,)\n", " else:\n", " axis = tuple(axis)\n", " self.axis = axis\n", "\n", " # Set `mean` and `variance` if passed.\n", " if isinstance(mean, tf.Variable):\n", " raise ValueError(\n", " \"Normalization does not support passing a Variable \"\n", " \"for the `mean` init arg.\"\n", " )\n", " if isinstance(variance, tf.Variable):\n", " raise ValueError(\n", " \"Normalization does not support passing a Variable \"\n", " \"for the `variance` init arg.\"\n", " )\n", " if (mean is not None) != (variance is not None):\n", " raise ValueError(\n", " \"When setting values directly, both `mean` and `variance` \"\n", " \"must be set. Got mean: {} and variance: {}\".format(\n", " mean, variance\n", " )\n", " )\n", " self.input_mean = mean\n", " self.input_variance = variance\n", " self.invert = invert\n", "\n", " def build(self, input_shape):\n", " super().build(input_shape)\n", "\n", " if isinstance(input_shape, (list, tuple)) and all(\n", " isinstance(shape, tf.TensorShape) for shape in input_shape\n", " ):\n", " raise ValueError(\n", " \"Normalization only accepts a single input. If you are \"\n", " \"passing a python list or tuple as a single input, \"\n", " \"please convert to a numpy array or `tf.Tensor`.\"\n", " )\n", "\n", " input_shape = tf.TensorShape(input_shape).as_list()\n", " ndim = len(input_shape)\n", "\n", " if any(a < -ndim or a >= ndim for a in self.axis):\n", " raise ValueError(\n", " \"All `axis` values must be in the range [-ndim, ndim). \"\n", " \"Found ndim: `{}`, axis: {}\".format(ndim, self.axis)\n", " )\n", "\n", " # Axes to be kept, replacing negative values with positive equivalents.\n", " # Sorted to avoid transposing axes.\n", " self._keep_axis = sorted([d if d >= 0 else d + ndim for d in self.axis])\n", " # All axes to be kept should have known shape.\n", " for d in self._keep_axis:\n", " if input_shape[d] is None:\n", " raise ValueError(\n", " \"All `axis` values to be kept must have known shape. 
\"\n", " \"Got axis: {}, \"\n", " \"input shape: {}, with unknown axis at index: {}\".format(\n", " self.axis, input_shape, d\n", " )\n", " )\n", " # Axes to be reduced.\n", " self._reduce_axis = [d for d in range(ndim) if d not in self._keep_axis]\n", " # 1 if an axis should be reduced, 0 otherwise.\n", " self._reduce_axis_mask = [\n", " 0 if d in self._keep_axis else 1 for d in range(ndim)\n", " ]\n", " # Broadcast any reduced axes.\n", " self._broadcast_shape = [\n", " input_shape[d] if d in self._keep_axis else 1 for d in range(ndim)\n", " ]\n", " mean_and_var_shape = tuple(input_shape[d] for d in self._keep_axis)\n", "\n", " if self.input_mean is None:\n", " self.adapt_mean = self.add_weight(\n", " name=\"mean\",\n", " shape=mean_and_var_shape,\n", " dtype=self.compute_dtype,\n", " initializer=\"zeros\",\n", " trainable=False,\n", " )\n", " self.adapt_variance = self.add_weight(\n", " name=\"variance\",\n", " shape=mean_and_var_shape,\n", " dtype=self.compute_dtype,\n", " initializer=\"ones\",\n", " trainable=False,\n", " )\n", " self.count = self.add_weight(\n", " name=\"count\",\n", " shape=(),\n", " dtype=tf.int64,\n", " initializer=\"zeros\",\n", " trainable=False,\n", " )\n", " self.finalize_state()\n", " else:\n", " # In the no adapt case, make constant tensors for mean and variance\n", " # with proper broadcast shape for use during call.\n", " mean = self.input_mean * np.ones(mean_and_var_shape)\n", " variance = self.input_variance * np.ones(mean_and_var_shape)\n", " mean = tf.reshape(mean, self._broadcast_shape)\n", " variance = tf.reshape(variance, self._broadcast_shape)\n", " self.mean = tf.cast(mean, self.compute_dtype)\n", " self.variance = tf.cast(variance, self.compute_dtype)\n", "\n", " # We override this method solely to generate a docstring.\n", " def adapt(self, data, batch_size=None, steps=None):\n", " \"\"\"Computes the mean and variance of values in a dataset.\n", " Calling `adapt()` on a `Normalization` layer is an alternative to\n", " passing in `mean` and `variance` arguments during layer construction. A\n", " `Normalization` layer should always either be adapted over a dataset or\n", " passed `mean` and `variance`.\n", " During `adapt()`, the layer will compute a `mean` and `variance`\n", " separately for each position in each axis specified by the `axis`\n", " argument. To calculate a single `mean` and `variance` over the input\n", " data, simply pass `axis=None`.\n", " In order to make `Normalization` efficient in any distribution context,\n", " the computed mean and variance are kept static with respect to any\n", " compiled `tf.Graph`s that call the layer. As a consequence, if the layer\n", " is adapted a second time, any models using the layer should be\n", " re-compiled. For more information see\n", " `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`.\n", " `adapt()` is meant only as a single machine utility to compute layer\n", " state. To analyze a dataset that cannot fit on a single machine, see\n", " [Tensorflow Transform](\n", " https://www.tensorflow.org/tfx/transform/get_started)\n", " for a multi-machine, map-reduce solution.\n", " Arguments:\n", " data: The data to train on. 
It can be passed either as a\n", " `tf.data.Dataset`, or as a numpy array.\n", " batch_size: Integer or `None`.\n", " Number of samples per state update.\n", " If unspecified, `batch_size` will default to 32.\n", " Do not specify the `batch_size` if your data is in the\n", " form of datasets, generators, or `keras.utils.Sequence` instances\n", " (since they generate batches).\n", " steps: Integer or `None`.\n", " Total number of steps (batches of samples)\n", " When training with input tensors such as\n", " TensorFlow data tensors, the default `None` is equal to\n", " the number of samples in your dataset divided by\n", " the batch size, or 1 if that cannot be determined. If x is a\n", " `tf.data` dataset, and 'steps' is None, the epoch will run until\n", " the input dataset is exhausted. When passing an infinitely\n", " repeating dataset, you must specify the `steps` argument. This\n", " argument is not supported with array inputs.\n", " \"\"\"\n", " super().adapt(data, batch_size=batch_size, steps=steps)\n", "\n", " def update_state(self, data):\n", " if self.input_mean is not None:\n", " raise ValueError(\n", " \"Cannot `adapt` a Normalization layer that is initialized with \"\n", " \"static `mean` and `variance`, \"\n", " \"you passed mean {} and variance {}.\".format(\n", " self.input_mean, self.input_variance\n", " )\n", " )\n", "\n", " if not self.built:\n", " raise RuntimeError(\"`build` must be called before `update_state`.\")\n", "\n", " data = self._standardize_inputs(data)\n", " data = tf.cast(data, self.adapt_mean.dtype)\n", " batch_mean, batch_variance = tf.nn.moments(data, axes=self._reduce_axis)\n", " batch_shape = tf.shape(data, out_type=self.count.dtype)\n", " if self._reduce_axis:\n", " batch_reduce_shape = tf.gather(batch_shape, self._reduce_axis)\n", " batch_count = tf.reduce_prod(batch_reduce_shape)\n", " else:\n", " batch_count = 1\n", "\n", " total_count = batch_count + self.count\n", " batch_weight = tf.cast(batch_count, dtype=self.compute_dtype) / tf.cast(\n", " total_count, dtype=self.compute_dtype\n", " )\n", " existing_weight = 1.0 - batch_weight\n", "\n", " total_mean = (\n", " self.adapt_mean * existing_weight + batch_mean * batch_weight\n", " )\n", " # The variance is computed using the lack-of-fit sum of squares\n", " # formula (see\n", " # https://en.wikipedia.org/wiki/Lack-of-fit_sum_of_squares).\n", " total_variance = (\n", " self.adapt_variance + (self.adapt_mean - total_mean) ** 2\n", " ) * existing_weight + (\n", " batch_variance + (batch_mean - total_mean) ** 2\n", " ) * batch_weight\n", " self.adapt_mean.assign(total_mean)\n", " self.adapt_variance.assign(total_variance)\n", " self.count.assign(total_count)\n", "\n", " def reset_state(self):\n", " if self.input_mean is not None or not self.built:\n", " return\n", "\n", " self.adapt_mean.assign(tf.zeros_like(self.adapt_mean))\n", " self.adapt_variance.assign(tf.ones_like(self.adapt_variance))\n", " self.count.assign(tf.zeros_like(self.count))\n", "\n", " def finalize_state(self):\n", " if self.input_mean is not None or not self.built:\n", " return\n", "\n", " # In the adapt case, we make constant tensors for mean and variance with\n", " # proper broadcast shape and dtype each time `finalize_state` is called.\n", " self.mean = tf.reshape(self.adapt_mean, self._broadcast_shape)\n", " self.mean = tf.cast(self.mean, self.compute_dtype)\n", " self.variance = tf.reshape(self.adapt_variance, self._broadcast_shape)\n", " self.variance = tf.cast(self.variance, self.compute_dtype)\n", "\n", " def call(self, 
inputs):\n", "        inputs = self._standardize_inputs(inputs)\n", "        # The base layer automatically casts floating-point inputs, but we\n", "        # explicitly cast here to also allow integer inputs to be passed\n", "        inputs = tf.cast(inputs, self.compute_dtype)\n", "        if self.invert:\n", "            # Inverse transform: scale by the standard deviation first, then\n", "            # shift by the mean (the reverse of `(x - mean) / std`).\n", "            return self.mean + inputs * tf.maximum(\n", "                tf.sqrt(self.variance), tf.keras.backend.epsilon()\n", "            )\n", "        else:\n", "            return (inputs - self.mean) / tf.maximum(\n", "                tf.sqrt(self.variance), tf.keras.backend.epsilon()\n", "            )\n", "\n", "    def compute_output_shape(self, input_shape):\n", "        return input_shape\n", "\n", "    def compute_output_signature(self, input_spec):\n", "        return input_spec\n", "\n", "    def get_config(self):\n", "        config = super().get_config()\n", "        # Convert `mean`/`variance` to plain Python lists so the config stays\n", "        # serializable.\n", "        config.update(\n", "            {\n", "                \"axis\": self.axis,\n", "                \"mean\": np.asarray(self.input_mean).tolist() if self.input_mean is not None else None,\n", "                \"variance\": np.asarray(self.input_variance).tolist() if self.input_variance is not None else None,\n", "            }\n", "        )\n", "        return config\n", "\n", "    def _standardize_inputs(self, inputs):\n", "        inputs = tf.convert_to_tensor(inputs)\n", "        if inputs.dtype != self.compute_dtype:\n", "            inputs = tf.cast(inputs, self.compute_dtype)\n", "        return inputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class PositionalEmbedding(Layer):\n", "    def __init__(self, units, dropout_rate, **kwargs):\n", "        super(PositionalEmbedding, self).__init__(**kwargs)\n", "\n", "        self.units = units\n", "\n", "        self.projection = Dense(units, kernel_initializer=TruncatedNormal(stddev=0.02))\n", "        self.dropout = Dropout(rate=dropout_rate)\n", "\n", "    def build(self, input_shape):\n", "        super(PositionalEmbedding, self).build(input_shape)\n", "\n", "        print(\"pos_embedding: \", input_shape)\n", "        self.temporal_position = self.add_weight(\n", "            name=\"temporal_position\",\n", "            shape=(1, input_shape[1], 1, self.units),\n", "            initializer=TruncatedNormal(stddev=0.02),\n", "            trainable=True,\n", "        )\n", "        self.spatial_position = self.add_weight(\n", "            name=\"spatial_position\",\n", "            shape=(1, 1, input_shape[2], self.units),\n", "            initializer=TruncatedNormal(stddev=0.02),\n", "            trainable=True,\n", "        )\n", "\n", "    def call(self, inputs, training):\n", "        # Project the raw features to the embedding dimension and add learned\n", "        # temporal (per-timestep) and spatial (per-patch) position embeddings.\n", "        x = self.projection(inputs)\n", "        x += self.temporal_position\n", "        x += self.spatial_position\n", "\n", "        return self.dropout(x, training=training)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class Encoder(Layer):\n", "    def __init__(\n", "        self, embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate, **kwargs\n", "    ):\n", "        super(Encoder, self).__init__(**kwargs)\n", "\n", "        # Multi-head Attention\n", "        self.mha = MultiHeadAttention(\n", "            num_heads=num_heads,\n", "            key_dim=embed_dim,\n", "            dropout=attention_dropout_rate,\n", "            kernel_initializer=TruncatedNormal(stddev=0.02),\n", "            attention_axes=(1, 2),  # 2D attention (timestep, patch)\n", "        )\n", "\n", "        # Point-wise feed-forward network\n", "        self.dense_0 = Dense(\n", "            units=mlp_dim,\n", "            activation=\"gelu\",\n", "            kernel_initializer=TruncatedNormal(stddev=0.02),\n", "        )\n", "        self.dense_1 = Dense(\n", "            units=embed_dim, kernel_initializer=TruncatedNormal(stddev=0.02)\n", "        )\n", "\n", "        self.dropout_0 = Dropout(rate=dropout_rate)\n", "        self.dropout_1 = Dropout(rate=dropout_rate)\n", "\n", "        self.norm_0 = LayerNormalization(epsilon=1e-12)\n", "        self.norm_1 = 
LayerNormalization(epsilon=1e-12)\n", "\n", " self.add_0 = Add()\n", " self.add_1 = Add()\n", "\n", " def call(self, inputs, training):\n", " # Attention block\n", " x = self.norm_0(inputs)\n", " x = self.mha(\n", " query=x,\n", " key=x,\n", " value=x,\n", " training=training,\n", " )\n", " x = self.dropout_0(x, training=training)\n", " x = self.add_0([x, inputs])\n", "\n", " # MLP block\n", " y = self.norm_1(x)\n", " y = self.dense_0(y)\n", " y = self.dense_1(y)\n", " y = self.dropout_1(y, training=training)\n", "\n", " return self.add_1([x, y])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class Decoder(Layer):\n", " def __init__(\n", " self, embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate, **kwargs\n", " ):\n", " super(Decoder, self).__init__(**kwargs)\n", "\n", " # MultiHeadAttention\n", " self.mha_0 = MultiHeadAttention(\n", " num_heads=num_heads,\n", " key_dim=embed_dim,\n", " dropout=attention_dropout_rate,\n", " kernel_initializer=TruncatedNormal(stddev=0.02),\n", " attention_axes=(1, 2), # 2D attention (timestep, patch)\n", " )\n", " self.mha_1 = MultiHeadAttention(\n", " num_heads=num_heads,\n", " key_dim=embed_dim,\n", " dropout=attention_dropout_rate,\n", " kernel_initializer=TruncatedNormal(stddev=0.02),\n", " attention_axes=(1, 2), # 2D attention (timestep, patch)\n", " )\n", "\n", " # Point wise feed forward network\n", " self.dense_0 = Dense(\n", " units=mlp_dim,\n", " activation=\"gelu\",\n", " kernel_initializer=TruncatedNormal(stddev=0.02),\n", " )\n", " self.dense_1 = Dense(\n", " units=embed_dim, kernel_initializer=TruncatedNormal(stddev=0.02)\n", " )\n", "\n", " self.dropout_0 = Dropout(rate=dropout_rate)\n", " self.dropout_1 = Dropout(rate=dropout_rate)\n", " self.dropout_2 = Dropout(rate=dropout_rate)\n", "\n", " self.norm_0 = LayerNormalization(epsilon=1e-12)\n", " self.norm_1 = LayerNormalization(epsilon=1e-12)\n", " self.norm_2 = LayerNormalization(epsilon=1e-12)\n", "\n", " self.add_0 = Add()\n", " self.add_1 = Add()\n", " self.add_2 = Add()\n", "\n", " def call(self, inputs, enc_output, training):\n", " # Attention block\n", " x = self.norm_0(inputs)\n", " x = self.mha_0(\n", " query=x,\n", " key=x,\n", " value=x,\n", " training=training,\n", " )\n", " x = self.dropout_0(x, training=training)\n", " x = self.add_0([x, inputs])\n", "\n", " # Attention block\n", " y = self.norm_1(x)\n", " y = self.mha_1(\n", " query=y,\n", " key=enc_output,\n", " value=enc_output,\n", " training=training,\n", " )\n", " y = self.dropout_1(y, training=training)\n", " y = self.add_1([x, y])\n", "\n", " # MLP block\n", " z = self.norm_2(y)\n", " z = self.dense_0(z)\n", " z = self.dense_1(z)\n", " z = self.dropout_2(z, training=training)\n", "\n", " return self.add_2([y, z])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transformer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class DailyTransformer(Model):\n", " def __init__(\n", " self,\n", " num_encoder_layers,\n", " num_decoder_layers,\n", " embed_dim,\n", " mlp_dim,\n", " num_heads,\n", " num_outputs,\n", " dropout_rate,\n", " attention_dropout_rate,\n", " **kwargs\n", " ):\n", " super(DailyTransformer, self).__init__(**kwargs)\n", "\n", " # Input (normalization of RAW measurements)\n", " self.input_norm_enc = Normalization(invert=False)\n", " self.input_norm_dec1 = Normalization(invert=False)\n", " self.input_norm_dec2 
= Normalization(invert=True)\n", "\n", "        # Input\n", "        self.pos_embs_0 = PositionalEmbedding(embed_dim, dropout_rate)\n", "        self.pos_embs_1 = PositionalEmbedding(embed_dim, dropout_rate)\n", "\n", "        # Encoder\n", "        self.enc_layers = [\n", "            Encoder(embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate)\n", "            for _ in range(num_encoder_layers)\n", "        ]\n", "        self.norm_0 = LayerNormalization(epsilon=1e-12)\n", "\n", "        # Decoder\n", "        self.dec_layers = [\n", "            Decoder(embed_dim, mlp_dim, num_heads, dropout_rate, attention_dropout_rate)\n", "            for _ in range(num_decoder_layers)\n", "        ]\n", "        self.norm_1 = LayerNormalization(epsilon=1e-12)\n", "\n", "        # Output\n", "        self.final_layer = Dense(\n", "            units=num_outputs,\n", "            kernel_initializer=TruncatedNormal(stddev=0.02),\n", "        )\n", "\n", "    def call(self, inputs, training):\n", "        inputs, targets = inputs\n", "\n", "        # Encoder input\n", "        x_e = self.input_norm_enc(inputs)\n", "        x_e = self.pos_embs_0(x_e, training=training)\n", "\n", "        # Encoder\n", "        for layer in self.enc_layers:\n", "            x_e = layer(x_e, training=training)\n", "        x_e = self.norm_0(x_e)\n", "\n", "        # Decoder input\n", "        x_d = self.input_norm_dec1(targets)\n", "        x_d = self.pos_embs_1(x_d, training=training)\n", "\n", "        # Decoder\n", "        for layer in self.dec_layers:\n", "            x_d = layer(x_d, x_e, training=training)\n", "        x_d = self.norm_1(x_d)\n", "\n", "        # Output\n", "        final_output = self.final_layer(x_d)\n", "        final_output = self.input_norm_dec2(final_output)\n", "\n", "        return final_output\n", "\n", "    def train_step(self, inputs):\n", "        inputs, targets = inputs\n", "        # Teacher forcing: the decoder is fed the target sequence without its\n", "        # last step; the labels are the irradiance channel shifted one step ahead.\n", "        inputs = inputs[:, :-1]\n", "        targets_inputs = targets[:, :-1]\n", "        targets_real = targets[:, 1:, :, -1:]\n", "\n", "        with tf.GradientTape() as tape:\n", "            y_pred = self([inputs, targets_inputs], training=True)\n", "            loss = self.compiled_loss(targets_real, y_pred, regularization_losses=self.losses)\n", "\n", "        # Compute gradients\n", "        trainable_vars = self.trainable_variables\n", "        gradients = tape.gradient(loss, trainable_vars)\n", "\n", "        # Update weights\n", "        self.optimizer.apply_gradients(zip(gradients, trainable_vars))\n", "\n", "        # Update metrics (includes the metric that tracks the loss)\n", "        self.compiled_metrics.update_state(targets_real[:, -1], y_pred[:, -1])\n", "\n", "        # Return a dict mapping metric names to current value\n", "        return {m.name: m.result() for m in self.metrics}\n", "\n", "    def test_step(self, inputs):\n", "        inputs, targets = inputs\n", "        inputs = inputs[:, :-1]\n", "        targets_inputs = targets[:, :-1]\n", "        targets_real = targets[:, 1:, :, -1:]\n", "\n", "        # Compute predictions\n", "        y_pred = self([inputs, targets_inputs], training=False)\n", "\n", "        # Updates the metrics tracking the loss\n", "        self.compiled_loss(targets_real, y_pred, regularization_losses=self.losses)\n", "\n", "        # Update the metrics\n", "        self.compiled_metrics.update_state(targets_real[:, -1], y_pred[:, -1])\n", "\n", "        # Return a dict mapping metric names to current value\n", "        # Note that it will include the loss (tracked in self.metrics)\n", "        return {m.name: m.result() for m in self.metrics}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Simulator" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class Simulator(tf.Module):\n", "    def __init__(self, transformer):\n", "        super().__init__()\n", "        self.transformer = transformer\n", "        self.pi = tf.constant(np.pi)\n", "\n", "    def __call__(self, inputs, horizon_length):\n",
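"        # Autoregressive rollout: each iteration appends the latest one-day\n", "        # prediction (with freshly recomputed day-of-year sin/cos features) to\n", "        # the decoder input, so the next day is predicted from the known history\n", "        # plus the days already predicted.\n",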
" inputs, targets = inputs\n", " output_array = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)\n", "\n", " for i in tf.range(horizon_length):\n", " tar = targets[:, i:]\n", " #print(\"target_old:\", tar[0])\n", " \n", " # Concatenate history with the predicted future\n", " if i > 0:\n", " output = tf.transpose(output_array.stack(), perm=[1, 0, 2, 3])\n", " if i > tf.shape(inputs)[1]:\n", " tar = tf.concat([tar, output[:, (i - tf.shape(inputs)[1]):]], axis=1)\n", " else:\n", " tar = tf.concat([tar, output], axis=1)\n", " #print(\"target_new[\", i, \"]:\", tar[0])\n", "\n", " #print(\"day sin/cos_OLD:\", tar[0, -1, 0, :-1])\n", "\n", " day = (tf.atan2(tar[:, -1, :, 0], tar[:, -1, :, 1]) * 183.0) / self.pi\n", " day = tf.round(tf.where(day > 0, day, day + 366))\n", " \n", " day_sin = tf.expand_dims(tf.sin(2.0 * self.pi * (day + 1) / 366.0), axis=-1)\n", " day_cos = tf.expand_dims(tf.cos(2.0 * self.pi * (day + 1) / 366.0), axis=-1)\n", "\n", " #print(\"day: \", day)\n", " #print(\"day sin/cos_NEW:\", day_sin[0], day_cos[0])\n", "\n", " predictions = self.transformer([inputs, tar], training=False)\n", " #print(\"predictions: \", predictions[0])\n", "\n", " if i == 0:\n", " zero_predictions = predictions[:, :-1]\n", "\n", " # concatentate the prediction to the output which is given to the decoder as its input\n", " output_array = output_array.write(i, tf.concat([day_sin, day_cos, predictions[:, -1]], axis=-1))\n", "\n", " output = tf.transpose(output_array.stack(), perm=[1, 0, 2, 3])\n", " #print(output.shape)\n", "\n", " return tf.concat([zero_predictions, output[:, :, :, -1:]], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_X = pd.read_csv(\"./dataset/1984_2022/X_all_daily.csv\")\n", "df_y_daily = pd.read_csv(\"./dataset/1984_2022/y_all_daily.csv\")\n", "\n", "num_of_patches = df_X['Name'].nunique()\n", "\n", "df_X = df_X.drop(\n", " columns=['DateTime', 'Name', 'Latitude', 'Longitude'] +\n", " [c for c in df_X.columns if c[:9] == 'WindSpeed'] +\n", " [c for c in df_X.columns if c[:12] == 'WindSpeedMin'] +\n", " [c for c in df_X.columns if c[:12] == 'WindSpeedMax'] +\n", " [c for c in df_X.columns if c[:13] == 'WindDirection']\n", ")\n", "df_y_daily = df_y_daily.drop(\n", " columns=['DateTime', 'Name', 'Latitude', 'Longitude'] +\n", " [c for c in df_y_daily.columns if c[:9] == 'WindSpeed'] +\n", " [c for c in df_y_daily.columns if c[:12] == 'WindSpeedMin'] +\n", " [c for c in df_y_daily.columns if c[:12] == 'WindSpeedMax'] +\n", " [c for c in df_y_daily.columns if c[:13] == 'WindDirection']\n", ")\n", "\n", "loc_names = [\n", " \"54 MW PV SOLAR POWER PLANT\",\n", " \"5MW Solar Power Plant Varroc\",\n", " \"Adani Green Energy Tamilnadu Limited\",\n", " \"Arete Elena Energy Pvt Ltd\",\n", " \"Bitta Solar Power Plant\",\n", " \"Charanka Solar Park\",\n", " \"Chennai Metropolitan Area\",\n", " \"Ctrls Data Center Mumbai\",\n", " \"Indira Paryavaran Bhawan\",\n", " \"Kurnool Ultra Mega Solar Park\",\n", " \"Pavagada Solar Park\",\n", " \"Rewa Ultra Mega Solar\",\n", " \"Solar Power Plant Chandasar\",\n", " \"Solar Power Plant Khera Silajit\",\n", " \"Solar power plant Koppal\",\n", " \"Target 1\",\n", " \"Target 2\",\n", " \"Welspun Solar MP project\",\n", "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(df_X.head())\n", "print(df_y_daily.head())" ] }, { "cell_type": "code", "execution_count": 
null, "metadata": {}, "outputs": [], "source": [ "def make_dataset(data, sequence_length, sequence_stride, sampling_rate):\n", " def make_window(data):\n", " dataset = tf.data.Dataset.from_tensor_slices(data)\n", " dataset = dataset.window(sequence_length, shift=sequence_stride, stride=sampling_rate, drop_remainder=True)\n", " dataset = dataset.flat_map(lambda x: x.batch(sequence_length, drop_remainder=True)) \n", " return dataset\n", "\n", " data = np.array(data, dtype=np.float32)\n", " data = np.reshape(data, (-1, num_of_patches, data.shape[-1]))\n", "\n", " # Split the data\n", " # (80%, 10%, 10%)\n", " n = data.shape[0]\n", " n_train = int(n*0.8)\n", " n_val = int(n*0.9)\n", " train_data = data[0:n_train]\n", " val_data = data[n_train:n_val]\n", " test_data = data[n_val:]\n", "\n", " return (\n", " (n_train, make_window(train_data)),\n", " (n_val - n_train, make_window(val_data)),\n", " make_window(test_data)\n", " )\n", "\n", "def merge_dataset(datasets, batch_size, shuffle):\n", " dataset = tf.data.Dataset.zip(datasets)\n", " dataset = dataset.prefetch(tf.data.AUTOTUNE)\n", "\n", " if shuffle:\n", " # Shuffle locally at each iteration\n", " dataset = dataset.shuffle(buffer_size=1000)\n", " dataset = dataset.batch(batch_size)\n", " \n", " return dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Simulation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "horizon = 7\n", "window_size = 7\n", "batch_size = 32\n", "\n", "_, _, test_X_ds = make_dataset(df_X, (window_size + horizon), 1, 1)\n", "_, _, test_y_daily_ds = make_dataset(df_y_daily, (window_size + horizon), 1, 1)\n", "\n", "test_ds = merge_dataset(\n", " (test_X_ds, test_y_daily_ds),\n", " batch_size,\n", " shuffle=False,\n", ")\n", "\n", "daily_model = DailyTransformer(\n", " attention_dropout_rate=0.25,\n", " dropout_rate=0.15,\n", " embed_dim=64,\n", " mlp_dim=256,\n", " num_decoder_layers=6,\n", " num_encoder_layers=3,\n", " num_heads=6,\n", " num_outputs=1,\n", ")\n", "daily_model.build([(None, window_size, num_of_patches, 302), (None, window_size, num_of_patches, 3)])\n", "daily_model.load_weights(\"./models/model-best.h5\")\n", "simulator = Simulator(daily_model)\n", "\n", "print(daily_model.input_norm_enc.variables)\n", "print(daily_model.input_norm_dec1.variables)\n", "print(daily_model.input_norm_dec2.variables)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "patch_similarity_plot(daily_model.pos_embs_0.spatial_position[0, 0])\n", "patch_similarity_plot(daily_model.pos_embs_1.spatial_position[0, 0])\n", "\n", "timestep_similarity_plot(daily_model.pos_embs_0.temporal_position[0, :, 0])\n", "timestep_similarity_plot(daily_model.pos_embs_1.temporal_position[0, :, 0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "metrics = [MeanSquaredError(), RootMeanSquaredError(), MeanAbsoluteError(), MeanAbsolutePercentageError(), RSquare()]\n", "\n", "# Location 1 = 15 (64.67 % na 4 dni), (80.6 % na 1 den)\n", "# Location 2 = 16 (69.8 % na 4 dni), (83.67 % na 1 den)\n", "\n", "# Chennai = 6 (69.8 % na 4 dni), (83.67 % na 1 den)\n", "# Mumbai = 7 (69.8 % na 4 dni), (83.67 % na 1 den)\n", "\n", "for loc in range(num_of_patches):\n", " print(\"Location: \", loc_names[loc])\n", " print(\"-----------------------------------------------------\")\n", " for inputs in test_ds:\n", " inputs, targets = 
inputs\n", " inputs = inputs[:, :-horizon]\n", " targets_inputs = targets[:, :-horizon]\n", " targets_real = targets[:, 1:, loc, -1:]\n", "\n", " #y_pred = daily_model([inputs, targets_inputs], training=False)\n", " y_pred = simulator([inputs, targets_inputs], horizon_length=horizon)\n", "\n", " # Update the metrics\n", " for m in metrics:\n", " m.update_state(targets_real, y_pred[:, :, loc, -1:])\n", "\n", " # visualize the last results\n", " plot_prediction(targets, y_pred)\n", "\n", " print({m.name: m.result() for m in metrics}, \"\\n\")\n", " for m in metrics:\n", " m.reset_states()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.10 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.10" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "9185113d2128201d66faecd4f34fb34e89a635073a034991399523e584519355" } } }, "nbformat": 4, "nbformat_minor": 2 }