{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## initialize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import dotenv\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dotenv.load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## voices eda"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(34, 15)\n"
     ]
    }
   ],
   "source": [
    "# df = pd.read_csv('data/11labs_tts_voices.csv')\n",
    "df = pd.read_csv('data/11labs_available_tts_voices.reviewed.csv')\n",
    "df[\"age\"] = df[\"age\"].str.replace(\" \", \"_\").str.replace(\"-\", \"_\")\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['voice_id', 'name', 'preview_url', 'manual_quality_review', 'owner_id',\n",
       "       'permission_on_resource', 'is_legacy', 'is_mixed', 'accent',\n",
       "       'description', 'age', 'gender', 'category', 'language', 'descriptive'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "language\n",
       "NaN    25\n",
       "en      9\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['language'].value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "gender\n",
       "female    17\n",
       "male      17\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['gender'].value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age\n",
       "middle_aged    13\n",
       "young          11\n",
       "old            10\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['age'].value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>gender</th>\n",
       "      <th>female</th>\n",
       "      <th>male</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>age</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>middle_aged</th>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>old</th>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>young</th>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "gender       female  male\n",
       "age                      \n",
       "middle_aged       4     9\n",
       "old               5     5\n",
       "young             8     3"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby(['age', 'gender'], dropna=False)['voice_id'].count().unstack(fill_value=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>middle_aged</th>\n",
       "      <th>old</th>\n",
       "      <th>young</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>manual_quality_review</th>\n",
       "      <th>gender</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">bad</th>\n",
       "      <th>female</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>male</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">medium</th>\n",
       "      <th>female</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>male</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">ok</th>\n",
       "      <th>female</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>male</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>very bad</th>\n",
       "      <th>male</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "age                           middle_aged  old  young\n",
       "manual_quality_review gender                         \n",
       "bad                   female            1    2      1\n",
       "                      male              0    3      1\n",
       "medium                female            1    0      0\n",
       "                      male              2    1      0\n",
       "ok                    female            2    3      7\n",
       "                      male              6    1      2\n",
       "very bad              male              1    0      0"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby([\"manual_quality_review\", 'gender', \"age\"], dropna=False)[\n",
    "    \"voice_id\"\n",
    "].count().unstack(fill_value=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>age</th>\n",
       "      <th>middle_aged</th>\n",
       "      <th>old</th>\n",
       "      <th>young</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gender</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>female</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>male</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "age     middle_aged  old  young\n",
       "gender                         \n",
       "female            2    3      7\n",
       "male              6    1      2"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['manual_quality_review'].isin(['ok'])].groupby(['gender', \"age\"], dropna=False)[\n",
    "    \"voice_id\"\n",
    "].count().unstack(fill_value=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>age</th>\n",
       "      <th>middle_aged</th>\n",
       "      <th>old</th>\n",
       "      <th>young</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gender</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>female</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>male</th>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "age     middle_aged  old  young\n",
       "gender                         \n",
       "female            3    3      7\n",
       "male              8    2      2"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df[df['manual_quality_review'].isin(['ok', 'medium'])].groupby(['gender', \"age\"], dropna=False)[\n",
    "    \"voice_id\"\n",
    "].count().unstack(fill_value=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gender</th>\n",
       "      <th>female</th>\n",
       "      <th>male</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>language</th>\n",
       "      <th>age</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">en</th>\n",
       "      <th>middle_aged</th>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>young</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">NaN</th>\n",
       "      <th>middle_aged</th>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>old</th>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>young</th>\n",
       "      <td>6</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "gender                female  male\n",
       "language age                      \n",
       "en       middle_aged       1     5\n",
       "         young             2     1\n",
       "NaN      middle_aged       3     4\n",
       "         old               5     5\n",
       "         young             6     2"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby(['language', 'age', 'gender'], dropna=False)['voice_id'].count().unstack(fill_value=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "descriptive\n",
       "pleasant        6\n",
       "casual          5\n",
       "confident       3\n",
       "calm            3\n",
       "NaN             3\n",
       "intense         3\n",
       "chill           2\n",
       "formal          1\n",
       "serious         1\n",
       "mature          1\n",
       "cute            1\n",
       "crisp           1\n",
       "upbeat          1\n",
       "professional    1\n",
       "excited         1\n",
       "wise            1\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['descriptive'].value_counts(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(39, 14)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "age_group = 'old'\n",
    "gender = 'male'\n",
    "df_filtered = df[(df['age'] == age_group) & (df['gender'] == gender)]\n",
    "df_filtered.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>voice_id</th>\n",
       "      <th>name</th>\n",
       "      <th>preview_url</th>\n",
       "      <th>owner_id</th>\n",
       "      <th>permission_on_resource</th>\n",
       "      <th>is_legacy</th>\n",
       "      <th>is_mixed</th>\n",
       "      <th>accent</th>\n",
       "      <th>description</th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>category</th>\n",
       "      <th>language</th>\n",
       "      <th>descriptive</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>245</th>\n",
       "      <td>ugI9yHu7QMtMOjozITa3</td>\n",
       "      <td>Nimbus - deep &amp; meditative</td>\n",
       "      <td>https://storage.googleapis.com/eleven-public-p...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>admin</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>american</td>\n",
       "      <td>NaN</td>\n",
       "      <td>old</td>\n",
       "      <td>male</td>\n",
       "      <td>entertainment_tv</td>\n",
       "      <td>NaN</td>\n",
       "      <td>neutral</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>284</th>\n",
       "      <td>1SJjcjy45jFu6erSHVWq</td>\n",
       "      <td>Howard - American Radio Voice</td>\n",
       "      <td>https://storage.googleapis.com/eleven-public-p...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>admin</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>american</td>\n",
       "      <td>NaN</td>\n",
       "      <td>old</td>\n",
       "      <td>male</td>\n",
       "      <td>advertisement</td>\n",
       "      <td>NaN</td>\n",
       "      <td>modulated</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>362</th>\n",
       "      <td>oUAzGw71wG6JCbHMK33s</td>\n",
       "      <td>Mark - calm and wise teacher</td>\n",
       "      <td>https://storage.googleapis.com/eleven-public-p...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>admin</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>british</td>\n",
       "      <td>NaN</td>\n",
       "      <td>old</td>\n",
       "      <td>male</td>\n",
       "      <td>informative_educational</td>\n",
       "      <td>NaN</td>\n",
       "      <td>deep</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 voice_id                           name  \\\n",
       "245  ugI9yHu7QMtMOjozITa3     Nimbus - deep & meditative   \n",
       "284  1SJjcjy45jFu6erSHVWq  Howard - American Radio Voice   \n",
       "362  oUAzGw71wG6JCbHMK33s   Mark - calm and wise teacher   \n",
       "\n",
       "                                           preview_url  owner_id  \\\n",
       "245  https://storage.googleapis.com/eleven-public-p...       NaN   \n",
       "284  https://storage.googleapis.com/eleven-public-p...       NaN   \n",
       "362  https://storage.googleapis.com/eleven-public-p...       NaN   \n",
       "\n",
       "    permission_on_resource  is_legacy  is_mixed    accent description  age  \\\n",
       "245                  admin      False     False  american         NaN  old   \n",
       "284                  admin      False     False  american         NaN  old   \n",
       "362                  admin      False     False   british         NaN  old   \n",
       "\n",
       "    gender                 category language descriptive  \n",
       "245   male         entertainment_tv      NaN     neutral  \n",
       "284   male            advertisement      NaN   modulated  \n",
       "362   male  informative_educational      NaN        deep  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_filtered.sample(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['HrciSEXYMv69BAJ4ixOW', 'oUAzGw71wG6JCbHMK33s', 'Zl8mecngHM53e1hl151S']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_filtered.sample(3)['voice_id'].to_list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ai-audio-books",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}