Commit
·
0bcc91b
1
Parent(s):
293c45c
Upload text_sentiment_analysis_blog_notebook.ipynb
Browse files
text_sentiment_analysis_blog_notebook.ipynb
CHANGED
@@ -2768,7 +2768,7 @@
|
|
2768 |
},
|
2769 |
{
|
2770 |
"cell_type": "code",
|
2771 |
-
"execution_count":
|
2772 |
"metadata": {
|
2773 |
"colab": {
|
2774 |
"base_uri": "https://localhost:8080/"
|
@@ -2893,7 +2893,8 @@
|
|
2893 |
{
|
2894 |
"cell_type": "markdown",
|
2895 |
"source": [
|
2896 |
-
"# Get and process Dataset"
|
|
|
2897 |
],
|
2898 |
"metadata": {
|
2899 |
"id": "1f0WYksXHPAM"
|
@@ -3003,7 +3004,7 @@
|
|
3003 |
"id": "mqwYf9xWHUuh",
|
3004 |
"outputId": "66965e88-8115-4d73-db90-6706371a0654"
|
3005 |
},
|
3006 |
-
"execution_count":
|
3007 |
"outputs": [
|
3008 |
{
|
3009 |
"output_type": "display_data",
|
@@ -3155,7 +3156,7 @@
|
|
3155 |
"metadata": {
|
3156 |
"id": "GuCNFWV_HXlf"
|
3157 |
},
|
3158 |
-
"execution_count":
|
3159 |
"outputs": []
|
3160 |
},
|
3161 |
{
|
@@ -3169,7 +3170,7 @@
|
|
3169 |
"metadata": {
|
3170 |
"id": "NiSQuUUgHdLn"
|
3171 |
},
|
3172 |
-
"execution_count":
|
3173 |
"outputs": []
|
3174 |
},
|
3175 |
{
|
@@ -3181,7 +3182,7 @@
|
|
3181 |
"metadata": {
|
3182 |
"id": "W_50sqXGHetc"
|
3183 |
},
|
3184 |
-
"execution_count":
|
3185 |
"outputs": []
|
3186 |
},
|
3187 |
{
|
@@ -3197,7 +3198,7 @@
|
|
3197 |
"id": "TntoBCAvHg0D",
|
3198 |
"outputId": "3820572d-2293-4b08-b9c5-7a6953c19ce8"
|
3199 |
},
|
3200 |
-
"execution_count":
|
3201 |
"outputs": [
|
3202 |
{
|
3203 |
"output_type": "execute_result",
|
@@ -3385,7 +3386,7 @@
|
|
3385 |
"metadata": {
|
3386 |
"id": "asZstARkHjz_"
|
3387 |
},
|
3388 |
-
"execution_count":
|
3389 |
"outputs": []
|
3390 |
},
|
3391 |
{
|
@@ -3403,7 +3404,7 @@
|
|
3403 |
"id": "qPKqxIVwHu_9",
|
3404 |
"outputId": "112271bf-d4db-47cd-9410-b92482488dd2"
|
3405 |
},
|
3406 |
-
"execution_count":
|
3407 |
"outputs": [
|
3408 |
{
|
3409 |
"output_type": "execute_result",
|
@@ -3575,7 +3576,7 @@
|
|
3575 |
"id": "QwE34N-1Hw4O",
|
3576 |
"outputId": "7a0ec80d-1c2e-497b-bc3a-a5eceae4fc11"
|
3577 |
},
|
3578 |
-
"execution_count":
|
3579 |
"outputs": [
|
3580 |
{
|
3581 |
"output_type": "execute_result",
|
@@ -3601,7 +3602,7 @@
|
|
3601 |
"id": "R-bQ7kIfH4Lr",
|
3602 |
"outputId": "1514f226-26dc-4ed4-adbc-edfef2c25e31"
|
3603 |
},
|
3604 |
-
"execution_count":
|
3605 |
"outputs": [
|
3606 |
{
|
3607 |
"output_type": "execute_result",
|
@@ -3629,7 +3630,7 @@
|
|
3629 |
"id": "vZ91HtgCH5-B",
|
3630 |
"outputId": "45ffe43b-9718-421d-c8d0-8fc2be5d9c5c"
|
3631 |
},
|
3632 |
-
"execution_count":
|
3633 |
"outputs": [
|
3634 |
{
|
3635 |
"output_type": "stream",
|
@@ -3651,7 +3652,8 @@
|
|
3651 |
{
|
3652 |
"cell_type": "markdown",
|
3653 |
"source": [
|
3654 |
-
"# Limit dataset for quick training"
|
|
|
3655 |
],
|
3656 |
"metadata": {
|
3657 |
"id": "6PQddaR0H-g-"
|
@@ -3666,7 +3668,7 @@
|
|
3666 |
"metadata": {
|
3667 |
"id": "DrtZRy3gH8IC"
|
3668 |
},
|
3669 |
-
"execution_count":
|
3670 |
"outputs": []
|
3671 |
},
|
3672 |
{
|
@@ -3681,7 +3683,8 @@
|
|
3681 |
{
|
3682 |
"cell_type": "markdown",
|
3683 |
"source": [
|
3684 |
-
"# Train test split"
|
|
|
3685 |
],
|
3686 |
"metadata": {
|
3687 |
"id": "bUEBfxgkIHQv"
|
@@ -3698,7 +3701,7 @@
|
|
3698 |
"metadata": {
|
3699 |
"id": "W7qXhZvdIJni"
|
3700 |
},
|
3701 |
-
"execution_count":
|
3702 |
"outputs": []
|
3703 |
},
|
3704 |
{
|
@@ -3716,7 +3719,7 @@
|
|
3716 |
"id": "fxqHXivAILVc",
|
3717 |
"outputId": "d5ce0660-92b9-445a-9e26-52445a7c1bb7"
|
3718 |
},
|
3719 |
-
"execution_count":
|
3720 |
"outputs": [
|
3721 |
{
|
3722 |
"output_type": "stream",
|
@@ -3739,7 +3742,7 @@
|
|
3739 |
"metadata": {
|
3740 |
"id": "c4a1lNf2INRx"
|
3741 |
},
|
3742 |
-
"execution_count":
|
3743 |
"outputs": []
|
3744 |
},
|
3745 |
{
|
@@ -3755,6 +3758,7 @@
|
|
3755 |
"cell_type": "markdown",
|
3756 |
"source": [
|
3757 |
"# Pre process steps \n",
|
|
|
3758 |
"\n",
|
3759 |
"1. Stemming and Lemmatization\n",
|
3760 |
"2. Tokenizer\n",
|
@@ -3810,7 +3814,7 @@
|
|
3810 |
"metadata": {
|
3811 |
"id": "wrs4BuXGIaQP"
|
3812 |
},
|
3813 |
-
"execution_count":
|
3814 |
"outputs": []
|
3815 |
},
|
3816 |
{
|
@@ -3838,7 +3842,7 @@
|
|
3838 |
"metadata": {
|
3839 |
"id": "z4bO5f5KIinV"
|
3840 |
},
|
3841 |
-
"execution_count":
|
3842 |
"outputs": []
|
3843 |
},
|
3844 |
{
|
@@ -3865,7 +3869,7 @@
|
|
3865 |
"id": "gWpOtBFtIqnm",
|
3866 |
"outputId": "d4a8c94c-65c8-4f7a-9248-071d0d427772"
|
3867 |
},
|
3868 |
-
"execution_count":
|
3869 |
"outputs": [
|
3870 |
{
|
3871 |
"output_type": "execute_result",
|
@@ -3896,7 +3900,7 @@
|
|
3896 |
"metadata": {
|
3897 |
"id": "kropkedQI0zE"
|
3898 |
},
|
3899 |
-
"execution_count":
|
3900 |
"outputs": []
|
3901 |
},
|
3902 |
{
|
@@ -3927,7 +3931,7 @@
|
|
3927 |
"id": "x0Kp2C5LI9nP",
|
3928 |
"outputId": "b13990c0-8aaf-42a2-ee6d-7122bf0f72a7"
|
3929 |
},
|
3930 |
-
"execution_count":
|
3931 |
"outputs": [
|
3932 |
{
|
3933 |
"output_type": "execute_result",
|
@@ -3950,7 +3954,8 @@
|
|
3950 |
{
|
3951 |
"cell_type": "markdown",
|
3952 |
"source": [
|
3953 |
-
"# Create Model"
|
|
|
3954 |
],
|
3955 |
"metadata": {
|
3956 |
"id": "OnZDEGl2JJ2c"
|
@@ -3970,7 +3975,7 @@
|
|
3970 |
"metadata": {
|
3971 |
"id": "8ENmS8h0JHcm"
|
3972 |
},
|
3973 |
-
"execution_count":
|
3974 |
"outputs": []
|
3975 |
},
|
3976 |
{
|
@@ -3992,7 +3997,7 @@
|
|
3992 |
"id": "7sByLvCUJQZP",
|
3993 |
"outputId": "16daa2e5-3955-4885-d8bc-84f05399bd52"
|
3994 |
},
|
3995 |
-
"execution_count":
|
3996 |
"outputs": [
|
3997 |
{
|
3998 |
"output_type": "stream",
|
@@ -4023,7 +4028,8 @@
|
|
4023 |
{
|
4024 |
"cell_type": "markdown",
|
4025 |
"source": [
|
4026 |
-
"# Train Model"
|
|
|
4027 |
],
|
4028 |
"metadata": {
|
4029 |
"id": "CxpgHz4AJZm1"
|
@@ -4041,7 +4047,7 @@
|
|
4041 |
"id": "m5nEI4mUJV0C",
|
4042 |
"outputId": "7fb00a14-e2c2-41ee-b0b1-0d9e7325937a"
|
4043 |
},
|
4044 |
-
"execution_count":
|
4045 |
"outputs": [
|
4046 |
{
|
4047 |
"output_type": "stream",
|
@@ -4105,7 +4111,7 @@
|
|
4105 |
"source": [
|
4106 |
"# Evaluate model \n",
|
4107 |
"\n",
|
4108 |
-
"A very bad case of overfitting happening in this trained model because of the limited data I used. Can be improved by increasing training data and tuning other parameters."
|
4109 |
],
|
4110 |
"metadata": {
|
4111 |
"id": "J69DUun-JtTH"
|
@@ -4136,7 +4142,7 @@
|
|
4136 |
"id": "I0bm7lZLJwZH",
|
4137 |
"outputId": "f92ef16a-dac1-454b-ceb6-d9077dea4d59"
|
4138 |
},
|
4139 |
-
"execution_count":
|
4140 |
"outputs": [
|
4141 |
{
|
4142 |
"output_type": "execute_result",
|
@@ -4163,7 +4169,8 @@
|
|
4163 |
{
|
4164 |
"cell_type": "markdown",
|
4165 |
"source": [
|
4166 |
-
"# Peform Inference"
|
|
|
4167 |
],
|
4168 |
"metadata": {
|
4169 |
"id": "AWOv0GINKJGj"
|
@@ -4189,7 +4196,7 @@
|
|
4189 |
"id": "lF-CiYmbJ1Eb",
|
4190 |
"outputId": "c70f4f33-a9b7-4338-83e8-5f3357cbd787"
|
4191 |
},
|
4192 |
-
"execution_count":
|
4193 |
"outputs": [
|
4194 |
{
|
4195 |
"output_type": "stream",
|
@@ -4204,7 +4211,8 @@
|
|
4204 |
{
|
4205 |
"cell_type": "markdown",
|
4206 |
"source": [
|
4207 |
-
"# Save the Model files"
|
|
|
4208 |
],
|
4209 |
"metadata": {
|
4210 |
"id": "_GfgPLt6KQRn"
|
@@ -4250,7 +4258,7 @@
|
|
4250 |
"metadata": {
|
4251 |
"id": "tVlsCgDMKkom"
|
4252 |
},
|
4253 |
-
"execution_count":
|
4254 |
"outputs": []
|
4255 |
},
|
4256 |
{
|
|
|
2768 |
},
|
2769 |
{
|
2770 |
"cell_type": "code",
|
2771 |
+
"execution_count": null,
|
2772 |
"metadata": {
|
2773 |
"colab": {
|
2774 |
"base_uri": "https://localhost:8080/"
|
|
|
2893 |
{
|
2894 |
"cell_type": "markdown",
|
2895 |
"source": [
|
2896 |
+
"# Get and process Dataset\n",
|
2897 |
+
"Downloading and loading the dataset from Hugging Face. The dataset package is used to get the dataset."
|
2898 |
],
|
2899 |
"metadata": {
|
2900 |
"id": "1f0WYksXHPAM"
|
|
|
3004 |
"id": "mqwYf9xWHUuh",
|
3005 |
"outputId": "66965e88-8115-4d73-db90-6706371a0654"
|
3006 |
},
|
3007 |
+
"execution_count": null,
|
3008 |
"outputs": [
|
3009 |
{
|
3010 |
"output_type": "display_data",
|
|
|
3156 |
"metadata": {
|
3157 |
"id": "GuCNFWV_HXlf"
|
3158 |
},
|
3159 |
+
"execution_count": null,
|
3160 |
"outputs": []
|
3161 |
},
|
3162 |
{
|
|
|
3170 |
"metadata": {
|
3171 |
"id": "NiSQuUUgHdLn"
|
3172 |
},
|
3173 |
+
"execution_count": null,
|
3174 |
"outputs": []
|
3175 |
},
|
3176 |
{
|
|
|
3182 |
"metadata": {
|
3183 |
"id": "W_50sqXGHetc"
|
3184 |
},
|
3185 |
+
"execution_count": null,
|
3186 |
"outputs": []
|
3187 |
},
|
3188 |
{
|
|
|
3198 |
"id": "TntoBCAvHg0D",
|
3199 |
"outputId": "3820572d-2293-4b08-b9c5-7a6953c19ce8"
|
3200 |
},
|
3201 |
+
"execution_count": null,
|
3202 |
"outputs": [
|
3203 |
{
|
3204 |
"output_type": "execute_result",
|
|
|
3386 |
"metadata": {
|
3387 |
"id": "asZstARkHjz_"
|
3388 |
},
|
3389 |
+
"execution_count": null,
|
3390 |
"outputs": []
|
3391 |
},
|
3392 |
{
|
|
|
3404 |
"id": "qPKqxIVwHu_9",
|
3405 |
"outputId": "112271bf-d4db-47cd-9410-b92482488dd2"
|
3406 |
},
|
3407 |
+
"execution_count": null,
|
3408 |
"outputs": [
|
3409 |
{
|
3410 |
"output_type": "execute_result",
|
|
|
3576 |
"id": "QwE34N-1Hw4O",
|
3577 |
"outputId": "7a0ec80d-1c2e-497b-bc3a-a5eceae4fc11"
|
3578 |
},
|
3579 |
+
"execution_count": null,
|
3580 |
"outputs": [
|
3581 |
{
|
3582 |
"output_type": "execute_result",
|
|
|
3602 |
"id": "R-bQ7kIfH4Lr",
|
3603 |
"outputId": "1514f226-26dc-4ed4-adbc-edfef2c25e31"
|
3604 |
},
|
3605 |
+
"execution_count": null,
|
3606 |
"outputs": [
|
3607 |
{
|
3608 |
"output_type": "execute_result",
|
|
|
3630 |
"id": "vZ91HtgCH5-B",
|
3631 |
"outputId": "45ffe43b-9718-421d-c8d0-8fc2be5d9c5c"
|
3632 |
},
|
3633 |
+
"execution_count": null,
|
3634 |
"outputs": [
|
3635 |
{
|
3636 |
"output_type": "stream",
|
|
|
3652 |
{
|
3653 |
"cell_type": "markdown",
|
3654 |
"source": [
|
3655 |
+
"# Limit dataset for quick training\n",
|
3656 |
+
"This step is done only for this blog-post example. In a real scenario, a good amount of data will be needed for training."
|
3657 |
],
|
3658 |
"metadata": {
|
3659 |
"id": "6PQddaR0H-g-"
|
|
|
3668 |
"metadata": {
|
3669 |
"id": "DrtZRy3gH8IC"
|
3670 |
},
|
3671 |
+
"execution_count": null,
|
3672 |
"outputs": []
|
3673 |
},
|
3674 |
{
|
|
|
3683 |
{
|
3684 |
"cell_type": "markdown",
|
3685 |
"source": [
|
3686 |
+
"# Train test split \n",
|
3687 |
+
"Splitting the dataset into Training and Testing sets. The Train set will be used for training and the Test one will be used for evaluating the model."
|
3688 |
],
|
3689 |
"metadata": {
|
3690 |
"id": "bUEBfxgkIHQv"
|
|
|
3701 |
"metadata": {
|
3702 |
"id": "W7qXhZvdIJni"
|
3703 |
},
|
3704 |
+
"execution_count": null,
|
3705 |
"outputs": []
|
3706 |
},
|
3707 |
{
|
|
|
3719 |
"id": "fxqHXivAILVc",
|
3720 |
"outputId": "d5ce0660-92b9-445a-9e26-52445a7c1bb7"
|
3721 |
},
|
3722 |
+
"execution_count": null,
|
3723 |
"outputs": [
|
3724 |
{
|
3725 |
"output_type": "stream",
|
|
|
3742 |
"metadata": {
|
3743 |
"id": "c4a1lNf2INRx"
|
3744 |
},
|
3745 |
+
"execution_count": null,
|
3746 |
"outputs": []
|
3747 |
},
|
3748 |
{
|
|
|
3758 |
"cell_type": "markdown",
|
3759 |
"source": [
|
3760 |
"# Pre process steps \n",
|
3761 |
+
"For efficient training, the dataset needs to be pre-processed to get better results. Below are the steps I am handling here.\n",
|
3762 |
"\n",
|
3763 |
"1. Stemming and Lemmatization\n",
|
3764 |
"2. Tokenizer\n",
|
|
|
3814 |
"metadata": {
|
3815 |
"id": "wrs4BuXGIaQP"
|
3816 |
},
|
3817 |
+
"execution_count": null,
|
3818 |
"outputs": []
|
3819 |
},
|
3820 |
{
|
|
|
3842 |
"metadata": {
|
3843 |
"id": "z4bO5f5KIinV"
|
3844 |
},
|
3845 |
+
"execution_count": null,
|
3846 |
"outputs": []
|
3847 |
},
|
3848 |
{
|
|
|
3869 |
"id": "gWpOtBFtIqnm",
|
3870 |
"outputId": "d4a8c94c-65c8-4f7a-9248-071d0d427772"
|
3871 |
},
|
3872 |
+
"execution_count": null,
|
3873 |
"outputs": [
|
3874 |
{
|
3875 |
"output_type": "execute_result",
|
|
|
3900 |
"metadata": {
|
3901 |
"id": "kropkedQI0zE"
|
3902 |
},
|
3903 |
+
"execution_count": null,
|
3904 |
"outputs": []
|
3905 |
},
|
3906 |
{
|
|
|
3931 |
"id": "x0Kp2C5LI9nP",
|
3932 |
"outputId": "b13990c0-8aaf-42a2-ee6d-7122bf0f72a7"
|
3933 |
},
|
3934 |
+
"execution_count": null,
|
3935 |
"outputs": [
|
3936 |
{
|
3937 |
"output_type": "execute_result",
|
|
|
3954 |
{
|
3955 |
"cell_type": "markdown",
|
3956 |
"source": [
|
3957 |
+
"# Create Model \n",
|
3958 |
+
"I am creating an LSTM model with a dropout layer for this example."
|
3959 |
],
|
3960 |
"metadata": {
|
3961 |
"id": "OnZDEGl2JJ2c"
|
|
|
3975 |
"metadata": {
|
3976 |
"id": "8ENmS8h0JHcm"
|
3977 |
},
|
3978 |
+
"execution_count": null,
|
3979 |
"outputs": []
|
3980 |
},
|
3981 |
{
|
|
|
3997 |
"id": "7sByLvCUJQZP",
|
3998 |
"outputId": "16daa2e5-3955-4885-d8bc-84f05399bd52"
|
3999 |
},
|
4000 |
+
"execution_count": null,
|
4001 |
"outputs": [
|
4002 |
{
|
4003 |
"output_type": "stream",
|
|
|
4028 |
{
|
4029 |
"cell_type": "markdown",
|
4030 |
"source": [
|
4031 |
+
"# Train Model \n",
|
4032 |
+
"The actual training step for the model"
|
4033 |
],
|
4034 |
"metadata": {
|
4035 |
"id": "CxpgHz4AJZm1"
|
|
|
4047 |
"id": "m5nEI4mUJV0C",
|
4048 |
"outputId": "7fb00a14-e2c2-41ee-b0b1-0d9e7325937a"
|
4049 |
},
|
4050 |
+
"execution_count": null,
|
4051 |
"outputs": [
|
4052 |
{
|
4053 |
"output_type": "stream",
|
|
|
4111 |
"source": [
|
4112 |
"# Evaluate model \n",
|
4113 |
"\n",
|
4114 |
+
"Evaluating the performance of the model. A very bad case of overfitting is happening in this trained model because of the limited data I used. It can be improved by increasing the training data and tuning other parameters."
|
4115 |
],
|
4116 |
"metadata": {
|
4117 |
"id": "J69DUun-JtTH"
|
|
|
4142 |
"id": "I0bm7lZLJwZH",
|
4143 |
"outputId": "f92ef16a-dac1-454b-ceb6-d9077dea4d59"
|
4144 |
},
|
4145 |
+
"execution_count": null,
|
4146 |
"outputs": [
|
4147 |
{
|
4148 |
"output_type": "execute_result",
|
|
|
4169 |
{
|
4170 |
"cell_type": "markdown",
|
4171 |
"source": [
|
4172 |
+
"# Perform Inference \n",
|
4173 |
+
"Here the model is being tested with some text input"
|
4174 |
],
|
4175 |
"metadata": {
|
4176 |
"id": "AWOv0GINKJGj"
|
|
|
4196 |
"id": "lF-CiYmbJ1Eb",
|
4197 |
"outputId": "c70f4f33-a9b7-4338-83e8-5f3357cbd787"
|
4198 |
},
|
4199 |
+
"execution_count": null,
|
4200 |
"outputs": [
|
4201 |
{
|
4202 |
"output_type": "stream",
|
|
|
4211 |
{
|
4212 |
"cell_type": "markdown",
|
4213 |
"source": [
|
4214 |
+
"# Save the Model files \n",
|
4215 |
+
"Using the MLEM package to save the model files for deployment."
|
4216 |
],
|
4217 |
"metadata": {
|
4218 |
"id": "_GfgPLt6KQRn"
|
|
|
4258 |
"metadata": {
|
4259 |
"id": "tVlsCgDMKkom"
|
4260 |
},
|
4261 |
+
"execution_count": null,
|
4262 |
"outputs": []
|
4263 |
},
|
4264 |
{
|