S-MurilloG commited on
Commit
ffe4842
·
1 Parent(s): a89a71e

Data Cleaning & Preparation

Browse files
CARSE_00_Cleaning.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
@@ -21,7 +21,7 @@
21
  },
22
  {
23
  "cell_type": "code",
24
- "execution_count": 2,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
@@ -64,7 +64,7 @@
64
  },
65
  {
66
  "cell_type": "code",
67
- "execution_count": 3,
68
  "metadata": {},
69
  "outputs": [
70
  {
@@ -115,7 +115,7 @@
115
  },
116
  {
117
  "cell_type": "code",
118
- "execution_count": 4,
119
  "metadata": {},
120
  "outputs": [],
121
  "source": [
@@ -152,7 +152,7 @@
152
  },
153
  {
154
  "cell_type": "code",
155
- "execution_count": 5,
156
  "metadata": {},
157
  "outputs": [
158
  {
@@ -209,7 +209,7 @@
209
  },
210
  {
211
  "cell_type": "code",
212
- "execution_count": 6,
213
  "metadata": {},
214
  "outputs": [],
215
  "source": [
@@ -230,7 +230,7 @@
230
  },
231
  {
232
  "cell_type": "code",
233
- "execution_count": 7,
234
  "metadata": {},
235
  "outputs": [
236
  {
@@ -288,7 +288,7 @@
288
  },
289
  {
290
  "cell_type": "code",
291
- "execution_count": 8,
292
  "metadata": {},
293
  "outputs": [],
294
  "source": [
@@ -345,7 +345,7 @@
345
  },
346
  {
347
  "cell_type": "code",
348
- "execution_count": 9,
349
  "metadata": {},
350
  "outputs": [
351
  {
@@ -461,7 +461,7 @@
461
  "10 Vale mi amor, disfruta tu baño\\nSabes que me e... "
462
  ]
463
  },
464
- "execution_count": 9,
465
  "metadata": {},
466
  "output_type": "execute_result"
467
  }
@@ -505,7 +505,7 @@
505
  },
506
  {
507
  "cell_type": "code",
508
- "execution_count": 10,
509
  "metadata": {},
510
  "outputs": [],
511
  "source": [
@@ -529,7 +529,7 @@
529
  },
530
  {
531
  "cell_type": "code",
532
- "execution_count": 11,
533
  "metadata": {},
534
  "outputs": [
535
  {
@@ -589,33 +589,33 @@
589
  " <td>...</td>\n",
590
  " </tr>\n",
591
  " <tr>\n",
592
- " <th>665</th>\n",
593
- " <td>Dime algo bonito</td>\n",
594
- " <td>Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ...</td>\n",
595
  " </tr>\n",
596
  " <tr>\n",
597
- " <th>666</th>\n",
598
- " <td>Dime algo bonito</td>\n",
599
- " <td>Yo digo que tal vez el universo ha dicho que e...</td>\n",
600
  " </tr>\n",
601
  " <tr>\n",
602
- " <th>667</th>\n",
603
- " <td>Oye dime algo bonito</td>\n",
604
- " <td>Deja veo si ya me depositaste en mi cuenta y v...</td>\n",
605
  " </tr>\n",
606
  " <tr>\n",
607
- " <th>668</th>\n",
608
- " <td>Oye dime algo bonito</td>\n",
609
- " <td>Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng...</td>\n",
610
  " </tr>\n",
611
  " <tr>\n",
612
- " <th>669</th>\n",
613
- " <td>Oye dime algo bonito</td>\n",
614
- " <td>Solo si veo un depósito en mi cuenta en menos ...</td>\n",
615
  " </tr>\n",
616
  " </tbody>\n",
617
  "</table>\n",
618
- "<p>670 rows × 2 columns</p>\n",
619
  "</div>"
620
  ],
621
  "text/plain": [
@@ -626,11 +626,11 @@
626
  "3 Buenos días mi amor, espero que hayas podido d... \n",
627
  "4 Hellouuuuu, te amo mucho guapiiii, ten lindo dia \n",
628
  ".. ... \n",
629
- "665 Dime algo bonito \n",
630
- "666 Dime algo bonito \n",
631
- "667 Oye dime algo bonito \n",
632
- "668 Oye dime algo bonito \n",
633
- "669 Oye dime algo bonito \n",
634
  "\n",
635
  " Sebas \n",
636
  "0 Buenos d��as mi amorrrr\\nBien bien, pero hacía ... \n",
@@ -639,22 +639,26 @@
639
  "3 Hola mi amorcito, cómo amaneciste hoyyy???\\nTa... \n",
640
  "4 Holi mi vida, cómo estás hoy??\\nTe amo mucho m... \n",
641
  ".. ... \n",
642
- "665 Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ... \n",
643
- "666 Yo digo que tal vez el universo ha dicho que e... \n",
644
- "667 Deja veo si ya me depositaste en mi cuenta y v... \n",
645
- "668 Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng... \n",
646
- "669 Solo si veo un depósito en mi cuenta en menos ... \n",
647
  "\n",
648
- "[670 rows x 2 columns]"
649
  ]
650
  },
651
- "execution_count": 11,
652
  "metadata": {},
653
  "output_type": "execute_result"
654
  }
655
  ],
656
  "source": [
657
- "chat_df = crear_dataset(texto_sin_timestamp)\n",
 
 
 
 
658
  "chat_df"
659
  ]
660
  },
@@ -667,7 +671,7 @@
667
  },
668
  {
669
  "cell_type": "code",
670
- "execution_count": 12,
671
  "metadata": {},
672
  "outputs": [],
673
  "source": [
@@ -683,7 +687,7 @@
683
  },
684
  {
685
  "cell_type": "code",
686
- "execution_count": 13,
687
  "metadata": {},
688
  "outputs": [
689
  {
@@ -750,38 +754,38 @@
750
  " <td>...</td>\n",
751
  " </tr>\n",
752
  " <tr>\n",
753
- " <th>665</th>\n",
754
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
755
- " <td>Dime algo bonito</td>\n",
756
- " <td>Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ...</td>\n",
757
  " </tr>\n",
758
  " <tr>\n",
759
- " <th>666</th>\n",
760
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
761
- " <td>Dime algo bonito</td>\n",
762
- " <td>Yo digo que tal vez el universo ha dicho que e...</td>\n",
763
  " </tr>\n",
764
  " <tr>\n",
765
- " <th>667</th>\n",
766
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
767
- " <td>Oye dime algo bonito</td>\n",
768
- " <td>Deja veo si ya me depositaste en mi cuenta y v...</td>\n",
769
  " </tr>\n",
770
  " <tr>\n",
771
- " <th>668</th>\n",
772
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
773
- " <td>Oye dime algo bonito</td>\n",
774
- " <td>Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng...</td>\n",
775
  " </tr>\n",
776
  " <tr>\n",
777
- " <th>669</th>\n",
778
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
779
- " <td>Oye dime algo bonito</td>\n",
780
- " <td>Solo si veo un depósito en mi cuenta en menos ...</td>\n",
781
  " </tr>\n",
782
  " </tbody>\n",
783
  "</table>\n",
784
- "<p>670 rows × 3 columns</p>\n",
785
  "</div>"
786
  ],
787
  "text/plain": [
@@ -792,11 +796,11 @@
792
  "3 CARSE es un chatbot que imita el estilo en que... \n",
793
  "4 CARSE es un chatbot que imita el estilo en que... \n",
794
  ".. ... \n",
795
- "665 CARSE es un chatbot que imita el estilo en que... \n",
796
- "666 CARSE es un chatbot que imita el estilo en que... \n",
797
- "667 CARSE es un chatbot que imita el estilo en que... \n",
798
- "668 CARSE es un chatbot que imita el estilo en que... \n",
799
- "669 CARSE es un chatbot que imita el estilo en que... \n",
800
  "\n",
801
  " CarmenQ \\\n",
802
  "0 Buenos días mi amorchis, cómo dormiste hoy? \n",
@@ -805,11 +809,11 @@
805
  "3 Buenos días mi amor, espero que hayas podido d... \n",
806
  "4 Hellouuuuu, te amo mucho guapiiii, ten lindo dia \n",
807
  ".. ... \n",
808
- "665 Dime algo bonito \n",
809
- "666 Dime algo bonito \n",
810
- "667 Oye dime algo bonito \n",
811
- "668 Oye dime algo bonito \n",
812
- "669 Oye dime algo bonito \n",
813
  "\n",
814
  " Sebas \n",
815
  "0 Buenos días mi amorrrr\\nBien bien, pero hacía ... \n",
@@ -818,16 +822,16 @@
818
  "3 Hola mi amorcito, cómo amaneciste hoyyy???\\nTa... \n",
819
  "4 Holi mi vida, cómo estás hoy??\\nTe amo mucho m... \n",
820
  ".. ... \n",
821
- "665 Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ... \n",
822
- "666 Yo digo que tal vez el universo ha dicho que e... \n",
823
- "667 Deja veo si ya me depositaste en mi cuenta y v... \n",
824
- "668 Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng... \n",
825
- "669 Solo si veo un depósito en mi cuenta en menos ... \n",
826
  "\n",
827
- "[670 rows x 3 columns]"
828
  ]
829
  },
830
- "execution_count": 13,
831
  "metadata": {},
832
  "output_type": "execute_result"
833
  }
@@ -840,7 +844,7 @@
840
  },
841
  {
842
  "cell_type": "code",
843
- "execution_count": 14,
844
  "metadata": {},
845
  "outputs": [],
846
  "source": [
@@ -853,7 +857,7 @@
853
  },
854
  {
855
  "cell_type": "code",
856
- "execution_count": 15,
857
  "metadata": {},
858
  "outputs": [
859
  {
@@ -920,38 +924,38 @@
920
  " <td>...</td>\n",
921
  " </tr>\n",
922
  " <tr>\n",
923
- " <th>665</th>\n",
924
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
925
- " <td>Dime algo bonito</td>\n",
926
- " <td>Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ...</td>\n",
927
  " </tr>\n",
928
  " <tr>\n",
929
- " <th>666</th>\n",
930
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
931
- " <td>Dime algo bonito</td>\n",
932
- " <td>Yo digo que tal vez el universo ha dicho que e...</td>\n",
933
  " </tr>\n",
934
  " <tr>\n",
935
- " <th>667</th>\n",
936
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
937
- " <td>Oye dime algo bonito</td>\n",
938
- " <td>Deja veo si ya me depositaste en mi cuenta y v...</td>\n",
939
  " </tr>\n",
940
  " <tr>\n",
941
- " <th>668</th>\n",
942
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
943
- " <td>Oye dime algo bonito</td>\n",
944
- " <td>Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng...</td>\n",
945
  " </tr>\n",
946
  " <tr>\n",
947
- " <th>669</th>\n",
948
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
949
- " <td>Oye dime algo bonito</td>\n",
950
- " <td>Solo si veo un depósito en mi cuenta en menos ...</td>\n",
951
  " </tr>\n",
952
  " </tbody>\n",
953
  "</table>\n",
954
- "<p>670 rows × 3 columns</p>\n",
955
  "</div>"
956
  ],
957
  "text/plain": [
@@ -962,11 +966,11 @@
962
  "3 CARSE es un chatbot que imita el estilo en que... \n",
963
  "4 CARSE es un chatbot que imita el estilo en que... \n",
964
  ".. ... \n",
965
- "665 CARSE es un chatbot que imita el estilo en que... \n",
966
- "666 CARSE es un chatbot que imita el estilo en que... \n",
967
- "667 CARSE es un chatbot que imita el estilo en que... \n",
968
- "668 CARSE es un chatbot que imita el estilo en que... \n",
969
- "669 CARSE es un chatbot que imita el estilo en que... \n",
970
  "\n",
971
  " user \\\n",
972
  "0 Buenos días mi amorchis, cómo dormiste hoy? \n",
@@ -975,11 +979,11 @@
975
  "3 Buenos días mi amor, espero que hayas podido d... \n",
976
  "4 Hellouuuuu, te amo mucho guapiiii, ten lindo dia \n",
977
  ".. ... \n",
978
- "665 Dime algo bonito \n",
979
- "666 Dime algo bonito \n",
980
- "667 Oye dime algo bonito \n",
981
- "668 Oye dime algo bonito \n",
982
- "669 Oye dime algo bonito \n",
983
  "\n",
984
  " assistant \n",
985
  "0 Buenos días mi amorrrr\\nBien bien, pero hacía ... \n",
@@ -988,16 +992,16 @@
988
  "3 Hola mi amorcito, cómo amaneciste hoyyy???\\nTa... \n",
989
  "4 Holi mi vida, cómo estás hoy??\\nTe amo mucho m... \n",
990
  ".. ... \n",
991
- "665 Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ... \n",
992
- "666 Yo digo que tal vez el universo ha dicho que e... \n",
993
- "667 Deja veo si ya me depositaste en mi cuenta y v... \n",
994
- "668 Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng... \n",
995
- "669 Solo si veo un depósito en mi cuenta en menos ... \n",
996
  "\n",
997
- "[670 rows x 3 columns]"
998
  ]
999
  },
1000
- "execution_count": 15,
1001
  "metadata": {},
1002
  "output_type": "execute_result"
1003
  }
@@ -1010,7 +1014,7 @@
1010
  },
1011
  {
1012
  "cell_type": "code",
1013
- "execution_count": 16,
1014
  "metadata": {},
1015
  "outputs": [
1016
  {
@@ -1043,32 +1047,32 @@
1043
  " <tr>\n",
1044
  " <th>0</th>\n",
1045
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1046
- " <td>Tengo mucho sueño aún</td>\n",
1047
- " <td>Ay amorcito, a ver si te echas a dormir en el ...</td>\n",
1048
  " </tr>\n",
1049
  " <tr>\n",
1050
  " <th>1</th>\n",
1051
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1052
- " <td>Ay amor que frío hace</td>\n",
1053
- " <td>Pues que bueno, mínimo no estamos sudando</td>\n",
1054
  " </tr>\n",
1055
  " <tr>\n",
1056
  " <th>2</th>\n",
1057
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1058
- " <td>Ya adivina\\nLo que se te ocurra</td>\n",
1059
- " <td>Esque no se me ocurrió nada mas</td>\n",
1060
  " </tr>\n",
1061
  " <tr>\n",
1062
  " <th>3</th>\n",
1063
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1064
- " <td>Biennnn, pero tengo sueño aún</td>\n",
1065
- " <td>Si tienes descanso, duerme</td>\n",
1066
  " </tr>\n",
1067
  " <tr>\n",
1068
  " <th>4</th>\n",
1069
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1070
- " <td>Yo te amo massssss</td>\n",
1071
- " <td>Eso es mega feik eh</td>\n",
1072
  " </tr>\n",
1073
  " <tr>\n",
1074
  " <th>...</th>\n",
@@ -1077,38 +1081,38 @@
1077
  " <td>...</td>\n",
1078
  " </tr>\n",
1079
  " <tr>\n",
1080
- " <th>665</th>\n",
1081
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1082
  " <td>Pues ya toca casarnos</td>\n",
1083
  " <td>Por fiiiin, ya te habías tardado en decir eso</td>\n",
1084
  " </tr>\n",
1085
  " <tr>\n",
1086
- " <th>666</th>\n",
1087
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1088
  " <td>Biennnnn\\n¿Y tu?</td>\n",
1089
  " <td>Bien igual</td>\n",
1090
  " </tr>\n",
1091
  " <tr>\n",
1092
- " <th>667</th>\n",
1093
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1094
  " <td>JAJAJJAJAJA no</td>\n",
1095
  " <td>Ya dimeeeee</td>\n",
1096
  " </tr>\n",
1097
  " <tr>\n",
1098
- " <th>668</th>\n",
1099
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1100
  " <td>Casarme contigo algún día</td>\n",
1101
  " <td>Awwww amoorr, sí quiero casarme contigo algún ...</td>\n",
1102
  " </tr>\n",
1103
  " <tr>\n",
1104
- " <th>669</th>\n",
1105
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1106
  " <td>Me estaba acabando el café</td>\n",
1107
  " <td>Disfruta tu cafecito</td>\n",
1108
  " </tr>\n",
1109
  " </tbody>\n",
1110
  "</table>\n",
1111
- "<p>670 rows × 3 columns</p>\n",
1112
  "</div>"
1113
  ],
1114
  "text/plain": [
@@ -1119,42 +1123,42 @@
1119
  "3 CARSE es un chatbot que imita el estilo en que... \n",
1120
  "4 CARSE es un chatbot que imita el estilo en que... \n",
1121
  ".. ... \n",
1122
- "665 CARSE es un chatbot que imita el estilo en que... \n",
1123
- "666 CARSE es un chatbot que imita el estilo en que... \n",
1124
- "667 CARSE es un chatbot que imita el estilo en que... \n",
1125
- "668 CARSE es un chatbot que imita el estilo en que... \n",
1126
- "669 CARSE es un chatbot que imita el estilo en que... \n",
1127
  "\n",
1128
- " user \\\n",
1129
- "0 Tengo mucho sueño aún \n",
1130
- "1 Ay amor que frío hace \n",
1131
- "2 Ya adivina\\nLo que se te ocurra \n",
1132
- "3 Biennnn, pero tengo sueño aún \n",
1133
- "4 Yo te amo massssss \n",
1134
- ".. ... \n",
1135
- "665 Pues ya toca casarnos \n",
1136
- "666 Biennnnn\\n¿Y tu? \n",
1137
- "667 JAJAJJAJAJA no \n",
1138
- "668 Casarme contigo algún día \n",
1139
- "669 Me estaba acabando el café \n",
1140
  "\n",
1141
  " assistant \n",
1142
- "0 Ay amorcito, a ver si te echas a dormir en el ... \n",
1143
- "1 Pues que bueno, mínimo no estamos sudando \n",
1144
- "2 Esque no se me ocurrió nada mas \n",
1145
- "3 Si tienes descanso, duerme \n",
1146
- "4 Eso es mega feik eh \n",
1147
  ".. ... \n",
1148
- "665 Por fiiiin, ya te habías tardado en decir eso \n",
1149
- "666 Bien igual \n",
1150
- "667 Ya dimeeeee \n",
1151
- "668 Awwww amoorr, sí quiero casarme contigo algún ... \n",
1152
- "669 Disfruta tu cafecito \n",
1153
  "\n",
1154
- "[670 rows x 3 columns]"
1155
  ]
1156
  },
1157
- "execution_count": 16,
1158
  "metadata": {},
1159
  "output_type": "execute_result"
1160
  }
@@ -1174,7 +1178,7 @@
1174
  },
1175
  {
1176
  "cell_type": "code",
1177
- "execution_count": 17,
1178
  "metadata": {},
1179
  "outputs": [],
1180
  "source": [
@@ -1198,7 +1202,7 @@
1198
  },
1199
  {
1200
  "cell_type": "code",
1201
- "execution_count": 18,
1202
  "metadata": {},
1203
  "outputs": [],
1204
  "source": [
@@ -1209,14 +1213,14 @@
1209
  },
1210
  {
1211
  "cell_type": "code",
1212
- "execution_count": 19,
1213
  "metadata": {},
1214
  "outputs": [
1215
  {
1216
  "name": "stdout",
1217
  "output_type": "stream",
1218
  "text": [
1219
- "El archivo tiene 670 líneas.\n"
1220
  ]
1221
  }
1222
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 12,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
21
  },
22
  {
23
  "cell_type": "code",
24
+ "execution_count": 13,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
 
64
  },
65
  {
66
  "cell_type": "code",
67
+ "execution_count": 14,
68
  "metadata": {},
69
  "outputs": [
70
  {
 
115
  },
116
  {
117
  "cell_type": "code",
118
+ "execution_count": 15,
119
  "metadata": {},
120
  "outputs": [],
121
  "source": [
 
152
  },
153
  {
154
  "cell_type": "code",
155
+ "execution_count": 16,
156
  "metadata": {},
157
  "outputs": [
158
  {
 
209
  },
210
  {
211
  "cell_type": "code",
212
+ "execution_count": 17,
213
  "metadata": {},
214
  "outputs": [],
215
  "source": [
 
230
  },
231
  {
232
  "cell_type": "code",
233
+ "execution_count": 18,
234
  "metadata": {},
235
  "outputs": [
236
  {
 
288
  },
289
  {
290
  "cell_type": "code",
291
+ "execution_count": 19,
292
  "metadata": {},
293
  "outputs": [],
294
  "source": [
 
345
  },
346
  {
347
  "cell_type": "code",
348
+ "execution_count": 20,
349
  "metadata": {},
350
  "outputs": [
351
  {
 
461
  "10 Vale mi amor, disfruta tu baño\\nSabes que me e... "
462
  ]
463
  },
464
+ "execution_count": 20,
465
  "metadata": {},
466
  "output_type": "execute_result"
467
  }
 
505
  },
506
  {
507
  "cell_type": "code",
508
+ "execution_count": 21,
509
  "metadata": {},
510
  "outputs": [],
511
  "source": [
 
529
  },
530
  {
531
  "cell_type": "code",
532
+ "execution_count": 23,
533
  "metadata": {},
534
  "outputs": [
535
  {
 
589
  " <td>...</td>\n",
590
  " </tr>\n",
591
  " <tr>\n",
592
+ " <th>745</th>\n",
593
+ " <td>Descansaste?\\nQue soñaste?</td>\n",
594
+ " <td>No muy bien amor, la verdad es que me desperté...</td>\n",
595
  " </tr>\n",
596
  " <tr>\n",
597
+ " <th>746</th>\n",
598
+ " <td>Hay que ver una peli de miedoooo</td>\n",
599
+ " <td> amooor, vamos al cine a verla</td>\n",
600
  " </tr>\n",
601
  " <tr>\n",
602
+ " <th>747</th>\n",
603
+ " <td>Hay que venir a comer aquí</td>\n",
604
+ " <td>Cuando quieras bb, ya sabes que yo estoy super...</td>\n",
605
  " </tr>\n",
606
  " <tr>\n",
607
+ " <th>748</th>\n",
608
+ " <td>Estoy muy triste amor</td>\n",
609
+ " <td>Por qué mi amor? Cuéntame si quieres</td>\n",
610
  " </tr>\n",
611
  " <tr>\n",
612
+ " <th>749</th>\n",
613
+ " <td>Te extraño aquí en el gym :(</td>\n",
614
+ " <td>Yo también te extraño bb, pero por ahora disfr...</td>\n",
615
  " </tr>\n",
616
  " </tbody>\n",
617
  "</table>\n",
618
+ "<p>750 rows × 2 columns</p>\n",
619
  "</div>"
620
  ],
621
  "text/plain": [
 
626
  "3 Buenos días mi amor, espero que hayas podido d... \n",
627
  "4 Hellouuuuu, te amo mucho guapiiii, ten lindo dia \n",
628
  ".. ... \n",
629
+ "745 Descansaste?\\nQue soñaste? \n",
630
+ "746 Hay que ver una peli de miedoooo \n",
631
+ "747 Hay que venir a comer aquí \n",
632
+ "748 Estoy muy triste amor \n",
633
+ "749 Te extraño aquí en el gym :( \n",
634
  "\n",
635
  " Sebas \n",
636
  "0 Buenos d��as mi amorrrr\\nBien bien, pero hacía ... \n",
 
639
  "3 Hola mi amorcito, cómo amaneciste hoyyy???\\nTa... \n",
640
  "4 Holi mi vida, cómo estás hoy??\\nTe amo mucho m... \n",
641
  ".. ... \n",
642
+ "745 No muy bien amor, la verdad es que me desperté... \n",
643
+ "746 Sí amooor, vamos al cine a verla \n",
644
+ "747 Cuando quieras bb, ya sabes que yo estoy super... \n",
645
+ "748 Por qué mi amor? Cuéntame si quieres \n",
646
+ "749 Yo también te extraño bb, pero por ahora disfr... \n",
647
  "\n",
648
+ "[750 rows x 2 columns]"
649
  ]
650
  },
651
+ "execution_count": 23,
652
  "metadata": {},
653
  "output_type": "execute_result"
654
  }
655
  ],
656
  "source": [
657
+ "# Leer el contenido del archivo\n",
658
+ "with open(\"Raw_Data/Transformed_Prompts.txt\", 'r', encoding='utf-8') as archivo:\n",
659
+ " texto = archivo.read()\n",
660
+ "\n",
661
+ "chat_df = crear_dataset(texto)\n",
662
  "chat_df"
663
  ]
664
  },
 
671
  },
672
  {
673
  "cell_type": "code",
674
+ "execution_count": 24,
675
  "metadata": {},
676
  "outputs": [],
677
  "source": [
 
687
  },
688
  {
689
  "cell_type": "code",
690
+ "execution_count": 25,
691
  "metadata": {},
692
  "outputs": [
693
  {
 
754
  " <td>...</td>\n",
755
  " </tr>\n",
756
  " <tr>\n",
757
+ " <th>745</th>\n",
758
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
759
+ " <td>Descansaste?\\nQue soñaste?</td>\n",
760
+ " <td>No muy bien amor, la verdad es que me desperté...</td>\n",
761
  " </tr>\n",
762
  " <tr>\n",
763
+ " <th>746</th>\n",
764
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
765
+ " <td>Hay que ver una peli de miedoooo</td>\n",
766
+ " <td> amooor, vamos al cine a verla</td>\n",
767
  " </tr>\n",
768
  " <tr>\n",
769
+ " <th>747</th>\n",
770
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
771
+ " <td>Hay que venir a comer aquí</td>\n",
772
+ " <td>Cuando quieras bb, ya sabes que yo estoy super...</td>\n",
773
  " </tr>\n",
774
  " <tr>\n",
775
+ " <th>748</th>\n",
776
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
777
+ " <td>Estoy muy triste amor</td>\n",
778
+ " <td>Por qué mi amor? Cuéntame si quieres</td>\n",
779
  " </tr>\n",
780
  " <tr>\n",
781
+ " <th>749</th>\n",
782
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
783
+ " <td>Te extraño aquí en el gym :(</td>\n",
784
+ " <td>Yo también te extraño bb, pero por ahora disfr...</td>\n",
785
  " </tr>\n",
786
  " </tbody>\n",
787
  "</table>\n",
788
+ "<p>750 rows × 3 columns</p>\n",
789
  "</div>"
790
  ],
791
  "text/plain": [
 
796
  "3 CARSE es un chatbot que imita el estilo en que... \n",
797
  "4 CARSE es un chatbot que imita el estilo en que... \n",
798
  ".. ... \n",
799
+ "745 CARSE es un chatbot que imita el estilo en que... \n",
800
+ "746 CARSE es un chatbot que imita el estilo en que... \n",
801
+ "747 CARSE es un chatbot que imita el estilo en que... \n",
802
+ "748 CARSE es un chatbot que imita el estilo en que... \n",
803
+ "749 CARSE es un chatbot que imita el estilo en que... \n",
804
  "\n",
805
  " CarmenQ \\\n",
806
  "0 Buenos días mi amorchis, cómo dormiste hoy? \n",
 
809
  "3 Buenos días mi amor, espero que hayas podido d... \n",
810
  "4 Hellouuuuu, te amo mucho guapiiii, ten lindo dia \n",
811
  ".. ... \n",
812
+ "745 Descansaste?\\nQue soñaste? \n",
813
+ "746 Hay que ver una peli de miedoooo \n",
814
+ "747 Hay que venir a comer aquí \n",
815
+ "748 Estoy muy triste amor \n",
816
+ "749 Te extraño aquí en el gym :( \n",
817
  "\n",
818
  " Sebas \n",
819
  "0 Buenos días mi amorrrr\\nBien bien, pero hacía ... \n",
 
822
  "3 Hola mi amorcito, cómo amaneciste hoyyy???\\nTa... \n",
823
  "4 Holi mi vida, cómo estás hoy??\\nTe amo mucho m... \n",
824
  ".. ... \n",
825
+ "745 No muy bien amor, la verdad es que me desperté... \n",
826
+ "746 Sí amooor, vamos al cine a verla \n",
827
+ "747 Cuando quieras bb, ya sabes que yo estoy super... \n",
828
+ "748 Por qué mi amor? Cuéntame si quieres \n",
829
+ "749 Yo también te extraño bb, pero por ahora disfr... \n",
830
  "\n",
831
+ "[750 rows x 3 columns]"
832
  ]
833
  },
834
+ "execution_count": 25,
835
  "metadata": {},
836
  "output_type": "execute_result"
837
  }
 
844
  },
845
  {
846
  "cell_type": "code",
847
+ "execution_count": 26,
848
  "metadata": {},
849
  "outputs": [],
850
  "source": [
 
857
  },
858
  {
859
  "cell_type": "code",
860
+ "execution_count": 27,
861
  "metadata": {},
862
  "outputs": [
863
  {
 
924
  " <td>...</td>\n",
925
  " </tr>\n",
926
  " <tr>\n",
927
+ " <th>745</th>\n",
928
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
929
+ " <td>Descansaste?\\nQue soñaste?</td>\n",
930
+ " <td>No muy bien amor, la verdad es que me desperté...</td>\n",
931
  " </tr>\n",
932
  " <tr>\n",
933
+ " <th>746</th>\n",
934
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
935
+ " <td>Hay que ver una peli de miedoooo</td>\n",
936
+ " <td> amooor, vamos al cine a verla</td>\n",
937
  " </tr>\n",
938
  " <tr>\n",
939
+ " <th>747</th>\n",
940
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
941
+ " <td>Hay que venir a comer aquí</td>\n",
942
+ " <td>Cuando quieras bb, ya sabes que yo estoy super...</td>\n",
943
  " </tr>\n",
944
  " <tr>\n",
945
+ " <th>748</th>\n",
946
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
947
+ " <td>Estoy muy triste amor</td>\n",
948
+ " <td>Por qué mi amor? Cuéntame si quieres</td>\n",
949
  " </tr>\n",
950
  " <tr>\n",
951
+ " <th>749</th>\n",
952
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
953
+ " <td>Te extraño aquí en el gym :(</td>\n",
954
+ " <td>Yo también te extraño bb, pero por ahora disfr...</td>\n",
955
  " </tr>\n",
956
  " </tbody>\n",
957
  "</table>\n",
958
+ "<p>750 rows × 3 columns</p>\n",
959
  "</div>"
960
  ],
961
  "text/plain": [
 
966
  "3 CARSE es un chatbot que imita el estilo en que... \n",
967
  "4 CARSE es un chatbot que imita el estilo en que... \n",
968
  ".. ... \n",
969
+ "745 CARSE es un chatbot que imita el estilo en que... \n",
970
+ "746 CARSE es un chatbot que imita el estilo en que... \n",
971
+ "747 CARSE es un chatbot que imita el estilo en que... \n",
972
+ "748 CARSE es un chatbot que imita el estilo en que... \n",
973
+ "749 CARSE es un chatbot que imita el estilo en que... \n",
974
  "\n",
975
  " user \\\n",
976
  "0 Buenos días mi amorchis, cómo dormiste hoy? \n",
 
979
  "3 Buenos días mi amor, espero que hayas podido d... \n",
980
  "4 Hellouuuuu, te amo mucho guapiiii, ten lindo dia \n",
981
  ".. ... \n",
982
+ "745 Descansaste?\\nQue soñaste? \n",
983
+ "746 Hay que ver una peli de miedoooo \n",
984
+ "747 Hay que venir a comer aquí \n",
985
+ "748 Estoy muy triste amor \n",
986
+ "749 Te extraño aquí en el gym :( \n",
987
  "\n",
988
  " assistant \n",
989
  "0 Buenos días mi amorrrr\\nBien bien, pero hacía ... \n",
 
992
  "3 Hola mi amorcito, cómo amaneciste hoyyy???\\nTa... \n",
993
  "4 Holi mi vida, cómo estás hoy??\\nTe amo mucho m... \n",
994
  ".. ... \n",
995
+ "745 No muy bien amor, la verdad es que me desperté... \n",
996
+ "746 Sí amooor, vamos al cine a verla \n",
997
+ "747 Cuando quieras bb, ya sabes que yo estoy super... \n",
998
+ "748 Por qué mi amor? Cuéntame si quieres \n",
999
+ "749 Yo también te extraño bb, pero por ahora disfr... \n",
1000
  "\n",
1001
+ "[750 rows x 3 columns]"
1002
  ]
1003
  },
1004
+ "execution_count": 27,
1005
  "metadata": {},
1006
  "output_type": "execute_result"
1007
  }
 
1014
  },
1015
  {
1016
  "cell_type": "code",
1017
+ "execution_count": 28,
1018
  "metadata": {},
1019
  "outputs": [
1020
  {
 
1047
  " <tr>\n",
1048
  " <th>0</th>\n",
1049
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1050
+ " <td>Yo me iré al gym en unos minutos</td>\n",
1051
+ " <td>Super mi amorrr, disfruta mucho tu ejercicio, ...</td>\n",
1052
  " </tr>\n",
1053
  " <tr>\n",
1054
  " <th>1</th>\n",
1055
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1056
+ " <td>Ya ando en el gym amor</td>\n",
1057
+ " <td>Super mi amorrr, disfruta mucho tu ejercicio\\n...</td>\n",
1058
  " </tr>\n",
1059
  " <tr>\n",
1060
  " <th>2</th>\n",
1061
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1062
+ " <td>Te mandaré nota</td>\n",
1063
+ " <td>Vale amor, mándame lo que quieras, que sí me g...</td>\n",
1064
  " </tr>\n",
1065
  " <tr>\n",
1066
  " <th>3</th>\n",
1067
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1068
+ " <td>Te extraño poquis</td>\n",
1069
+ " <td>Igual te extraño poquis\\nIntentaré organizarme...</td>\n",
1070
  " </tr>\n",
1071
  " <tr>\n",
1072
  " <th>4</th>\n",
1073
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1074
+ " <td>He dormido mejor\\nSiento que no descansé nada</td>\n",
1075
+ " <td>Y por qué no duermes un rato más?</td>\n",
1076
  " </tr>\n",
1077
  " <tr>\n",
1078
  " <th>...</th>\n",
 
1081
  " <td>...</td>\n",
1082
  " </tr>\n",
1083
  " <tr>\n",
1084
+ " <th>745</th>\n",
1085
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1086
  " <td>Pues ya toca casarnos</td>\n",
1087
  " <td>Por fiiiin, ya te habías tardado en decir eso</td>\n",
1088
  " </tr>\n",
1089
  " <tr>\n",
1090
+ " <th>746</th>\n",
1091
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1092
  " <td>Biennnnn\\n¿Y tu?</td>\n",
1093
  " <td>Bien igual</td>\n",
1094
  " </tr>\n",
1095
  " <tr>\n",
1096
+ " <th>747</th>\n",
1097
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1098
  " <td>JAJAJJAJAJA no</td>\n",
1099
  " <td>Ya dimeeeee</td>\n",
1100
  " </tr>\n",
1101
  " <tr>\n",
1102
+ " <th>748</th>\n",
1103
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1104
  " <td>Casarme contigo algún día</td>\n",
1105
  " <td>Awwww amoorr, sí quiero casarme contigo algún ...</td>\n",
1106
  " </tr>\n",
1107
  " <tr>\n",
1108
+ " <th>749</th>\n",
1109
  " <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
1110
  " <td>Me estaba acabando el café</td>\n",
1111
  " <td>Disfruta tu cafecito</td>\n",
1112
  " </tr>\n",
1113
  " </tbody>\n",
1114
  "</table>\n",
1115
+ "<p>750 rows × 3 columns</p>\n",
1116
  "</div>"
1117
  ],
1118
  "text/plain": [
 
1123
  "3 CARSE es un chatbot que imita el estilo en que... \n",
1124
  "4 CARSE es un chatbot que imita el estilo en que... \n",
1125
  ".. ... \n",
1126
+ "745 CARSE es un chatbot que imita el estilo en que... \n",
1127
+ "746 CARSE es un chatbot que imita el estilo en que... \n",
1128
+ "747 CARSE es un chatbot que imita el estilo en que... \n",
1129
+ "748 CARSE es un chatbot que imita el estilo en que... \n",
1130
+ "749 CARSE es un chatbot que imita el estilo en que... \n",
1131
  "\n",
1132
+ " user \\\n",
1133
+ "0 Yo me iré al gym en unos minutos \n",
1134
+ "1 Ya ando en el gym amor \n",
1135
+ "2 Te mandaré nota \n",
1136
+ "3 Te extraño poquis \n",
1137
+ "4 He dormido mejor\\nSiento que no descansé nada \n",
1138
+ ".. ... \n",
1139
+ "745 Pues ya toca casarnos \n",
1140
+ "746 Biennnnn\\n¿Y tu? \n",
1141
+ "747 JAJAJJAJAJA no \n",
1142
+ "748 Casarme contigo algún día \n",
1143
+ "749 Me estaba acabando el café \n",
1144
  "\n",
1145
  " assistant \n",
1146
+ "0 Super mi amorrr, disfruta mucho tu ejercicio, ... \n",
1147
+ "1 Super mi amorrr, disfruta mucho tu ejercicio\\n... \n",
1148
+ "2 Vale amor, mándame lo que quieras, que sí me g... \n",
1149
+ "3 Igual te extraño poquis\\nIntentaré organizarme... \n",
1150
+ "4 Y por qué no duermes un rato más? \n",
1151
  ".. ... \n",
1152
+ "745 Por fiiiin, ya te habías tardado en decir eso \n",
1153
+ "746 Bien igual \n",
1154
+ "747 Ya dimeeeee \n",
1155
+ "748 Awwww amoorr, sí quiero casarme contigo algún ... \n",
1156
+ "749 Disfruta tu cafecito \n",
1157
  "\n",
1158
+ "[750 rows x 3 columns]"
1159
  ]
1160
  },
1161
+ "execution_count": 28,
1162
  "metadata": {},
1163
  "output_type": "execute_result"
1164
  }
 
1178
  },
1179
  {
1180
  "cell_type": "code",
1181
+ "execution_count": 29,
1182
  "metadata": {},
1183
  "outputs": [],
1184
  "source": [
 
1202
  },
1203
  {
1204
  "cell_type": "code",
1205
+ "execution_count": 30,
1206
  "metadata": {},
1207
  "outputs": [],
1208
  "source": [
 
1213
  },
1214
  {
1215
  "cell_type": "code",
1216
+ "execution_count": 31,
1217
  "metadata": {},
1218
  "outputs": [
1219
  {
1220
  "name": "stdout",
1221
  "output_type": "stream",
1222
  "text": [
1223
+ "El archivo tiene 750 líneas.\n"
1224
  ]
1225
  }
1226
  ],
CARSE_01_Prep.ipynb CHANGED
@@ -1 +1 @@
1
- {"cells":[{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":173,"status":"ok","timestamp":1702156660830,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"pEb6CJl3ZFKP"},"outputs":[],"source":["#! pip install tiktoken"]},{"cell_type":"code","execution_count":4,"metadata":{"executionInfo":{"elapsed":2,"status":"ok","timestamp":1702156660831,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"zj6dSJC3Y01I"},"outputs":[],"source":["import json\n","import tiktoken # for token counting\n","import numpy as np\n","from collections import defaultdict"]},{"cell_type":"markdown","metadata":{"id":"SmN-A6KJZQPo"},"source":["<br>\n","<br>\n","\n","## Data Loading"]},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1702156661009,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"-geBXCMTZObN","outputId":"924e87bd-4c01-4eff-ac93-10f07e27fe41"},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","Num examples: 670\n","First example:\n","{'role': 'system', 'content': 'CARSE es un chatbot que imita el estilo en que Sebas conversa. Fue creado con amor solo para Carmen. Su objetivo es ofrecer una experiencia de chat divertida y familiar que exprese la personalidad, el humor y el amor de Sebas hacia Carmen.'}\n","{'role': 'user', 'content': 'Tengo mucho sueño aún'}\n","{'role': 'assistant', 'content': 'Ay amorcito, a ver si te echas a dormir en el coche un rato'}\n"]}],"source":["data_path = \"Training_data/Training_Prompts.jsonl\"\n","\n","# Load the dataset\n","with open(data_path, 'r', encoding='utf-8') as f:\n"," dataset = [json.loads(line) for line in f]\n","\n","# Initial dataset stats\n","print(\"\\nNum examples:\", len(dataset))\n","print(\"First example:\")\n","for message in dataset[0][\"messages\"]:\n"," print(message)"]},{"cell_type":"markdown","metadata":{"id":"UfxL9TQMaCCG"},"source":["<br>\n","<br>\n","\n","## Format Validation"]},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":4,"status":"ok","timestamp":1702156661009,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"bOXkqnLgZakM","outputId":"8c716779-0f83-4fbf-c302-56d834735327"},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","No errors found\n"]}],"source":["# Format error checks\n","format_errors = defaultdict(int)\n","\n","for ex in dataset:\n"," if not isinstance(ex, dict):\n"," format_errors[\"data_type\"] += 1\n"," continue\n","\n"," messages = ex.get(\"messages\", None)\n"," if not messages:\n"," format_errors[\"missing_messages_list\"] += 1\n"," continue\n","\n"," for message in messages:\n"," if \"role\" not in message or \"content\" not in message:\n"," format_errors[\"message_missing_key\"] += 1\n","\n"," if any(k not in (\"role\", \"content\", \"name\", \"function_call\") for k in message):\n"," format_errors[\"message_unrecognized_key\"] += 1\n","\n"," if message.get(\"role\", None) not in (\"system\", \"user\", \"assistant\", \"function\"):\n"," format_errors[\"unrecognized_role\"] += 1\n","\n"," content = message.get(\"content\", None)\n"," function_call = message.get(\"function_call\", None)\n","\n"," if (not content and not function_call) or not isinstance(content, str):\n"," format_errors[\"missing_content\"] += 1\n","\n"," if not any(message.get(\"role\", None) == \"assistant\" for message in messages):\n"," format_errors[\"example_missing_assistant_message\"] += 1\n","\n","if format_errors:\n"," print(\"\\nFound errors:\")\n"," for k, v in format_errors.items():\n"," print(f\"{k}: {v}\")\n","else:\n"," print(\"\\nNo errors found\")"]},{"cell_type":"markdown","metadata":{"id":"UXrIdBKtaHkx"},"source":["<br>\n","<br>\n","\n","## Token Counting Utilities"]},{"cell_type":"code","execution_count":8,"metadata":{"executionInfo":{"elapsed":3,"status":"ok","timestamp":1702156661009,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"A6ev3s_TaGeJ"},"outputs":[],"source":["encoding = tiktoken.get_encoding(\"cl100k_base\")\n","\n","# not exact!\n","# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb\n","def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):\n"," num_tokens = 0\n"," for message in messages:\n"," num_tokens += tokens_per_message\n"," for key, value in message.items():\n"," num_tokens += len(encoding.encode(value))\n"," if key == \"name\":\n"," num_tokens += tokens_per_name\n"," num_tokens += 3\n"," return num_tokens\n","\n","def num_assistant_tokens_from_messages(messages):\n"," num_tokens = 0\n"," for message in messages:\n"," if message[\"role\"] == \"assistant\":\n"," num_tokens += len(encoding.encode(message[\"content\"]))\n"," return num_tokens\n","\n","def print_distribution(values, name):\n"," print(f\"\\n#### Distribution of {name}:\")\n"," print(f\"min / max: {min(values)}, {max(values)}\")\n"," print(f\"mean / median: {np.mean(values)}, {np.median(values)}\")\n"," print(f\"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}\")"]},{"cell_type":"markdown","metadata":{"id":"2duSc8L7aYki"},"source":["<br>\n","<br>\n","\n","## Data Warnings and Token Counts"]},{"cell_type":"code","execution_count":9,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1702156661009,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"CfDEmpgiaLuS","outputId":"f1e8ed95-f77b-4340-80ce-b08ee6025a2e"},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","Num examples missing system message: 0\n","Num examples missing user message: 0\n","\n","#### Distribution of num_messages_per_example:\n","min / max: 3, 3\n","mean / median: 3.0, 3.0\n","p5 / p95: 3.0, 3.0\n","\n","#### Distribution of num_total_tokens_per_example:\n","min / max: 77, 263\n","mean / median: 102.54776119402985, 97.0\n","p5 / p95: 84.0, 120.10000000000002\n","\n","#### Distribution of num_assistant_tokens_per_example:\n","min / max: 2, 184\n","mean / median: 19.9955223880597, 15.0\n","p5 / p95: 6.0, 34.0\n","\n","0 examples may be over the 4096 token limit, they will be truncated during fine-tuning\n"]}],"source":["# Warnings and tokens counts\n","n_missing_system = 0\n","n_missing_user = 0\n","n_messages = []\n","convo_lens = []\n","assistant_message_lens = []\n","\n","for ex in dataset:\n"," messages = ex[\"messages\"]\n"," if not any(message[\"role\"] == \"system\" for message in messages):\n"," n_missing_system += 1\n"," if not any(message[\"role\"] == \"user\" for message in messages):\n"," n_missing_user += 1\n"," n_messages.append(len(messages))\n"," convo_lens.append(num_tokens_from_messages(messages))\n"," assistant_message_lens.append(num_assistant_tokens_from_messages(messages))\n","\n","print(\"\\nNum examples missing system message:\", n_missing_system)\n","print(\"Num examples missing user message:\", n_missing_user)\n","print_distribution(n_messages, \"num_messages_per_example\")\n","print_distribution(convo_lens, \"num_total_tokens_per_example\")\n","print_distribution(assistant_message_lens, \"num_assistant_tokens_per_example\")\n","n_too_long = sum(l > 4096 for l in convo_lens)\n","print(f\"\\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning\")"]},{"cell_type":"markdown","metadata":{"id":"x8GusuUOapUB"},"source":["<br>\n","<br>\n","\n","## Cost Estimation"]},{"cell_type":"code","execution_count":10,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":194,"status":"ok","timestamp":1702156661200,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"EnAWqf3SadOj","outputId":"3d0527e9-a3cf-4b6e-9ce4-89a99611d219"},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","Dataset has ~68707 tokens that will be charged for during training\n","By default, you'll train for 3 epochs on this dataset\n","By default, you'll be charged for ~206121 tokens\n"]}],"source":["# Pricing and default n_epochs estimate\n","MAX_TOKENS_PER_EXAMPLE = 4096\n","\n","TARGET_EPOCHS = 3\n","MIN_TARGET_EXAMPLES = 100\n","MAX_TARGET_EXAMPLES = 25000\n","MIN_DEFAULT_EPOCHS = 1\n","MAX_DEFAULT_EPOCHS = 25\n","\n","n_epochs = TARGET_EPOCHS\n","n_train_examples = len(dataset)\n","if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:\n"," n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)\n","elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:\n"," n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)\n","\n","n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)\n","print(f\"\\nDataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n","print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n","print(f\"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens\")"]},{"cell_type":"code","execution_count":11,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2,"status":"ok","timestamp":1702156777183,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"-a820sVicdat","outputId":"c96c5167-c91e-4e1c-e8a4-0bd16a6aeb70"},"outputs":[{"data":{"text/plain":["1.648968"]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["# Suponiendo que cargan $0.008/1k tokens\n","value = 0.008\n","\n","final_money = (n_epochs * n_billing_tokens_in_dataset)*value/1000\n","final_money"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"0anBNWtKczWR"},"outputs":[],"source":[]}],"metadata":{"colab":{"authorship_tag":"ABX9TyMzN+lsyb4smMC9hf/gYgCs","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.7"}},"nbformat":4,"nbformat_minor":0}
 
1
+ {"cells":[{"cell_type":"code","execution_count":1,"metadata":{"executionInfo":{"elapsed":173,"status":"ok","timestamp":1702156660830,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"pEb6CJl3ZFKP"},"outputs":[],"source":["#! pip install tiktoken"]},{"cell_type":"code","execution_count":2,"metadata":{"executionInfo":{"elapsed":2,"status":"ok","timestamp":1702156660831,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"zj6dSJC3Y01I"},"outputs":[],"source":["import json\n","import tiktoken # for token counting\n","import numpy as np\n","from collections import defaultdict"]},{"cell_type":"markdown","metadata":{"id":"SmN-A6KJZQPo"},"source":["<br>\n","<br>\n","\n","## Data Loading"]},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1702156661009,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"-geBXCMTZObN","outputId":"924e87bd-4c01-4eff-ac93-10f07e27fe41"},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","Num examples: 750\n","First example:\n","{'role': 'system', 'content': 'CARSE es un chatbot que imita el estilo en que Sebas conversa. Fue creado con amor solo para Carmen. Su objetivo es ofrecer una experiencia de chat divertida y familiar que exprese la personalidad, el humor y el amor de Sebas hacia Carmen.'}\n","{'role': 'user', 'content': 'Yo me iré al gym en unos minutos'}\n","{'role': 'assistant', 'content': 'Super mi amorrr, disfruta mucho tu ejercicio, ponte muy fuerte'}\n"]}],"source":["data_path = \"Training_data/Training_Prompts.jsonl\"\n","\n","# Load the dataset\n","with open(data_path, 'r', encoding='utf-8') as f:\n"," dataset = [json.loads(line) for line in f]\n","\n","# Initial dataset stats\n","print(\"\\nNum examples:\", len(dataset))\n","print(\"First example:\")\n","for message in dataset[0][\"messages\"]:\n"," print(message)"]},{"cell_type":"markdown","metadata":{"id":"UfxL9TQMaCCG"},"source":["<br>\n","<br>\n","\n","## Format Validation"]},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":4,"status":"ok","timestamp":1702156661009,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"bOXkqnLgZakM","outputId":"8c716779-0f83-4fbf-c302-56d834735327"},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","No errors found\n"]}],"source":["# Format error checks\n","format_errors = defaultdict(int)\n","\n","for ex in dataset:\n"," if not isinstance(ex, dict):\n"," format_errors[\"data_type\"] += 1\n"," continue\n","\n"," messages = ex.get(\"messages\", None)\n"," if not messages:\n"," format_errors[\"missing_messages_list\"] += 1\n"," continue\n","\n"," for message in messages:\n"," if \"role\" not in message or \"content\" not in message:\n"," format_errors[\"message_missing_key\"] += 1\n","\n"," if any(k not in (\"role\", \"content\", \"name\", \"function_call\") for k in message):\n"," format_errors[\"message_unrecognized_key\"] += 1\n","\n"," if message.get(\"role\", None) not in (\"system\", \"user\", \"assistant\", \"function\"):\n"," format_errors[\"unrecognized_role\"] += 1\n","\n"," content = message.get(\"content\", None)\n"," function_call = message.get(\"function_call\", None)\n","\n"," if (not content and not function_call) or not isinstance(content, str):\n"," format_errors[\"missing_content\"] += 1\n","\n"," if not any(message.get(\"role\", None) == \"assistant\" for message in messages):\n"," format_errors[\"example_missing_assistant_message\"] += 1\n","\n","if format_errors:\n"," print(\"\\nFound errors:\")\n"," for k, v in format_errors.items():\n"," print(f\"{k}: {v}\")\n","else:\n"," print(\"\\nNo errors found\")"]},{"cell_type":"markdown","metadata":{"id":"UXrIdBKtaHkx"},"source":["<br>\n","<br>\n","\n","## Token Counting Utilities"]},{"cell_type":"code","execution_count":5,"metadata":{"executionInfo":{"elapsed":3,"status":"ok","timestamp":1702156661009,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"A6ev3s_TaGeJ"},"outputs":[],"source":["encoding = tiktoken.get_encoding(\"cl100k_base\")\n","\n","# not exact!\n","# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb\n","def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):\n"," num_tokens = 0\n"," for message in messages:\n"," num_tokens += tokens_per_message\n"," for key, value in message.items():\n"," num_tokens += len(encoding.encode(value))\n"," if key == \"name\":\n"," num_tokens += tokens_per_name\n"," num_tokens += 3\n"," return num_tokens\n","\n","def num_assistant_tokens_from_messages(messages):\n"," num_tokens = 0\n"," for message in messages:\n"," if message[\"role\"] == \"assistant\":\n"," num_tokens += len(encoding.encode(message[\"content\"]))\n"," return num_tokens\n","\n","def print_distribution(values, name):\n"," print(f\"\\n#### Distribution of {name}:\")\n"," print(f\"min / max: {min(values)}, {max(values)}\")\n"," print(f\"mean / median: {np.mean(values)}, {np.median(values)}\")\n"," print(f\"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}\")"]},{"cell_type":"markdown","metadata":{"id":"2duSc8L7aYki"},"source":["<br>\n","<br>\n","\n","## Data Warnings and Token Counts"]},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1702156661009,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"CfDEmpgiaLuS","outputId":"f1e8ed95-f77b-4340-80ce-b08ee6025a2e"},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","Num examples missing system message: 0\n","Num examples missing user message: 0\n","\n","#### Distribution of num_messages_per_example:\n","min / max: 3, 3\n","mean / median: 3.0, 3.0\n","p5 / p95: 3.0, 3.0\n","\n","#### Distribution of num_total_tokens_per_example:\n","min / max: 77, 263\n","mean / median: 102.228, 97.0\n","p5 / p95: 85.0, 119.10000000000002\n","\n","#### Distribution of num_assistant_tokens_per_example:\n","min / max: 2, 184\n","mean / median: 19.618666666666666, 15.0\n","p5 / p95: 6.0, 33.0\n","\n","0 examples may be over the 4096 token limit, they will be truncated during fine-tuning\n"]}],"source":["# Warnings and tokens counts\n","n_missing_system = 0\n","n_missing_user = 0\n","n_messages = []\n","convo_lens = []\n","assistant_message_lens = []\n","\n","for ex in dataset:\n"," messages = ex[\"messages\"]\n"," if not any(message[\"role\"] == \"system\" for message in messages):\n"," n_missing_system += 1\n"," if not any(message[\"role\"] == \"user\" for message in messages):\n"," n_missing_user += 1\n"," n_messages.append(len(messages))\n"," convo_lens.append(num_tokens_from_messages(messages))\n"," assistant_message_lens.append(num_assistant_tokens_from_messages(messages))\n","\n","print(\"\\nNum examples missing system message:\", n_missing_system)\n","print(\"Num examples missing user message:\", n_missing_user)\n","print_distribution(n_messages, \"num_messages_per_example\")\n","print_distribution(convo_lens, \"num_total_tokens_per_example\")\n","print_distribution(assistant_message_lens, \"num_assistant_tokens_per_example\")\n","n_too_long = sum(l > 4096 for l in convo_lens)\n","print(f\"\\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning\")"]},{"cell_type":"markdown","metadata":{"id":"x8GusuUOapUB"},"source":["<br>\n","<br>\n","\n","## Cost Estimation"]},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":194,"status":"ok","timestamp":1702156661200,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"EnAWqf3SadOj","outputId":"3d0527e9-a3cf-4b6e-9ce4-89a99611d219"},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","Dataset has ~76671 tokens that will be charged for during training\n","By default, you'll train for 3 epochs on this dataset\n","By default, you'll be charged for ~230013 tokens\n"]}],"source":["# Pricing and default n_epochs estimate\n","MAX_TOKENS_PER_EXAMPLE = 4096\n","\n","TARGET_EPOCHS = 3\n","MIN_TARGET_EXAMPLES = 100\n","MAX_TARGET_EXAMPLES = 25000\n","MIN_DEFAULT_EPOCHS = 1\n","MAX_DEFAULT_EPOCHS = 25\n","\n","n_epochs = TARGET_EPOCHS\n","n_train_examples = len(dataset)\n","if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:\n"," n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)\n","elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:\n"," n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)\n","\n","n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)\n","print(f\"\\nDataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n","print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n","print(f\"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens\")"]},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2,"status":"ok","timestamp":1702156777183,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"},"user_tz":360},"id":"-a820sVicdat","outputId":"c96c5167-c91e-4e1c-e8a4-0bd16a6aeb70"},"outputs":[{"data":{"text/plain":["1.840104"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["# Suponiendo que cargan $0.008/1k tokens\n","value = 0.008\n","\n","final_money = (n_epochs * n_billing_tokens_in_dataset)*value/1000\n","final_money"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"0anBNWtKczWR"},"outputs":[],"source":[]}],"metadata":{"colab":{"authorship_tag":"ABX9TyMzN+lsyb4smMC9hf/gYgCs","provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.7"}},"nbformat":4,"nbformat_minor":0}
Training_Data/Training_Prompts.jsonl CHANGED
The diff for this file is too large to render. See raw diff