Commit
·
2b4b309
1
Parent(s):
e4b6cc5
feat: add dataset upload
Browse files- .DS_Store +0 -0
- app.py +3 -2
- assets/logo.png +0 -0
- assets/logo.svg +42 -0
- pdm.lock +150 -1
- pyproject.toml +1 -1
- requirements.txt +1 -1
- src/distilabel_dataset_generator/sft.py +64 -30
- src/distilabel_dataset_generator/utils.py +41 -0
.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
app.py
CHANGED
@@ -5,8 +5,9 @@ from distilabel_dataset_generator.sft import demo
|
|
5 |
demo = gr.TabbedInterface(
|
6 |
[demo],
|
7 |
["Supervised Fine-Tuning"],
|
8 |
-
title="
|
9 |
head="⚗️ Distilabel Dataset Generator",
|
10 |
)
|
11 |
|
12 |
-
|
|
|
|
5 |
demo = gr.TabbedInterface(
|
6 |
[demo],
|
7 |
["Supervised Fine-Tuning"],
|
8 |
+
title="Distilabel Dataset Generator",
|
9 |
head="⚗️ Distilabel Dataset Generator",
|
10 |
)
|
11 |
|
12 |
+
if __name__ == "__main__":
|
13 |
+
demo.launch()
|
assets/logo.png
ADDED
assets/logo.svg
ADDED
pdm.lock
CHANGED
@@ -5,7 +5,7 @@
|
|
5 |
groups = ["default"]
|
6 |
strategy = ["inherit_metadata"]
|
7 |
lock_version = "4.5.0"
|
8 |
-
content_hash = "sha256:
|
9 |
|
10 |
[[metadata.targets]]
|
11 |
requires_python = ">=3.10"
|
@@ -185,6 +185,20 @@ files = [
|
|
185 |
{file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"},
|
186 |
]
|
187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
[[package]]
|
189 |
name = "certifi"
|
190 |
version = "2024.8.30"
|
@@ -196,6 +210,66 @@ files = [
|
|
196 |
{file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"},
|
197 |
]
|
198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
[[package]]
|
200 |
name = "charset-normalizer"
|
201 |
version = "3.3.2"
|
@@ -343,6 +417,41 @@ files = [
|
|
343 |
{file = "contourpy-1.3.0.tar.gz", hash = "sha256:7ffa0db17717a8ffb127efd0c95a4362d996b892c2904db72428d5b52e1938a4"},
|
344 |
]
|
345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
346 |
[[package]]
|
347 |
name = "cycler"
|
348 |
version = "0.12.1"
|
@@ -663,6 +772,23 @@ files = [
|
|
663 |
{file = "gradio_client-1.3.0.tar.gz", hash = "sha256:d904afeae4f5682add0a6a263542c10e7669ff6c9de0a53a5c2fc9b719a24bb8"},
|
664 |
]
|
665 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
666 |
[[package]]
|
667 |
name = "h11"
|
668 |
version = "0.14.0"
|
@@ -755,6 +881,17 @@ files = [
|
|
755 |
{file = "importlib_resources-6.4.4.tar.gz", hash = "sha256:20600c8b7361938dc0bb2d5ec0297802e575df486f5a544fa414da65e13721f7"},
|
756 |
]
|
757 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
758 |
[[package]]
|
759 |
name = "jinja2"
|
760 |
version = "3.1.4"
|
@@ -1328,6 +1465,18 @@ files = [
|
|
1328 |
{file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"},
|
1329 |
]
|
1330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1331 |
[[package]]
|
1332 |
name = "pydantic"
|
1333 |
version = "2.9.1"
|
|
|
5 |
groups = ["default"]
|
6 |
strategy = ["inherit_metadata"]
|
7 |
lock_version = "4.5.0"
|
8 |
+
content_hash = "sha256:957e0276f679a9f7e65c68e3dbd1a1565f4b515468943faf6bdeb36b65a271a0"
|
9 |
|
10 |
[[metadata.targets]]
|
11 |
requires_python = ">=3.10"
|
|
|
185 |
{file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"},
|
186 |
]
|
187 |
|
188 |
+
[[package]]
|
189 |
+
name = "authlib"
|
190 |
+
version = "1.3.2"
|
191 |
+
requires_python = ">=3.8"
|
192 |
+
summary = "The ultimate Python library in building OAuth and OpenID Connect servers and clients."
|
193 |
+
groups = ["default"]
|
194 |
+
dependencies = [
|
195 |
+
"cryptography",
|
196 |
+
]
|
197 |
+
files = [
|
198 |
+
{file = "Authlib-1.3.2-py2.py3-none-any.whl", hash = "sha256:ede026a95e9f5cdc2d4364a52103f5405e75aa156357e831ef2bfd0bc5094dfc"},
|
199 |
+
{file = "authlib-1.3.2.tar.gz", hash = "sha256:4b16130117f9eb82aa6eec97f6dd4673c3f960ac0283ccdae2897ee4bc030ba2"},
|
200 |
+
]
|
201 |
+
|
202 |
[[package]]
|
203 |
name = "certifi"
|
204 |
version = "2024.8.30"
|
|
|
210 |
{file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"},
|
211 |
]
|
212 |
|
213 |
+
[[package]]
|
214 |
+
name = "cffi"
|
215 |
+
version = "1.17.1"
|
216 |
+
requires_python = ">=3.8"
|
217 |
+
summary = "Foreign Function Interface for Python calling C code."
|
218 |
+
groups = ["default"]
|
219 |
+
marker = "platform_python_implementation != \"PyPy\""
|
220 |
+
dependencies = [
|
221 |
+
"pycparser",
|
222 |
+
]
|
223 |
+
files = [
|
224 |
+
{file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"},
|
225 |
+
{file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"},
|
226 |
+
{file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"},
|
227 |
+
{file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"},
|
228 |
+
{file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"},
|
229 |
+
{file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"},
|
230 |
+
{file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"},
|
231 |
+
{file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"},
|
232 |
+
{file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"},
|
233 |
+
{file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"},
|
234 |
+
{file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"},
|
235 |
+
{file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"},
|
236 |
+
{file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"},
|
237 |
+
{file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"},
|
238 |
+
{file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"},
|
239 |
+
{file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"},
|
240 |
+
{file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"},
|
241 |
+
{file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"},
|
242 |
+
{file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"},
|
243 |
+
{file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"},
|
244 |
+
{file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"},
|
245 |
+
{file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"},
|
246 |
+
{file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"},
|
247 |
+
{file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"},
|
248 |
+
{file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"},
|
249 |
+
{file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"},
|
250 |
+
{file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"},
|
251 |
+
{file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"},
|
252 |
+
{file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"},
|
253 |
+
{file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"},
|
254 |
+
{file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"},
|
255 |
+
{file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"},
|
256 |
+
{file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"},
|
257 |
+
{file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"},
|
258 |
+
{file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"},
|
259 |
+
{file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"},
|
260 |
+
{file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"},
|
261 |
+
{file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"},
|
262 |
+
{file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"},
|
263 |
+
{file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"},
|
264 |
+
{file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"},
|
265 |
+
{file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"},
|
266 |
+
{file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"},
|
267 |
+
{file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"},
|
268 |
+
{file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"},
|
269 |
+
{file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"},
|
270 |
+
{file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"},
|
271 |
+
]
|
272 |
+
|
273 |
[[package]]
|
274 |
name = "charset-normalizer"
|
275 |
version = "3.3.2"
|
|
|
417 |
{file = "contourpy-1.3.0.tar.gz", hash = "sha256:7ffa0db17717a8ffb127efd0c95a4362d996b892c2904db72428d5b52e1938a4"},
|
418 |
]
|
419 |
|
420 |
+
[[package]]
|
421 |
+
name = "cryptography"
|
422 |
+
version = "43.0.1"
|
423 |
+
requires_python = ">=3.7"
|
424 |
+
summary = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
425 |
+
groups = ["default"]
|
426 |
+
dependencies = [
|
427 |
+
"cffi>=1.12; platform_python_implementation != \"PyPy\"",
|
428 |
+
]
|
429 |
+
files = [
|
430 |
+
{file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"},
|
431 |
+
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"},
|
432 |
+
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"},
|
433 |
+
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"},
|
434 |
+
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"},
|
435 |
+
{file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"},
|
436 |
+
{file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"},
|
437 |
+
{file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"},
|
438 |
+
{file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"},
|
439 |
+
{file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"},
|
440 |
+
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"},
|
441 |
+
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"},
|
442 |
+
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"},
|
443 |
+
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"},
|
444 |
+
{file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"},
|
445 |
+
{file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"},
|
446 |
+
{file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"},
|
447 |
+
{file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"},
|
448 |
+
{file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"},
|
449 |
+
{file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"},
|
450 |
+
{file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"},
|
451 |
+
{file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"},
|
452 |
+
{file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"},
|
453 |
+
]
|
454 |
+
|
455 |
[[package]]
|
456 |
name = "cycler"
|
457 |
version = "0.12.1"
|
|
|
772 |
{file = "gradio_client-1.3.0.tar.gz", hash = "sha256:d904afeae4f5682add0a6a263542c10e7669ff6c9de0a53a5c2fc9b719a24bb8"},
|
773 |
]
|
774 |
|
775 |
+
[[package]]
|
776 |
+
name = "gradio"
|
777 |
+
version = "4.43.0"
|
778 |
+
extras = ["oauth"]
|
779 |
+
requires_python = ">=3.8"
|
780 |
+
summary = "Python library for easily interacting with trained machine learning models"
|
781 |
+
groups = ["default"]
|
782 |
+
dependencies = [
|
783 |
+
"authlib",
|
784 |
+
"gradio==4.43.0",
|
785 |
+
"itsdangerous",
|
786 |
+
]
|
787 |
+
files = [
|
788 |
+
{file = "gradio-4.43.0-py3-none-any.whl", hash = "sha256:a8a785af95b7985d1d17287b1f79add3c913ed79180c4b89ef30cea375329bb9"},
|
789 |
+
{file = "gradio-4.43.0.tar.gz", hash = "sha256:0d510e98b7fcb5d829fdc9452dcfef9f060c86130c9e85a539b9e709ba460823"},
|
790 |
+
]
|
791 |
+
|
792 |
[[package]]
|
793 |
name = "h11"
|
794 |
version = "0.14.0"
|
|
|
881 |
{file = "importlib_resources-6.4.4.tar.gz", hash = "sha256:20600c8b7361938dc0bb2d5ec0297802e575df486f5a544fa414da65e13721f7"},
|
882 |
]
|
883 |
|
884 |
+
[[package]]
|
885 |
+
name = "itsdangerous"
|
886 |
+
version = "2.2.0"
|
887 |
+
requires_python = ">=3.8"
|
888 |
+
summary = "Safely pass data to untrusted environments and back."
|
889 |
+
groups = ["default"]
|
890 |
+
files = [
|
891 |
+
{file = "itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef"},
|
892 |
+
{file = "itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173"},
|
893 |
+
]
|
894 |
+
|
895 |
[[package]]
|
896 |
name = "jinja2"
|
897 |
version = "3.1.4"
|
|
|
1465 |
{file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"},
|
1466 |
]
|
1467 |
|
1468 |
+
[[package]]
|
1469 |
+
name = "pycparser"
|
1470 |
+
version = "2.22"
|
1471 |
+
requires_python = ">=3.8"
|
1472 |
+
summary = "C parser in Python"
|
1473 |
+
groups = ["default"]
|
1474 |
+
marker = "platform_python_implementation != \"PyPy\""
|
1475 |
+
files = [
|
1476 |
+
{file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"},
|
1477 |
+
{file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"},
|
1478 |
+
]
|
1479 |
+
|
1480 |
[[package]]
|
1481 |
name = "pydantic"
|
1482 |
version = "2.9.1"
|
pyproject.toml
CHANGED
@@ -7,7 +7,7 @@ authors = [
|
|
7 |
]
|
8 |
dependencies = [
|
9 |
"distilabel[hf-inference-endpoints]>=1.3.2",
|
10 |
-
"gradio",
|
11 |
"transformers>=4.44.2",
|
12 |
]
|
13 |
requires-python = ">=3.10"
|
|
|
7 |
]
|
8 |
dependencies = [
|
9 |
"distilabel[hf-inference-endpoints]>=1.3.2",
|
10 |
+
"gradio[oauth]<5,>=4.38",
|
11 |
"transformers>=4.44.2",
|
12 |
]
|
13 |
requires-python = ">=3.10"
|
requirements.txt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
distilabel[hf-inference-endpoints]>=1.3.2
|
2 |
transformers
|
3 |
-
gradio
|
|
|
1 |
distilabel[hf-inference-endpoints]>=1.3.2
|
2 |
transformers
|
3 |
+
gradio[oauth]
|
src/distilabel_dataset_generator/sft.py
CHANGED
@@ -1,9 +1,13 @@
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
from distilabel.llms import InferenceEndpointsLLM
|
4 |
from distilabel.pipeline import Pipeline
|
5 |
from distilabel.steps.tasks import MagpieGenerator, TextGeneration
|
6 |
|
|
|
|
|
7 |
INFORMATION_SEEKING_PROMPT = (
|
8 |
"You are an AI assistant designed to provide accurate and concise information on a wide"
|
9 |
" range of topics. Your purpose is to assist users in finding specific facts,"
|
@@ -112,7 +116,7 @@ The prompt you write should follow the same style and structure as the following
|
|
112 |
User dataset description:
|
113 |
"""
|
114 |
|
115 |
-
MODEL = "meta-llama/Meta-Llama-3.1-
|
116 |
|
117 |
generate_description = TextGeneration(
|
118 |
llm=InferenceEndpointsLLM(
|
@@ -129,20 +133,7 @@ generate_description = TextGeneration(
|
|
129 |
generate_description.load()
|
130 |
|
131 |
|
132 |
-
def
|
133 |
-
return next(
|
134 |
-
generate_description.process(
|
135 |
-
[
|
136 |
-
{
|
137 |
-
"system_prompt": PROMPT_CREATION_PROMPT,
|
138 |
-
"instruction": _dataset_description,
|
139 |
-
}
|
140 |
-
]
|
141 |
-
)
|
142 |
-
)[0]["generation"]
|
143 |
-
|
144 |
-
|
145 |
-
def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=5):
|
146 |
with Pipeline(name="sft") as pipeline:
|
147 |
magpie_step = MagpieGenerator(
|
148 |
llm=InferenceEndpointsLLM(
|
@@ -157,25 +148,69 @@ def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=5):
|
|
157 |
num_rows=_num_rows,
|
158 |
system_prompt=_system_prompt,
|
159 |
)
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
else:
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
outputs
|
172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
|
174 |
|
175 |
with gr.Blocks(
|
176 |
title="⚗️ Distilabel Dataset Generator",
|
177 |
head="⚗️ Distilabel Dataset Generator",
|
178 |
) as demo:
|
|
|
|
|
179 |
dataset_description = gr.Textbox(
|
180 |
label="Provide a description of the dataset",
|
181 |
value="A chemistry dataset for an assistant that explains chemical reactions and formulas",
|
@@ -212,7 +247,7 @@ with gr.Blocks(
|
|
212 |
)
|
213 |
with gr.Column():
|
214 |
num_rows = gr.Number(
|
215 |
-
value=
|
216 |
)
|
217 |
|
218 |
dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
|
@@ -223,8 +258,7 @@ with gr.Blocks(
|
|
223 |
|
224 |
btn_generate_full_dataset.click(
|
225 |
fn=_generate_dataset,
|
226 |
-
inputs=[system_prompt, num_turns, num_rows],
|
227 |
-
outputs=[table],
|
228 |
)
|
229 |
|
230 |
demo
|
|
|
1 |
+
import multiprocessing
|
2 |
+
|
3 |
import gradio as gr
|
4 |
import pandas as pd
|
5 |
from distilabel.llms import InferenceEndpointsLLM
|
6 |
from distilabel.pipeline import Pipeline
|
7 |
from distilabel.steps.tasks import MagpieGenerator, TextGeneration
|
8 |
|
9 |
+
from distilabel_dataset_generator.utils import OAuthToken, get_login_button
|
10 |
+
|
11 |
INFORMATION_SEEKING_PROMPT = (
|
12 |
"You are an AI assistant designed to provide accurate and concise information on a wide"
|
13 |
" range of topics. Your purpose is to assist users in finding specific facts,"
|
|
|
116 |
User dataset description:
|
117 |
"""
|
118 |
|
119 |
+
MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
120 |
|
121 |
generate_description = TextGeneration(
|
122 |
llm=InferenceEndpointsLLM(
|
|
|
133 |
generate_description.load()
|
134 |
|
135 |
|
136 |
+
def _run_pipeline(result_queue, _num_turns, _num_rows, _system_prompt):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
with Pipeline(name="sft") as pipeline:
|
138 |
magpie_step = MagpieGenerator(
|
139 |
llm=InferenceEndpointsLLM(
|
|
|
148 |
num_rows=_num_rows,
|
149 |
system_prompt=_system_prompt,
|
150 |
)
|
151 |
+
distiset = pipeline.run()
|
152 |
+
result_queue.put(distiset)
|
153 |
+
|
154 |
+
|
155 |
+
def _generate_system_prompt(_dataset_description):
|
156 |
+
return next(
|
157 |
+
generate_description.process(
|
158 |
+
[
|
159 |
+
{
|
160 |
+
"system_prompt": PROMPT_CREATION_PROMPT,
|
161 |
+
"instruction": _dataset_description,
|
162 |
+
}
|
163 |
+
]
|
164 |
+
)
|
165 |
+
)[0]["generation"]
|
166 |
+
|
167 |
+
|
168 |
+
def _generate_dataset(
|
169 |
+
_system_prompt,
|
170 |
+
_num_turns=1,
|
171 |
+
_num_rows=5,
|
172 |
+
_dataset_name=None,
|
173 |
+
_token: OAuthToken = None,
|
174 |
+
):
|
175 |
+
gr.Info("Started pipeline execution.")
|
176 |
+
result_queue = multiprocessing.Queue()
|
177 |
+
p = multiprocessing.Process(
|
178 |
+
target=_run_pipeline, args=(result_queue, _num_turns, _num_rows, _system_prompt)
|
179 |
+
)
|
180 |
+
p.start()
|
181 |
+
p.join()
|
182 |
+
distiset = result_queue.get()
|
183 |
+
|
184 |
+
if _dataset_name is not None:
|
185 |
+
gr.Info("Pushing dataset to Hugging Face Hub...")
|
186 |
+
distiset.push_to_hub(
|
187 |
+
repo_id=_dataset_name, private=False, include_script=True, token=_token
|
188 |
+
)
|
189 |
+
gr.Info("Dataset pushed to Hugging Face Hub: https://huggingface.co")
|
190 |
else:
|
191 |
+
# If not pushing to hub, generate the dataset directly
|
192 |
+
distiset = distiset["default"]["train"]
|
193 |
+
if _num_turns == 1:
|
194 |
+
outputs = distiset.to_pandas()[["instruction", "response"]]
|
195 |
+
else:
|
196 |
+
outputs = {"conversation_id": [], "role": [], "content": []}
|
197 |
+
conversations = distiset["conversation"]
|
198 |
+
for idx, entry in enumerate(conversations):
|
199 |
+
for message in entry["conversation"]:
|
200 |
+
outputs["conversation_id"].append(idx + 1)
|
201 |
+
outputs["role"].append(message["role"])
|
202 |
+
outputs["content"].append(message["content"])
|
203 |
+
return pd.DataFrame(outputs)
|
204 |
+
|
205 |
+
return pd.DataFrame(distiset.to_pandas())
|
206 |
|
207 |
|
208 |
with gr.Blocks(
|
209 |
title="⚗️ Distilabel Dataset Generator",
|
210 |
head="⚗️ Distilabel Dataset Generator",
|
211 |
) as demo:
|
212 |
+
get_login_button()
|
213 |
+
|
214 |
dataset_description = gr.Textbox(
|
215 |
label="Provide a description of the dataset",
|
216 |
value="A chemistry dataset for an assistant that explains chemical reactions and formulas",
|
|
|
247 |
)
|
248 |
with gr.Column():
|
249 |
num_rows = gr.Number(
|
250 |
+
value=100, label="Number of rows in the dataset", minimum=1
|
251 |
)
|
252 |
|
253 |
dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
|
|
|
258 |
|
259 |
btn_generate_full_dataset.click(
|
260 |
fn=_generate_dataset,
|
261 |
+
inputs=[system_prompt, num_turns, num_rows, dataset_name_push_to_hub],
|
|
|
262 |
)
|
263 |
|
264 |
demo
|
src/distilabel_dataset_generator/utils.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from gradio.oauth import (
|
3 |
+
OAUTH_CLIENT_ID,
|
4 |
+
OAUTH_CLIENT_SECRET,
|
5 |
+
OAUTH_SCOPES,
|
6 |
+
OPENID_PROVIDER_URL,
|
7 |
+
get_space,
|
8 |
+
)
|
9 |
+
|
10 |
+
if (
|
11 |
+
all(
|
12 |
+
[
|
13 |
+
OAUTH_CLIENT_ID,
|
14 |
+
OAUTH_CLIENT_SECRET,
|
15 |
+
OAUTH_SCOPES,
|
16 |
+
OPENID_PROVIDER_URL,
|
17 |
+
]
|
18 |
+
)
|
19 |
+
or get_space() is None
|
20 |
+
):
|
21 |
+
from gradio.oauth import OAuthToken
|
22 |
+
else:
|
23 |
+
OAuthToken = str
|
24 |
+
|
25 |
+
|
26 |
+
def get_login_button():
|
27 |
+
if (
|
28 |
+
all(
|
29 |
+
[
|
30 |
+
OAUTH_CLIENT_ID,
|
31 |
+
OAUTH_CLIENT_SECRET,
|
32 |
+
OAUTH_SCOPES,
|
33 |
+
OPENID_PROVIDER_URL,
|
34 |
+
]
|
35 |
+
)
|
36 |
+
or get_space() is None
|
37 |
+
):
|
38 |
+
return gr.LoginButton(
|
39 |
+
value="Sign in with Hugging Face - a login will reset the data!",
|
40 |
+
size="lg",
|
41 |
+
)
|