amaye15 committed on
Commit
5150064
·
1 Parent(s): 60283f6
Files changed (2) hide show
  1. app.py +22 -7
  2. dev.ipynb +123 -7
app.py CHANGED
@@ -7,6 +7,26 @@ from huggingface_hub import WebhooksServer, WebhookPayload
7
  from datasets import Dataset, load_dataset, disable_caching
8
  from fastapi import BackgroundTasks, Response, status
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  # Disable caching globally for Hugging Face datasets
11
  disable_caching()
12
 
@@ -37,13 +57,11 @@ def get_data():
37
  """
38
  ds = load_dataset(
39
  DS_NAME,
40
- cache_dir=DATA_DIR,
41
  streaming=True,
42
- download_mode="force_redownload",
43
  )
44
  for row in ds["train"]:
45
  yield row
46
- gc.collect()
47
 
48
 
49
  def process_and_push_data():
@@ -53,16 +71,13 @@ def process_and_push_data():
53
  Removes existing data directory if it exists, recreates it, processes
54
  the dataset, and pushes the processed dataset to the hub.
55
  """
56
- if DATA_DIR.exists():
57
- shutil.rmtree(DATA_DIR)
58
- DATA_DIR.mkdir(parents=True, exist_ok=True)
59
 
60
  # Process data using the generator and push it to the hub
61
  ds_processed = Dataset.from_generator(get_data)
62
  ds_processed.push_to_hub(TARGET_REPO)
63
 
64
  logger.info("Data processed and pushed to the hub.")
65
- gc.collect()
66
 
67
 
68
  # Initialize the WebhooksServer with Gradio interface (if needed)
 
7
  from datasets import Dataset, load_dataset, disable_caching
8
  from fastapi import BackgroundTasks, Response, status
9
 
10
+ import shutil
11
+ from pathlib import Path
12
+
13
+
14
+ def clear_huggingface_cache():
15
+ # Path to the Hugging Face cache directory
16
+ cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
17
+
18
+ # Remove the entire datasets directory
19
+ if cache_dir.exists() and cache_dir.is_dir():
20
+ shutil.rmtree(cache_dir)
21
+ print(f"Removed cache directory: {cache_dir}")
22
+ else:
23
+ print("Cache directory does not exist.")
24
+
25
+
26
+ # Example usage
27
+ clear_huggingface_cache()
28
+
29
+
30
  # Disable caching globally for Hugging Face datasets
31
  disable_caching()
32
 
 
57
  """
58
  ds = load_dataset(
59
  DS_NAME,
 
60
  streaming=True,
 
61
  )
62
  for row in ds["train"]:
63
  yield row
64
+ clear_huggingface_cache()
65
 
66
 
67
  def process_and_push_data():
 
71
  Removes existing data directory if it exists, recreates it, processes
72
  the dataset, and pushes the processed dataset to the hub.
73
  """
 
 
 
74
 
75
  # Process data using the generator and push it to the hub
76
  ds_processed = Dataset.from_generator(get_data)
77
  ds_processed.push_to_hub(TARGET_REPO)
78
 
79
  logger.info("Data processed and pushed to the hub.")
80
+ clear_huggingface_cache()
81
 
82
 
83
  # Initialize the WebhooksServer with Gradio interface (if needed)
dev.ipynb CHANGED
@@ -2,9 +2,52 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 28,
6
  "metadata": {},
7
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  "source": [
9
  "\n",
10
  "import os\n",
@@ -34,19 +77,92 @@
34
  "DATA_DIR = \"data\"\n",
35
  "p = os.path.join(os.getcwd(), DATA_DIR)\n",
36
  "\n",
37
- "if os.path.exists(p):\n",
38
- " shutil.rmtree(p)\n",
39
  "\n",
40
  "\n",
41
- "os.mkdir(p)\n",
42
  "\n",
43
  "def get_data():\n",
44
- " ds = load_dataset(DS_NAME, cache_dir=p, streaming=True)\n",
45
  " for row in ds[\"train\"]:\n",
46
  " yield row\n",
47
  "\n",
 
 
 
 
 
 
 
 
 
 
 
48
  "ds_processed = Dataset.from_generator(get_data)\n",
49
- "# ds_processed.push_to_hub(\"amaye15/tmp\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  ]
51
  },
52
  {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 7,
6
  "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "application/vnd.jupyter.widget-view+json": {
11
+ "model_id": "4b4f99f9ac7940a894807b88d339f866",
12
+ "version_major": 2,
13
+ "version_minor": 0
14
+ },
15
+ "text/plain": [
16
+ "Generating train split: 0 examples [00:00, ? examples/s]"
17
+ ]
18
+ },
19
+ "metadata": {},
20
+ "output_type": "display_data"
21
+ },
22
+ {
23
+ "data": {
24
+ "application/vnd.jupyter.widget-view+json": {
25
+ "model_id": "0ce6f3f8cc7f49f7a42d7b2219a12a7e",
26
+ "version_major": 2,
27
+ "version_minor": 0
28
+ },
29
+ "text/plain": [
30
+ "Downloading readme: 0%| | 0.00/5.24k [00:00<?, ?B/s]"
31
+ ]
32
+ },
33
+ "metadata": {},
34
+ "output_type": "display_data"
35
+ },
36
+ {
37
+ "data": {
38
+ "application/vnd.jupyter.widget-view+json": {
39
+ "model_id": "3d596e0ae9594943905996935ef84329",
40
+ "version_major": 2,
41
+ "version_minor": 0
42
+ },
43
+ "text/plain": [
44
+ "Resolving data files: 0%| | 0/60 [00:00<?, ?it/s]"
45
+ ]
46
+ },
47
+ "metadata": {},
48
+ "output_type": "display_data"
49
+ }
50
+ ],
51
  "source": [
52
  "\n",
53
  "import os\n",
 
77
  "DATA_DIR = \"data\"\n",
78
  "p = os.path.join(os.getcwd(), DATA_DIR)\n",
79
  "\n",
80
+ "# if os.path.exists(p):\n",
81
+ "# shutil.rmtree(p)\n",
82
  "\n",
83
  "\n",
84
+ "# os.mkdir(p)\n",
85
  "\n",
86
  "def get_data():\n",
87
+ " ds = load_dataset(DS_NAME, streaming=True)\n",
88
  " for row in ds[\"train\"]:\n",
89
  " yield row\n",
90
  "\n",
91
+ "\n",
92
+ "\n",
93
+ "# def main():\n",
94
+ "# ds_processed = Dataset.from_generator(get_data)\n",
95
+ "# return\n",
96
+ "# # ds_processed.push_to_hub(\"amaye15/tmp\")\n",
97
+ "\n",
98
+ "\n",
99
+ "# main()\n",
100
+ "# import gc\n",
101
+ "\n",
102
  "ds_processed = Dataset.from_generator(get_data)\n",
103
+ "\n",
104
+ "# gc.collect()\n",
105
+ "# gc.c"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 4,
111
+ "metadata": {},
112
+ "outputs": [
113
+ {
114
+ "data": {
115
+ "text/plain": [
116
+ "Dataset({\n",
117
+ " features: ['image', 'masked_image', 'mask'],\n",
118
+ " num_rows: 59\n",
119
+ "})"
120
+ ]
121
+ },
122
+ "execution_count": 4,
123
+ "metadata": {},
124
+ "output_type": "execute_result"
125
+ }
126
+ ],
127
+ "source": [
128
+ "ds_processed"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 6,
134
+ "metadata": {},
135
+ "outputs": [
136
+ {
137
+ "name": "stdout",
138
+ "output_type": "stream",
139
+ "text": [
140
+ "Removed cache directory: /Users/andrewmayes/.cache/huggingface/datasets\n"
141
+ ]
142
+ }
143
+ ],
144
+ "source": [
145
+ "import shutil\n",
146
+ "from pathlib import Path\n",
147
+ "\n",
148
+ "# Path to the Hugging Face cache directory\n",
149
+ "cache_dir = Path.home() / \".cache\" / \"huggingface\" / \"datasets\"\n",
150
+ "\n",
151
+ "# Remove the entire datasets directory\n",
152
+ "if cache_dir.exists() and cache_dir.is_dir():\n",
153
+ " shutil.rmtree(cache_dir)\n",
154
+ " print(f\"Removed cache directory: {cache_dir}\")\n",
155
+ "else:\n",
156
+ " print(\"Cache directory does not exist.\")"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": [
165
+ "/Users/andrewmayes/.cache/huggingface/datasets"
166
  ]
167
  },
168
  {