bagbreizh committed on
Commit
640ae52
·
1 Parent(s): b7383a5
Files changed (3) hide show
  1. .gitignore +165 -0
  2. app.py +4 -6
  3. requirements.txt +46 -1
.gitignore ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ env_dev/
127
+ venv/
128
+ ENV/
129
+ env.bak/
130
+ venv.bak/
131
+
132
+ # Spyder project settings
133
+ .spyderproject
134
+ .spyproject
135
+
136
+ # Rope project settings
137
+ .ropeproject
138
+
139
+ # mkdocs documentation
140
+ /site
141
+
142
+ # mypy
143
+ .mypy_cache/
144
+ .dmypy.json
145
+ dmypy.json
146
+
147
+ # Pyre type checker
148
+ .pyre/
149
+
150
+ # pytype static type analyzer
151
+ .pytype/
152
+
153
+ # Cython debug symbols
154
+ cython_debug/
155
+
156
+ # PyCharm
157
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
160
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161
+ #.idea/
162
+ .DS_Store
163
+ actilex/private_dossiers/
164
+ actilex/core/credentials.json
165
+ actilex/core/token.json
app.py CHANGED
@@ -3,9 +3,8 @@ import requests
3
  import streamlit as st
4
  from huggingface_hub import InferenceClient
5
  from prompt import default_prompt, prompt_enhanced
6
- from transformers import GPT2Tokenizer
7
- # Initialize LLaMA tokenizer
8
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
9
  # Function to load the README.md directly from the Hugging Face API
10
  def load_readme(dataset_name: str):
11
  api_url = f"https://huggingface.co/datasets/{dataset_name}/raw/main/README.md"
@@ -21,9 +20,8 @@ def load_readme(dataset_name: str):
21
 
22
  # Function to check if the README content exceeds the token limit
23
  def check_token_limit(content: str, max_tokens: int = 7500):
24
- tokens = tokenizer.encode(content)
25
- if len(tokens) > max_tokens:
26
- truncated_content = tokenizer.decode(tokens[:max_tokens])
27
  st.warning("Warning: The README.md exceeds 8192 tokens. It has been truncated for evaluation. This may affect the quality of the evaluation results.")
28
  return truncated_content
29
  return content
 
3
  import streamlit as st
4
  from huggingface_hub import InferenceClient
5
  from prompt import default_prompt, prompt_enhanced
6
+
7
+
 
8
  # Function to load the README.md directly from the Hugging Face API
9
  def load_readme(dataset_name: str):
10
  api_url = f"https://huggingface.co/datasets/{dataset_name}/raw/main/README.md"
 
20
 
21
  # Function to check if the README content exceeds the token limit
def check_token_limit(content: str, max_tokens: int = 7500):
    """Return ``content``, truncated if its estimated size exceeds ``max_tokens``.

    No tokenizer is loaded; the token count is estimated as
    ``len(content) // 4`` (roughly four characters per token).

    Args:
        content: README text to check.
        max_tokens: Maximum estimated number of tokens to keep.

    Returns:
        ``content`` unchanged when the estimate is within ``max_tokens``;
        otherwise its first ``max_tokens * 4`` characters. A Streamlit
        warning is shown when truncation happens.
    """
    chars_per_token = 4  # ratio shared by the size estimate and the cut below

    if len(content) // chars_per_token > max_tokens:
        # Slice in characters, not tokens: cutting at ``max_tokens`` characters
        # would keep only about a quarter of the budget the check just applied.
        truncated_content = content[: max_tokens * chars_per_token]
        # NOTE(review): the message mentions 8192 tokens while the threshold is
        # ``max_tokens`` (default 7500) -- confirm the intended wording.
        st.warning("Warning: The README.md exceeds 8192 tokens. It has been truncated for evaluation. This may affect the quality of the evaluation results.")
        return truncated_content
    return content
requirements.txt CHANGED
@@ -1 +1,46 @@
1
- transformers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.4.1
2
+ attrs==24.2.0
3
+ blinker==1.8.2
4
+ cachetools==5.5.0
5
+ certifi==2024.8.30
6
+ charset-normalizer==3.4.0
7
+ click==8.1.7
8
+ filelock==3.16.1
9
+ fsspec==2024.9.0
10
+ gitdb==4.0.11
11
+ GitPython==3.1.43
12
+ huggingface-hub==0.25.2
13
+ idna==3.10
14
+ Jinja2==3.1.4
15
+ jsonschema==4.23.0
16
+ jsonschema-specifications==2024.10.1
17
+ markdown-it-py==3.0.0
18
+ MarkupSafe==3.0.1
19
+ mdurl==0.1.2
20
+ narwhals==1.9.3
21
+ numpy==2.1.2
22
+ packaging==24.1
23
+ pandas==2.2.3
24
+ pillow==10.4.0
25
+ protobuf==5.28.2
26
+ pyarrow==17.0.0
27
+ pydeck==0.9.1
28
+ Pygments==2.18.0
29
+ python-dateutil==2.9.0.post0
30
+ python-dotenv==1.0.1
31
+ pytz==2024.2
32
+ PyYAML==6.0.2
33
+ referencing==0.35.1
34
+ requests==2.32.3
35
+ rich==13.9.2
36
+ rpds-py==0.20.0
37
+ six==1.16.0
38
+ smmap==5.0.1
39
+ streamlit==1.39.0
40
+ tenacity==9.0.0
41
+ toml==0.10.2
42
+ tornado==6.4.1
43
+ tqdm==4.66.5
44
+ typing_extensions==4.12.2
45
+ tzdata==2024.2
46
+ urllib3==2.2.3