Commit e919aa3 · Niv Sardi committed · 1 parent: acbdf2a

move from complicated multi-container to simpler design with a shell script

Changed files:
- Dockerfile.python                      +8  -4
- README.org                             +2  -1
- docker-compose.yaml                    +64 -39
- python/{main.py → get_entities.py}     +33 -25
- run.sh                                 +12 -0
Dockerfile.python  CHANGED

@@ -2,8 +2,12 @@ FROM docker.io/python:3-slim-buster
 MAINTAINER Niv Sardi <[email protected]>
 WORKDIR /app
 
-
-
-RUN …
+RUN apt update && apt install -y libcairo2 libglib2.0-0 libgl1 && rm -rf /var/cache/apt
+COPY python/requirements.txt ./python/requirements.txt
+RUN pip install -r ./python/requirements.txt
 
-
+COPY run.sh ./run
+RUN chmod +x run
+COPY python ./python
+
+CMD env PATH=$PATH:/usr/local/bin python3 ./python/watcher.py
README.org  CHANGED

@@ -18,7 +18,8 @@ The process is pretty simple:
 * running
 #+begin_src sh
 # build the training dataset
-docker-compose up --build --remove-orphans
+docker-compose up --build --remove-orphans -d
+docker-compose exec python ./run
 
 # run the training on your machine or collab
 # https://colab.research.google.com/drive/10R7uwVJJ1R1k6oTjbkkhxPDka7COK-WE
docker-compose.yaml  CHANGED

@@ -1,50 +1,75 @@
 version: "3.9" # optional since v1.27.0
 services:
-  …
+  python:
     build:
       dockerfile: Dockerfile.python
       context: .
-    command: "python3 src/main.py"
-    volumes:
-      - "./data:/app/data:z"
-
-  puppet:
-    build:
-      dockerfile: Dockerfile.deno
-      context: .
-    links:
-      - browserless
     environment:
-      …
-      …
-      DEBUG: "puppet"
-    depends_on:
-      - "browserless"
-    # command: "sh -c 'while echo deno; do sleep 3h; done'" # debug
-    command: "deno run --allow-net --allow-env --allow-read --allow-write src/index.ts"
-    volumes:
-      - "./deno:/app/src:z" # for debugging
-      - "./data:/app/data:z"
-    #restart: unless-stopped:600
-    deploy:
-      restart_policy:
-        condition: any
-        delay: 600s
-        window: 300s
-
-  cutter:
-    build:
-      dockerfile: Dockerfile.python
-      context: .
+      GECKO_HOST: geckodriver
+      GECKO_PORT: 4444
     depends_on:
-      - "…
+      - "geckodriver"
+    links:
+      - "geckodriver"
     volumes:
-      - "./python:/app/…
+      - "./python:/app/python:z" # for debugging
       - "./data:/app/data:z"
 
-
-
-
-
-
+  geckodriver:
+    image: docker.io/instrumentisto/geckodriver
+    entrypoint: ["sh", "-c", "while true; do geckodriver --binary=/opt/firefox/firefox --log warn --port 4444 --host 0.0.0.0; sleep 2; done"]
+    ports: # this is not required but nice for local debug
+      - "4444:4444"
+  # crawler:
+  #   build:
+  #     dockerfile: Dockerfile.python
+  #     context: .
+  #   command: "sh -c 'while true; do python3 src/get_entities.py; touch data/entities.csv; sleep 24h; done'"
+  #   volumes:
+  #     - "./data:/app/data:z"
+
+  # cutter:
+  #   build:
+  #     dockerfile: Dockerfile.python
+  #     context: .
+  #   environment:
+  #     GECKO_HOST: geckodriver
+  #     GECKO_PORT: 4444
+  #   depends_on:
+  #     - "geckodriver"
+  #   links:
+  #     - "geckodriver"
+  #   volumes:
+  #     - "./python:/app/src:z" # for debugging
+  #     - "./data:/app/data:z"
+
+  # browserless:
+  #   image: docker.io/zenika/alpine-chrome
+  #   entrypoint: ["sh", "-c", "while true; do chromium-browser --headless --use-gl=swiftshader --disable-software-rasterizer --disable-dev-shm-usage --no-sandbox --remote-debugging-address=0.0.0.0 --remote-debugging-port=3000; sleep 2; done"]
+  #   ports:
+  #     - "3000:3000"
+
+  # puppet:
+  #   build:
+  #     dockerfile: Dockerfile.deno
+  #     context: .
+  #   links:
+  #     - browserless
+  #   environment:
+  #     BROWSERLESS_HOST: browserless
+  #     BROWSERLESS_PORT: 3000
+  #     DEBUG: "puppet"
+  #   depends_on:
+  #     - "browserless"
+  #   #command: "sh -c 'while echo deno; do sleep 3h; done'" # debug
+  #   command: "deno run --allow-net --allow-env --allow-read --allow-write src/index.ts"
+  #   volumes:
+  #     - "./deno:/app/src:z" # for debugging
+  #     - "./data:/app/data:z"
+  #   #restart: unless-stopped:600
+  #   deploy:
+  #     restart_policy:
+  #       condition: any
+  #       delay: 600s
+  #       window: 300s
 
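The compose file hands the python service its WebDriver endpoint through the GECKO_HOST and GECKO_PORT environment variables; the scraper code that consumes them lives in python/ and is not part of this diff. Purely as a minimal sketch, assuming Selenium is among the pinned requirements, a client inside the python container might connect like this:

import os

from selenium import webdriver

# GECKO_HOST / GECKO_PORT come from docker-compose.yaml; the fallbacks below
# match the geckodriver service and its published 4444:4444 port mapping.
host = os.environ.get("GECKO_HOST", "localhost")
port = os.environ.get("GECKO_PORT", "4444")

# Hypothetical usage: drive the remote Firefox sitting behind geckodriver.
options = webdriver.FirefoxOptions()
driver = webdriver.Remote(command_executor=f"http://{host}:{port}", options=options)
try:
    driver.get("https://www.bcra.gob.ar/")
    print(driver.title)
finally:
    driver.quit()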
python/{main.py → get_entities.py}  RENAMED

@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import csv
 import requests
 import shutil
@@ -14,43 +15,50 @@ page = requests.get(URL)
 soup = BeautifulSoup(page.content, 'html.parser')
 
 options = soup.find(class_='form-control').find_all('option')
-mkdir.make_dirs([defaults.DATA_PATH])
+mkdir.make_dirs([defaults.DATA_PATH, defaults.LOGOS_DATA_PATH])
 
+i = 0
 with open(f'{defaults.MAIN_CSV_PATH}.tmp', 'w', newline='') as csvfile:
     writer = csv.writer(csvfile)
     writer.writerow(Entity.row_names())
 
-    i = 0
     bar = ChargingBar('Processing', max=len(options))
     for o in options[1:]:
-        …
-            a = a.attrs['href']
-        except AttributeError:
-            a = soup.select_one(selectors.entity_mailto)
-            try:
-                a = …
-
-        e = Entity(name, id=i, bco=bco, logo=str(img), url=str(a))
-        writer.writerow(e.to_row())
+        def get_bco():
+            (name, bco)= (o.text, o.attrs['value'])
+            page = requests.post(URL, data={'bco': bco})
+            soup = BeautifulSoup(page.content, 'html.parser')
+            try:
+                img = soup.select_one(selectors.logosbancos).attrs['src']
+                img = img.replace('../', 'https://www.bcra.gob.ar/')
+                fn = f"{defaults.LOGOS_DATA_PATH}/{bco}.0.png"
+                web.get_img_logo(img, fn)
+            except AttributeError as err:
+                print('img', name, err)
+                img = None
+
+            a = soup.select_one(selectors.entity_http)
+            try:
+                a = a.attrs['href']
+            except AttributeError:
+                a = soup.select_one(selectors.entity_mailto)
+                try:
+                    a = 'http://' + a.attrs['href'].split('@')[1]
+                except TypeError:
+                    print('ERROR', a)
+
+            e = Entity(name, id=i, bco=bco, logo=str(img), url=str(a))
+            writer.writerow(e.to_row())
+
+        try:
+            get_bco()
+        except Exception as e:
+            print(f'Error processing: {e}')
         i+=1
         bar.next()
     bar.finish()
 
 shutil.move(f'{defaults.MAIN_CSV_PATH}.tmp', defaults.MAIN_CSV_PATH)
-print('scrape finished')
+print(f'scrape finished, found {i} entities, dumped to {defaults.MAIN_CSV_PATH}')
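The new per-entity code calls web.get_img_logo to save each bank logo under the logos data path; that helper lives elsewhere in python/ and is not shown in this commit. A plausible sketch of such a download helper, purely for illustration and not the project's actual implementation:

import shutil

import requests

def get_img_logo(url: str, filename: str) -> None:
    """Illustrative stand-in for the project's web.get_img_logo (not in this diff)."""
    # Stream the image to disk so logos are never fully buffered in memory.
    r = requests.get(url, stream=True, timeout=30)
    r.raise_for_status()
    with open(filename, "wb") as f:
        shutil.copyfileobj(r.raw, f)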
run.sh  ADDED

@@ -0,0 +1,12 @@
+#!/bin/sh
+
+PY=python3
+echo "🏛 fetching entities"
+${PY} ./python/get_entities.py
+echo "🌏 getting vendor data"
+${PY} ./python/vendor.py --parallel $(cat /proc/cpuinfo | grep processor | wc -l)
+echo "✨ augmenting data"
+${PY} ./python/augment.py
+echo "🖼 croping augmented data"
+${PY} ./python/crop.py ./data/augmented/images
+echo "TODO: 🧠 train model"
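run.sh sizes vendor.py's --parallel flag from /proc/cpuinfo, one worker per "processor" line. If vendor.py ever needs to resolve that default on its own, the Python-side equivalent would be something like the sketch below (vendor.py itself is not shown in this commit, so this is only an assumption about its interface):

import os

# Same count run.sh derives with: cat /proc/cpuinfo | grep processor | wc -l
workers = os.cpu_count() or 1
print(f"running with {workers} parallel workers")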