Commit f1ab0d5 · Niv Sardi committed
1 parent: dc89ab8
implement python inotify watcher and add to docker-compose
- Dockerfile.python +8 -0
- crawler/entity.py +2 -1
- crawler/imtool.py +138 -0
- crawler/main.py +4 -3
- crawler/requirements.txt +1 -1
- crawler/screenshot.py +12 -5
- crawler/vendor.py +10 -45
- crawler/watcher.py +20 -0
- crawler/web.py +46 -0
- docker-compose.yaml +12 -3
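
Taken together, the commit wires up a small pipeline: screenshot.py now saves {bco}.png, {bco}.full.png and a {bco}.full.txt annotation under ./data, the new watcher.py reacts to the IN_CLOSE_WRITE inotify event for each .png, and imtool.crop slices the full-page screenshot into overlapping 800x800 tiles plus per-tile label files under ./data/squares. A minimal sketch of driving that cropping step by hand, reusing the same debug paths that imtool.py's own __main__ block points at:

# Sketch only: replicate what watcher.py does after a screenshot pair lands in ./data.
from imtool import read_bounding_boxes, crop

boxes = read_bounding_boxes("./data/debug.full.txt")  # "<bco> x y w h" annotation written by screenshot.py
crop("./data/debug.full.png", boxes)                  # writes 800x800 tiles and label files to ./data/squares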
Dockerfile.python
ADDED
@@ -0,0 +1,8 @@
+FROM docker.io/jjanzic/docker-python3-opencv
+MAINTAINER Niv Sardi <[email protected]>
+WORKDIR /app
+
+COPY crawler ./src
+RUN pip install -r ./src/requirements.txt
+
+CMD python3 ./src/watcher.py
crawler/entity.py
CHANGED
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 class Entity():
+    _DATA_PATH = './data'
     def __init__(self, name, bco, url=None, logo=None):
         self.name = name
         self.bco = bco
@@ -35,7 +36,7 @@ Entity:
 
     @property
     def DATA_PATH(self):
-        return
+        return self._DATA_PATH
 
     def to_row(self):
         return [self.name, self.bco, self.url, self.logo]
crawler/imtool.py
ADDED
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+
+import os
+import math
+import cv2
+import pathlib
+from typing import NamedTuple
+
+from entity import Entity
+
+TILE_SIZE = 800
+TILE_OVERLAP = 0.8
+
+class BoundingBox(NamedTuple):
+    x: float = 0.0
+    y: float = 0.0
+    w: float = 0.0
+    h: float = 0.0
+
+    @classmethod
+    def from_centroid(cls, c):
+        x = math.floor(c.x + c.w/2)
+        y = math.floor(c.y + c.h/2)
+        self = cls(x=x, y=y, w=math.ceil(c.w), h=math.ceil(c.h))
+        return self
+
+    @classmethod
+    def from_dict(cls, d):
+        self = cls(x=d['x'], y=d['y'], w=d['width'], h=d['height'])
+        return self
+
+class Centroid(BoundingBox):
+    @classmethod
+    def from_bounding_box(cls, b):
+        x = math.floor(b.x - c.w/2)
+        y = math.floor(b.y - c.h/2)
+        self = cls(x=x, y=y, w=math.ceil(c.w), h=math.ceil(c.h))
+
+def read_bounding_boxes(filename):
+    boxes = []
+    with open(filename, 'r') as f:
+        (x,y,w,h) = [float(i) for i in f.readline().split(' ')[1:]]
+        boxes.append(BoundingBox(x,y,w,h))
+    return boxes
+
+def floor_point(a, b):
+    return (math.floor(a), math.floor(b))
+
+def cut_img(im, s, e):
+    x = s[0]
+    y = s[1]
+    w = e[0] - x
+    h = e[1] - y
+
+    print("DEBUG", im.shape, x, y, w, h)
+    return im[y:h, x:w]
+
+def cut_logo(im, l):
+    (x, y, w, h) = floor_logo(l)
+    return im[x:w, y:h]
+
+def crop(fn, logos):
+    basename = os.path.basename(fn).replace('.png', '')
+    out = f"./data/squares"
+    pathlib.Path(out).mkdir(parents=True, exist_ok=True)
+
+    im = cv2.imread(fn)
+
+    (h, w, c) = im.shape
+    (tx, ty)= (
+        math.ceil(w/(TILE_SIZE*TILE_OVERLAP)),
+        math.ceil(h/(TILE_SIZE*TILE_OVERLAP))
+    )
+
+    print('shape', basename, tx, ty, h, w, logos)
+    for x in range(tx):
+        for y in range(ty):
+            color = (0,x*(255/tx),y*(255/ty))
+
+            fx = math.floor(x*(w - TILE_SIZE)/(tx))
+            fy = math.floor(y*(h - TILE_SIZE)/(ty))
+
+            start = (fx, fy)
+            end = (fx + TILE_SIZE, fy + TILE_SIZE)
+
+            #im = cv2.rectangle(im, start, end, color, 10)
+            li = []
+            for l in logos:
+                def intersect():
+                    six = l.x - fx
+                    siy = l.y - fy
+                    eix = six + l.w
+                    eiy = siy + l.h
+
+                    if six < 0:
+                        if six + l.w < 0:
+                            return None
+                        six = 0
+                    if siy < 0:
+                        if siy + l.h < 0:
+                            return None
+                        siy = 0
+                    if eix > TILE_SIZE:
+                        if eix - l.w > TILE_SIZE:
+                            return None
+                        eix = TILE_SIZE
+                    if eiy > TILE_SIZE:
+                        if eiy - l.h > TILE_SIZE:
+                            return None
+                        eiy = TILE_SIZE
+
+                    return (six, siy), (eix, eiy)
+
+                p = intersect()
+                if p:
+                    li.append(p)
+
+            c = (255, 0, 0)
+            nim = im[fy:fy+TILE_SIZE, fx:fx+TILE_SIZE]
+            name =f"{out}/{basename}.{x}.{y}"
+            cv2.imwrite(f"{name}.png", nim)
+            if len(li):
+                with open(f"{name}.txt", 'w') as f:
+                    for p in li:
+                        cw = p[1][0] - p[0][0]
+                        ch = p[1][1] - p[0][1]
+                        cx = cw/2 + p[0][0]
+                        cy = ch/2 + p[0][1]
+
+                        a = f"{basename} {cx/TILE_SIZE} {cy/TILE_SIZE} {cw/TILE_SIZE} {ch/TILE_SIZE}"
+                        f.write(a)
+                        print(a)
+
+if __name__ == '__main__':
+    boxes = read_bounding_boxes("./data/debug.full.txt")
+    print(boxes)
+    crop("./data/debug.full.png", boxes)
+
crawler/main.py
CHANGED
@@ -1,5 +1,5 @@
 import csv
-
+import pathlib
 import requests
 from bs4 import BeautifulSoup
 from progress.bar import ChargingBar
@@ -7,13 +7,14 @@ from progress.bar import ChargingBar
 from entity import Entity
 from common import selectors
 
+pathlib.Path(f"{Entity._DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
+
 URL = "http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp"
 page = requests.get(URL)
-
 soup = BeautifulSoup(page.content, "html.parser")
 
 options = soup.find(class_="form-control").find_all('option')
-with open('entidades.csv', 'w', newline='') as csvfile:
+with open('./data/entidades.csv', 'w', newline='') as csvfile:
     writer = csv.writer(csvfile)
     writer.writerow(Entity.row_names())
 
crawler/requirements.txt
CHANGED
@@ -1,3 +1,3 @@
 bs4==0.0.1
 progress==1.6
-
+inotify
crawler/screenshot.py
CHANGED
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 #
+
+import math
+
 from selenium import webdriver
 from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.common.by import By
@@ -11,18 +14,22 @@ options = webdriver.FirefoxOptions()
 options.add_argument("--headless")
 options.add_argument("--window-size=1920x8000")
 
+def coord_to_point(c):
+    x = math.floor(c['x'] + c['width']/2)
+    y = math.floor(c['y'] + c['height']/2)
+    return f"{x} {y} {math.ceil(c['width'])} {math.ceil(c['height'])}"
+
 driver = webdriver.Firefox(options=options)
 def sc_entity(e: Entity):
     print(e)
     driver.get(e.url)
-    driver.save_screenshot(f"{e.DATA_PATH}/
-    driver.save_full_page_screenshot(f"{e.DATA_PATH}/
+    driver.save_screenshot(f"{e.DATA_PATH}/{e.bco}.png")
+    driver.save_full_page_screenshot(f"{e.DATA_PATH}/{e.bco}.full.png")
 
     logos = driver.find_elements(By.CSS_SELECTOR, selectors.logo)
-    with open(f"{e.DATA_PATH}/
+    with open(f"{e.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
         for i in logos:
-            f.write(
-            print(i.get_attribute('src'), i.rect)
+            f.write(f"{e.bco} {coord_to_point(i.rect)}")
 
 if __name__ == '__main__':
     sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
crawler/vendor.py
CHANGED
@@ -1,58 +1,21 @@
 #!/usr/bin/env python3
 import pathlib
-
+
 import shutil
 import csv
 import concurrent.futures
 import requests
-
+
 from progress.bar import ChargingBar
 
 from entity import Entity
-from common import selectors
 import screenshot
-
-def write_cert(e: Entity):
-    ssl_url = e.url.split("/")[2]
-    try:
-        cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
-        with open(f"{e.DATA_PATH}/cert", 'w') as f:
-            f.write(cert)
-    except Exception as err:
-        with open(f"{e.DATA_PATH}/error.log", 'w+') as f:
-            f.write(str(err))
-
-def get_logos(e: Entity, page):
-    soup = BeautifulSoup(page.content, "html.parser")
-    logos = soup.select(selectors.logo)
-
-    i = 0
-    lfn = []
-    for l in logos:
-        src = l.attrs['src']
-        ext = src.split('.')[-1].split('/')[-1]
-        try:
-            res = requests.get(src, stream=True)
-        except Exception:
-            res = requests.get(f"{e.url}/{src}")
-
-        fn = f"{e.DATA_PATH}/{i}.{ext}"
-        with open(fn, "wb") as f:
-            shutil.copyfileobj(res.raw, f)
-        lfn.append(fn)
-        i+=1
+import web
 
 def query_vendor_site(e: Entity):
-
-
-
-    page = requests.get(e.url)
-    except Exception:
-        e.url = e.url.replace('http', 'https')
-        page = requests.get(e.url)
-
-    write_cert(e)
-    get_logos(e, page)
+    page = web.get_page(e)
+    fn = web.get_cert(e)
+    lfn = web.get_logos(e, page)
     screenshot.sc_entity(e)
     return (fn, lfn)
 
@@ -73,8 +36,10 @@ def from_csv(fn):
         bar.next()
     bar.finish()
 
-#query_vendor_site('http://www.bancoprovincia.com.ar', 'debug')
+#query_vendor_site(Entity.from_dict({'url':'http://www.bancoprovincia.com.ar', 'bco':'debug'}))
 #exit()
 
 if __name__ == '__main__':
-
+    #pathlib.Path(e.DATA_PATH).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(f"{Entity._DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
+    from_csv(f"{Entity._DATA_PATH}/entidades.csv")
crawler/watcher.py
ADDED
@@ -0,0 +1,20 @@
+import os
+import inotify.adapters
+from imtool import read_bounding_boxes, crop
+
+def watch(dir):
+    i = inotify.adapters.Inotify()
+    i.add_watch(dir)
+    for event in i.event_gen(yield_nones=False):
+        (_, type_names, path, filename) = event
+
+        if filename.endswith(".png") and type_names[0] in ['IN_CLOSE_WRITE']:
+            print(f"--PATH=[{path}] FILENAME=[{filename}] EVENT_TYPES={type_names}")
+            try:
+                bbs = read_bounding_boxes(os.path.join(path, filename.replace('.png', '.txt')))
+                crop(os.path.join(path, filename), bbs)
+            except Exception as e:
+                print(f"error: {e}")
+
+if __name__ == '__main__':
+    watch('./data')
crawler/web.py
ADDED
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+import ssl
+from bs4 import BeautifulSoup
+
+from entity import Entity
+from common import selectors
+def get_page(e: Entity):
+    try:
+        page = requests.get(e.url)
+    except Exception:
+        e.url = e.url.replace('http', 'https')
+        page = requests.get(e.url)
+    return page
+
+def get_cert(e: Entity):
+    ssl_url = e.url.split("/")[2]
+    try:
+        cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
+        fn = f"{e.DATA_PATH}/{e.bco}.cert"
+        with open(fn, 'w') as f:
+            f.write(cert)
+    except Exception as err:
+        with open(f"{fn}.error.log", 'w+') as f:
+            f.write(str(err))
+    return fn
+
+def get_logos(e: Entity, page):
+    soup = BeautifulSoup(page.content, "html.parser")
+    logos = soup.select(selectors.logo)
+
+    i = 0
+    lfn = []
+    for l in logos:
+        src = l.attrs['src']
+        ext = src.split('.')[-1].split('/')[-1]
+        try:
+            res = requests.get(src, stream=True)
+        except Exception:
+            res = requests.get(f"{e.url}/{src}")
+
+        fn = f"{e.DATA_PATH}/logos/{e.bco}.{i}.{ext}"
+        with open(fn, "wb") as f:
+            shutil.copyfileobj(res.raw, f)
+        lfn.append(fn)
+        i+=1
+    return lfn
docker-compose.yaml
CHANGED
@@ -12,9 +12,10 @@ services:
       DEBUG: "puppet"
     depends_on:
       - "browserless"
-    command: "sh -c 'while echo deno; do sleep 3h; done'" #
+    # command: "sh -c 'while echo deno; do sleep 3h; done'" #
+    command: "deno run --allow-net --allow-env --allow-read --allow-write src/index.ts"
     volumes:
-      - "./src
+      # - "./src:/app/src:z" # for debugging
       - "./data:/app/data:z"
     #restart: unless-stopped:600
     deploy:
@@ -22,7 +23,15 @@ services:
         condition: any
         delay: 600s
         window: 300s
-
+  cutter:
+    build:
+      dockerfile: Dockerfile.python
+      context: .
+    depends_on:
+      - "puppet"
+    volumes:
+      # - "./crawler:/app/src:z" # for debugging
+      - "./data:/app/data:z"
   browserless:
     image: docker.io/zenika/alpine-chrome
     entrypoint: ["sh", "-c", "while true; do chromium-browser --headless --use-gl=swiftshader --disable-software-rasterizer --disable-dev-shm-usage --no-sandbox --remote-debugging-address=0.0.0.0 --remote-debugging-port=3000; sleep 2; done"]