Commit f1ab0d5 · Niv Sardi committed
1 parent: dc89ab8
implement python inotify watcher and add to docker-compose
- Dockerfile.python +8 -0
- crawler/entity.py +2 -1
- crawler/imtool.py +138 -0
- crawler/main.py +4 -3
- crawler/requirements.txt +1 -1
- crawler/screenshot.py +12 -5
- crawler/vendor.py +10 -45
- crawler/watcher.py +20 -0
- crawler/web.py +46 -0
- docker-compose.yaml +12 -3
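
Taken together, the commit wires up a small pipeline: screenshot.py now saves {bco}.png, {bco}.full.png and a {bco}.full.txt annotation under ./data, the new watcher.py reacts to the IN_CLOSE_WRITE inotify event for each .png, and imtool.crop slices the full-page screenshot into overlapping 800x800 tiles plus per-tile label files under ./data/squares. A minimal sketch of driving that cropping step by hand, reusing the same debug paths that imtool.py's own __main__ block points at:

# Sketch only: replicate what watcher.py does after a screenshot pair lands in ./data.
from imtool import read_bounding_boxes, crop

boxes = read_bounding_boxes("./data/debug.full.txt")  # "<bco> x y w h" annotation written by screenshot.py
crop("./data/debug.full.png", boxes)                  # writes 800x800 tiles and label files to ./data/squares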
Dockerfile.python
ADDED
@@ -0,0 +1,8 @@
+FROM docker.io/jjanzic/docker-python3-opencv
+MAINTAINER Niv Sardi <[email protected]>
+WORKDIR /app
+
+COPY crawler ./src
+RUN pip install -r ./src/requirements.txt
+
+CMD python3 ./src/watcher.py
crawler/entity.py
CHANGED
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 class Entity():
+    _DATA_PATH = './data'
     def __init__(self, name, bco, url=None, logo=None):
         self.name = name
         self.bco = bco
@@ -35,7 +36,7 @@ Entity:
 
     @property
     def DATA_PATH(self):
-        return
+        return self._DATA_PATH
 
     def to_row(self):
         return [self.name, self.bco, self.url, self.logo]
crawler/imtool.py
ADDED
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+
+import os
+import math
+import cv2
+import pathlib
+from typing import NamedTuple
+
+from entity import Entity
+
+TILE_SIZE = 800
+TILE_OVERLAP = 0.8
+
+class BoundingBox(NamedTuple):
+    x: float = 0.0
+    y: float = 0.0
+    w: float = 0.0
+    h: float = 0.0
+
+    @classmethod
+    def from_centroid(cls, c):
+        x = math.floor(c.x + c.w/2)
+        y = math.floor(c.y + c.h/2)
+        self = cls(x=x, y=y, w=math.ceil(c.w), h=math.ceil(c.h))
+        return self
+
+    @classmethod
+    def from_dict(cls, d):
+        self = cls(x=d['x'], y=d['y'], w=d['width'], h=d['height'])
+        return self
+
+class Centroid(BoundingBox):
+    @classmethod
+    def from_bounding_box(cls, b):
+        x = math.floor(b.x - c.w/2)
+        y = math.floor(b.y - c.h/2)
+        self = cls(x=x, y=y, w=math.ceil(c.w), h=math.ceil(c.h))
+
+def read_bounding_boxes(filename):
+    boxes = []
+    with open(filename, 'r') as f:
+        (x,y,w,h) = [float(i) for i in f.readline().split(' ')[1:]]
+        boxes.append(BoundingBox(x,y,w,h))
+    return boxes
+
+def floor_point(a, b):
+    return (math.floor(a), math.floor(b))
+
+def cut_img(im, s, e):
+    x = s[0]
+    y = s[1]
+    w = e[0] - x
+    h = e[1] - y
+
+    print("DEBUG", im.shape, x, y, w, h)
+    return im[y:h, x:w]
+
+def cut_logo(im, l):
+    (x, y, w, h) = floor_logo(l)
+    return im[x:w, y:h]
+
+def crop(fn, logos):
+    basename = os.path.basename(fn).replace('.png', '')
+    out = f"./data/squares"
+    pathlib.Path(out).mkdir(parents=True, exist_ok=True)
+
+    im = cv2.imread(fn)
+
+    (h, w, c) = im.shape
+    (tx, ty)= (
+        math.ceil(w/(TILE_SIZE*TILE_OVERLAP)),
+        math.ceil(h/(TILE_SIZE*TILE_OVERLAP))
+    )
+
+    print('shape', basename, tx, ty, h, w, logos)
+    for x in range(tx):
+        for y in range(ty):
+            color = (0,x*(255/tx),y*(255/ty))
+
+            fx = math.floor(x*(w - TILE_SIZE)/(tx))
+            fy = math.floor(y*(h - TILE_SIZE)/(ty))
+
+            start = (fx, fy)
+            end = (fx + TILE_SIZE, fy + TILE_SIZE)
+
+            #im = cv2.rectangle(im, start, end, color, 10)
+            li = []
+            for l in logos:
+                def intersect():
+                    six = l.x - fx
+                    siy = l.y - fy
+                    eix = six + l.w
+                    eiy = siy + l.h
+
+                    if six < 0:
+                        if six + l.w < 0:
+                            return None
+                        six = 0
+                    if siy < 0:
+                        if siy + l.h < 0:
+                            return None
+                        siy = 0
+                    if eix > TILE_SIZE:
+                        if eix - l.w > TILE_SIZE:
+                            return None
+                        eix = TILE_SIZE
+                    if eiy > TILE_SIZE:
+                        if eiy - l.h > TILE_SIZE:
+                            return None
+                        eiy = TILE_SIZE
+
+                    return (six, siy), (eix, eiy)
+
+                p = intersect()
+                if p:
+                    li.append(p)
+
+            c = (255, 0, 0)
+            nim = im[fy:fy+TILE_SIZE, fx:fx+TILE_SIZE]
+            name =f"{out}/{basename}.{x}.{y}"
+            cv2.imwrite(f"{name}.png", nim)
+            if len(li):
+                with open(f"{name}.txt", 'w') as f:
+                    for p in li:
+                        cw = p[1][0] - p[0][0]
+                        ch = p[1][1] - p[0][1]
+                        cx = cw/2 + p[0][0]
+                        cy = ch/2 + p[0][1]
+
+                        a = f"{basename} {cx/TILE_SIZE} {cy/TILE_SIZE} {cw/TILE_SIZE} {ch/TILE_SIZE}"
+                        f.write(a)
+                        print(a)
+
+if __name__ == '__main__':
+    boxes = read_bounding_boxes("./data/debug.full.txt")
+    print(boxes)
+    crop("./data/debug.full.png", boxes)
+
crawler/main.py
CHANGED
@@ -1,5 +1,5 @@
 import csv
-
+import pathlib
 import requests
 from bs4 import BeautifulSoup
 from progress.bar import ChargingBar
@@ -7,13 +7,14 @@ from progress.bar import ChargingBar
 from entity import Entity
 from common import selectors
 
+pathlib.Path(f"{Entity._DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
+
 URL = "http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp"
 page = requests.get(URL)
-
 soup = BeautifulSoup(page.content, "html.parser")
 
 options = soup.find(class_="form-control").find_all('option')
-with open('entidades.csv', 'w', newline='') as csvfile:
+with open('./data/entidades.csv', 'w', newline='') as csvfile:
     writer = csv.writer(csvfile)
     writer.writerow(Entity.row_names())
 
crawler/requirements.txt
CHANGED
@@ -1,3 +1,3 @@
 bs4==0.0.1
 progress==1.6
-
+inotify
crawler/screenshot.py
CHANGED
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 #
+
+import math
+
 from selenium import webdriver
 from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.common.by import By
@@ -11,18 +14,22 @@ options = webdriver.FirefoxOptions()
 options.add_argument("--headless")
 options.add_argument("--window-size=1920x8000")
 
+def coord_to_point(c):
+    x = math.floor(c['x'] + c['width']/2)
+    y = math.floor(c['y'] + c['height']/2)
+    return f"{x} {y} {math.ceil(c['width'])} {math.ceil(c['height'])}"
+
 driver = webdriver.Firefox(options=options)
 def sc_entity(e: Entity):
     print(e)
     driver.get(e.url)
-    driver.save_screenshot(f"{e.DATA_PATH}/
-    driver.save_full_page_screenshot(f"{e.DATA_PATH}/
+    driver.save_screenshot(f"{e.DATA_PATH}/{e.bco}.png")
+    driver.save_full_page_screenshot(f"{e.DATA_PATH}/{e.bco}.full.png")
 
     logos = driver.find_elements(By.CSS_SELECTOR, selectors.logo)
-    with open(f"{e.DATA_PATH}/
+    with open(f"{e.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
         for i in logos:
-            f.write(
-            print(i.get_attribute('src'), i.rect)
+            f.write(f"{e.bco} {coord_to_point(i.rect)}")
 
 if __name__ == '__main__':
     sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
crawler/vendor.py
CHANGED
@@ -1,58 +1,21 @@
 #!/usr/bin/env python3
 import pathlib
-
+
 import shutil
 import csv
 import concurrent.futures
 import requests
-
+
 from progress.bar import ChargingBar
 
 from entity import Entity
-from common import selectors
 import screenshot
-
-def write_cert(e: Entity):
-    ssl_url = e.url.split("/")[2]
-    try:
-        cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
-        with open(f"{e.DATA_PATH}/cert", 'w') as f:
-            f.write(cert)
-    except Exception as err:
-        with open(f"{e.DATA_PATH}/error.log", 'w+') as f:
-            f.write(str(err))
-
-def get_logos(e: Entity, page):
-    soup = BeautifulSoup(page.content, "html.parser")
-    logos = soup.select(selectors.logo)
-
-    i = 0
-    lfn = []
-    for l in logos:
-        src = l.attrs['src']
-        ext = src.split('.')[-1].split('/')[-1]
-        try:
-            res = requests.get(src, stream=True)
-        except Exception:
-            res = requests.get(f"{e.url}/{src}")
-
-        fn = f"{e.DATA_PATH}/{i}.{ext}"
-        with open(fn, "wb") as f:
-            shutil.copyfileobj(res.raw, f)
-        lfn.append(fn)
-        i+=1
+import web
 
 def query_vendor_site(e: Entity):
-
-
-
-    page = requests.get(e.url)
-    except Exception:
-        e.url = e.url.replace('http', 'https')
-        page = requests.get(e.url)
-
-    write_cert(e)
-    get_logos(e, page)
+    page = web.get_page(e)
+    fn = web.get_cert(e)
+    lfn = web.get_logos(e, page)
     screenshot.sc_entity(e)
     return (fn, lfn)
 
@@ -73,8 +36,10 @@ def from_csv(fn):
         bar.next()
     bar.finish()
 
-#query_vendor_site('http://www.bancoprovincia.com.ar', 'debug')
+#query_vendor_site(Entity.from_dict({'url':'http://www.bancoprovincia.com.ar', 'bco':'debug'}))
 #exit()
 
 if __name__ == '__main__':
-
+    #pathlib.Path(e.DATA_PATH).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(f"{Entity._DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
+    from_csv(f"{Entity._DATA_PATH}/entidades.csv")
crawler/watcher.py
ADDED
@@ -0,0 +1,20 @@
+import os
+import inotify.adapters
+from imtool import read_bounding_boxes, crop
+
+def watch(dir):
+    i = inotify.adapters.Inotify()
+    i.add_watch(dir)
+    for event in i.event_gen(yield_nones=False):
+        (_, type_names, path, filename) = event
+
+        if filename.endswith(".png") and type_names[0] in ['IN_CLOSE_WRITE']:
+            print(f"--PATH=[{path}] FILENAME=[{filename}] EVENT_TYPES={type_names}")
+            try:
+                bbs = read_bounding_boxes(os.path.join(path, filename.replace('.png', '.txt')))
+                crop(os.path.join(path, filename), bbs)
+            except Exception as e:
+                print(f"error: {e}")
+
+if __name__ == '__main__':
+    watch('./data')
crawler/web.py
ADDED
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+import ssl
+from bs4 import BeautifulSoup
+
+from entity import Entity
+from common import selectors
+def get_page(e: Entity):
+    try:
+        page = requests.get(e.url)
+    except Exception:
+        e.url = e.url.replace('http', 'https')
+        page = requests.get(e.url)
+    return page
+
+def get_cert(e: Entity):
+    ssl_url = e.url.split("/")[2]
+    try:
+        cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
+        fn = f"{e.DATA_PATH}/{e.bco}.cert"
+        with open(fn, 'w') as f:
+            f.write(cert)
+    except Exception as err:
+        with open(f"{fn}.error.log", 'w+') as f:
+            f.write(str(err))
+    return fn
+
+def get_logos(e: Entity, page):
+    soup = BeautifulSoup(page.content, "html.parser")
+    logos = soup.select(selectors.logo)
+
+    i = 0
+    lfn = []
+    for l in logos:
+        src = l.attrs['src']
+        ext = src.split('.')[-1].split('/')[-1]
+        try:
+            res = requests.get(src, stream=True)
+        except Exception:
+            res = requests.get(f"{e.url}/{src}")
+
+        fn = f"{e.DATA_PATH}/logos/{e.bco}.{i}.{ext}"
+        with open(fn, "wb") as f:
+            shutil.copyfileobj(res.raw, f)
+        lfn.append(fn)
+        i+=1
+    return lfn
docker-compose.yaml
CHANGED
@@ -12,9 +12,10 @@ services:
       DEBUG: "puppet"
     depends_on:
       - "browserless"
-    command: "sh -c 'while echo deno; do sleep 3h; done'" #
+    # command: "sh -c 'while echo deno; do sleep 3h; done'" #
+    command: "deno run --allow-net --allow-env --allow-read --allow-write src/index.ts"
     volumes:
-      - "./src
+      # - "./src:/app/src:z" # for debugging
       - "./data:/app/data:z"
     #restart: unless-stopped:600
     deploy:
@@ -22,7 +23,15 @@ services:
         condition: any
         delay: 600s
         window: 300s
-
+  cutter:
+    build:
+      dockerfile: Dockerfile.python
+      context: .
+    depends_on:
+      - "puppet"
+    volumes:
+      # - "./crawler:/app/src:z" # for debugging
+      - "./data:/app/data:z"
   browserless:
     image: docker.io/zenika/alpine-chrome
     entrypoint: ["sh", "-c", "while true; do chromium-browser --headless --use-gl=swiftshader --disable-software-rasterizer --disable-dev-shm-usage --no-sandbox --remote-debugging-address=0.0.0.0 --remote-debugging-port=3000; sleep 2; done"]