Add python code

пре 4 година · ab88abe59f
--- a/README.md
+++ b/README.md
@@ -2,5 +2,5 @@
 Some sample code taken from sites I've completed. They're written in
 Javascript, PHP, and SCSS. Some examples are incomplete and meant for
 demonstrative purposes only as I'm not comfortable sharing complete code of some
 live, potentially profitable projects.
 live important projects.

--- a/python/crawler.py
+++ b/python/crawler.py
@@ -0,0 +1,331 @@
 #!/usr/bin/python3
 # My first attempt at building a Selenium crawler. It works, but many parts of the code are unnecessary or
 # could have been written better.
 from selenium import webdriver
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.common.action_chains import ActionChains
 from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException
 from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
 from PIL import Image

 import base64
 import selenium
 import os, traceback, sys
 import random
 import json
 import time
 import requests

 class Spider():

    def __init__(self):
        """Load the settings, launch Firefox and immediately run the crawl.

        Settings are read from ``login.json`` in the working directory. The
        browser is created with the profile built by ``getDriverPrefs`` and a
        10 second implicit wait. Constructing a ``Spider`` performs the whole
        crawl, because ``start`` is called as the last step.
        """
        self.keys = webdriver.common.keys
        self.settings = self.getConfig('login.json')

        download_profile = self.getDriverPrefs()
        self.driver = webdriver.Firefox(firefox_profile=download_profile)
        self.driver.implicitly_wait(10)

        # Pause in seconds between download batches, drawn once per instance.
        self.waitTime = random.randint(4, 10)

        self.start()

    def getDriverPrefs(self):
        """Build the Firefox profile carrying the download preferences.

        Returns:
            A ``webdriver.FirefoxProfile`` with the download folder pointed at
            ``<current working directory>/Results/`` and ``text/csv`` listed in
            ``browser.helperApps.neverAsk.saveToDisk``.
        """
        profile = webdriver.FirefoxProfile()
        download_prefs = (
            ("browser.download.folderList", 2),
            ("browser.download.manager.showWhenStarting", False),
            ("browser.download.dir", os.getcwd()+'/Results/'),
            ("browser.helperApps.neverAsk.saveToDisk", 'text/csv'),
        )
        for pref_name, pref_value in download_prefs:
            profile.set_preference(pref_name, pref_value)
        return profile

    def getConfig(self, fpath):
        with open(fpath, 'r', encoding='utf-8') as login_file:
            return json.loads(login_file.read())


    def start(self):
        self.driver.get(self.settings["entry"][0])
        assert "Authorization" in self.driver.title

        self.loginClicks() #Logs in, navigates through prompts and terms pages

        #Put exception handler here for updating json file. Should work for all exceptions including C-c
        try:
            self.cleanSelects()
            self.getData()
            self.gracefulQuit()
        except KeyboardInterrupt:
            print ("Shutdown requested...exiting")
            self.gracefulQuit()
        except TimeoutException:
            self.__init__()
        except Exception:
            traceback.print_exc(file=sys.stdout)
        return

    def loginClicks(self):
        e = self.driver.find_element_by_id("userid")
        e.send_keys(self.settings["username"])
        e = self.driver.find_element_by_id("pin")
        e.send_keys(self.settings["password"])
        self.handleCaptcha(self.driver.find_element_by_css_selector, 'input.btn-primary').click()
        self.driver.find_element_by_id("chkAgree").click()
        time.sleep(self.settings['waitTime'])
        self.driver.find_element_by_class_name("action-agree").click()
        req = self.driver.find_element_by_css_selector('.CaBusiness')
        self.realClick(self.driver.find_element_by_css_selector('.CaBusiness a'), req)
        return

    def handleCaptcha(self, x, *args):
        try:
            x = x(*args)
            return x
        except (selenium.common.exceptions.TimeoutException, selenium.common.exceptions.NoSuchElementException):
            self.solveCaptcha()
            return x(*args)

    def solveCaptcha(self):
        #pulls image
        print('handling captcha')
        time.sleep(self.settings['waitTime'])
        body = self.driver.find_element_by_css_selector('body')
        self.driver.switch_to.default_content()
        body.click()
        self.wait()

        e = self.driver.find_element_by_css_selector('#captcha-image')
        imgName = "captcha.png"
        image = self.pullCaptcha(e, imgName)
        key = self.postCaptcha(image)
        time.sleep(self.settings['waitTime'])
        answer = self.getCaptcha(key)
        if answer == None:
            return
        self.captchaClicks(answer)
        if len(self.driver.find_elements_by_css_selector('#captcha-image')) != 0:
            print ('captcha was wrong')
            self.solveCaptcha()
        return

    def pullCaptcha(self, element, path, crop_box=(530, 405, 635, 440)):
        """Screenshot the page, crop out the captcha and return it as PNG bytes.

        Args:
            element: The captcha image element. It is not used for cropping
                (the crop box is given in absolute page pixels); the parameter
                is kept so existing callers keep working.
            path: File that first receives the full-page screenshot and is
                then overwritten with the cropped captcha.
            crop_box: ``(left, upper, right, lower)`` pixel box handed to
                ``Image.crop``. Defaults to the coordinates that used to be
                hard-coded, so existing calls behave as before.

        Returns:
            The bytes of the cropped PNG file.
        """
        # saves screenshot of entire page
        self.driver.save_screenshot(path)

        # Close the screenshot file before it is overwritten with the crop.
        with Image.open(path) as screenshot:
            captcha = screenshot.crop(tuple(crop_box))
        captcha.save(path, format='png')  # saves new cropped image

        with open(path, "rb") as image_file:
            return image_file.read()

    def postCaptcha(self, image):
        """Upload the captcha picture to the solving service.

        Args:
            image: PNG bytes of the captcha.

        Returns:
            The ``request`` field of the service's JSON reply, which
            ``getCaptcha`` later sends back as the ``id`` parameter.

        Raises:
            Exception: If the reply text contains ``ERROR``.
        """
        query = {
            'json': 1,
            'key': self.settings["captchaKey"],
            'method': 'post',
            'calc': 1,
        }
        upload = {'file': ('file.png', image, 'image/png', {'Expires': '0'})}

        response = requests.post(self.settings["captchaPOST"], files=upload,
                                 timeout=10, params=query)
        reply = response.text
        print(reply)
        if 'ERROR' in reply:
            raise Exception(f'Captcha Error: {reply}')

        return response.json()['request']


    def getCaptcha(self, i):
        """Poll the solving service until the answer for request ``i`` is ready.

        Args:
            i: Request id returned by ``postCaptcha``.

        Returns:
            The ``request`` field of the final JSON reply, or ``None`` when the
            reply was rejected (it contains ``UNSOLVABLE``, ``+``, ``-`` or
            ``=``); in that case the captcha is refreshed and a new
            ``solveCaptcha`` round has already been run.

        Raises:
            Exception: If the reply contains ``ERROR`` (and none of
                ``UNSOLVABLE``, ``+``, ``-``, which are checked first).
        """
        print(i)
        query = {'json': 1, 'key': self.settings["captchaKey"], 'action':'get', 'id':i}
        endpoint = self.settings["captchaGET"]

        while True:
            response = requests.get(endpoint, timeout=10, params=query)
            reply = response.text
            print(reply)

            # Order matters: the first group wins over ERROR, ERROR wins over '='.
            if any(marker in reply for marker in ('UNSOLVABLE', '+', '-')):
                rejected = True
            elif 'ERROR' in reply:
                raise Exception(f'Captcha Error: {reply}')
            else:
                rejected = '=' in reply

            if rejected:
                self.selectAndClick('#captcha-refresh')
                self.solveCaptcha()
                return

            if 'NOT_READY' not in reply:
                break
            time.sleep(10)

        a = response.json()['request']
        print(a)
        return a

    def selectAndClick(self, s):
        e = self.handleCaptcha(self.driver.find_element_by_css_selector, s)
        self.handleCaptcha(e.click)
        self.wait()
        return

    def captchaClicks(self, answer):
        #Last wrappers is needed in case the solution is wrong, other wrappers are extra
        inputBox = self.handleCaptcha(self.driver.find_element_by_css_selector, '#Attempt')
        self.handleCaptcha(inputBox.clear)
        self.handleCaptcha(inputBox.send_keys, answer)
        time.sleep(self.settings['waitTime'])
        btn = self.handleCaptcha(self.driver.find_element_by_css_selector, '.action-validate-captcha.originButtonCompact.ui-priority-primary')
        self.handleCaptcha(btn.click)
        return

    def realClick(self, target, req):
        """Hover over ``req``, then move to ``target`` and click it.

        Used when the site needs a special combination of pointer events. The
        hover and the click are performed as two separate action chains.
        """
        ActionChains(self.driver).move_to_element(req).perform()
        ActionChains(self.driver).move_to_element(target).click(target).perform()

    def getData(self):
        #Used for organizing sequence of bot commands according to configs

        if self.settings["completed"] >= self.settings["endPosition"]:
            print("Already at end position")
            self.gracefulQuit()
            return

        #Clear selects that may have been left by the last instance
        self.driver.get(self.settings['entry'][1])
        self.insertPosition(self.settings['completed'])
        self.selectRecords(False)

        p = 0
        while (self.settings["completed"] < self.settings["endPosition"]
               and p < self.settings["interval"]):
            self.clickSequence()
            self.saveCompleted()
            time.sleep(self.waitTime)
            p = p + 1
        return

    def insertPosition(self, p):
        """Jump the listing to position ``p`` through the page-number box.

        Clicks the enabled page indicator, types ``p`` into the input that
        appears (any input other than the captcha ``#Attempt`` field) and
        presses Return. If an element has gone stale or is not interactable,
        the method waits twice and starts over.
        """
        locate = self.driver.find_element_by_css_selector
        try:
            page_indicator = self.handleCaptcha(locate, 'div.page.click-enterkey:not(.disabled)')
            self.wait()
            page_indicator.click()
            self.wait()
            position_field = self.handleCaptcha(locate, 'input:not(#Attempt)')
            self.wait()
            self.handleCaptcha(position_field.send_keys, p)
            self.handleCaptcha(position_field.send_keys, Keys.RETURN)
        except (StaleElementReferenceException, ElementNotInteractableException):
            for _ in range(2):
                self.wait()
            self.insertPosition(p)

    def wait(self):
        time.sleep(self.settings['waitTime'])
        return

    def selectRecords(self, ifSelect):
        #This may create a bug where some records are missed because of captcha prompt
        for x in range(0,10):
            self.wait()
            #b = self.handleCaptcha(EC.visibility_of_element_located, (By.CSS_SELECTOR, '#checkboxCol.action-tag-all'))
            #w = self.handleCaptcha(WebDriverWait, self.driver, 10)
            #u = self.handleCaptcha(w.until, b)
            u = self.handleCaptcha(self.driver.find_element_by_css_selector, '#checkboxCol.action-tag-all')
            #allButton = self.handleCaptcha(WebDriverWait(self.driver, 10).until(
            #)
            allButton = u
            checked = allButton.get_attribute('checked')
            if (checked == None) == ifSelect:
                self.handleCaptcha(allButton.click)
                time.sleep(self.settings['waitTime'])

            nextButton = self.handleCaptcha(self.driver.find_element_by_css_selector, ".next.button")
            self.handleCaptcha(nextButton.click)
        return

    def clickSequence(self):
        #Decides which clicks should be done next to collect records
        url = self.settings["entry"][1]
        self.driver.get(url)
        startPosition = str(self.settings["completed"] + 1)

        self.handleCaptcha(self.insertPosition, startPosition)
        time.sleep(3)
        self.selectRecords(True)
        e = self.handleCaptcha(self.driver.find_element_by_css_selector, "a.download.action-download")
        e.click()
        self.wait()
        e = self.handleCaptcha(self.driver.find_element_by_css_selector, "#detailDetail")
        e.click()
        e = self.handleCaptcha(self.driver.find_element_by_css_selector,
                          '.originButton.ui-priority-primary.view-on-exportlimit.view-on-randomsample.action-download')
        e.click()
        time.sleep(self.settings['waitTime'])

        #Records need to be deselected to select new ones
        self.driver.get(url)
        self.handleCaptcha(self.insertPosition, startPosition)
        self.selectRecords(False)

        self.settings["completed"] = self.settings["completed"] + 10
        print(f'completed: {self.settings["completed"]}')
        return

    def cleanSelects(self):
        if self.settings["cleanMode"][0]:
            url = self.settings["entry"][1]
            cleanMode = self.settings["cleanMode"]
            start = cleanMode[1]
            stop = cleanMode[2]
        else:
            return

        self.driver.get(url)
        self.handleCaptcha(self.insertPosition, str(start))

        #This will have side effects
        dif = int((start - stop)/10)

        for a in range(0, dif):
            self.selectRecords(False)
        return

    def saveCompleted(self):
        with open('login.json', 'w', encoding='utf-8') as login_file:
            login_file.write(json.dumps(self.settings, indent=4, sort_keys=True))
            return
        return

    def gracefulQuit(self):
        self.saveCompleted()
        return


 if __name__ == "__main__":
    # Constructing the Spider runs the whole crawl: __init__ ends by calling start().
    spider = Spider()

--- a/python/scrape.py
+++ b/python/scrape.py
@@ -0,0 +1,121 @@
 #!/usr/bin/python3
 #A simple crawler for RRP Canada I made for a coding assessment
 from selenium import webdriver #The selenium python3 module is required
 from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.common.action_chains import ActionChains

 import random
 import time
 import pymongo
 from pymongo import MongoClient
 from io import StringIO

 # Scraping can become inefficient if frequent page hits cause the IP to be
 # blocked. A good strategy is to mimic the browsing behaviour of a real user
 # with random delays between page actions of 1-10 seconds, and a rest (hit
 # speed) of a few hours. Rotation of IP, account, and user agent could also be
 # added later to reduce suspicion.

 import os

 # Seconds to rest between two full crawls (3 hours).
 hit_speed = 60*60*3
 # Bounds, in seconds, for the random delay between page actions.
 max_wait = 10
 min_wait = 1

 # SECURITY: credentials should not live in source control. They can be
 # supplied through the RRP_LOGIN_USERNAME / RRP_LOGIN_PASSWORD environment
 # variables; the literals below remain only as fallbacks so existing setups
 # keep working. NOTE(review): rotate this password and drop the fallbacks.
 login_username = os.environ.get("RRP_LOGIN_USERNAME", "etceterax+bot@protonmail.com")
 login_password = os.environ.get("RRP_LOGIN_PASSWORD", "7}I8)dZ-GVv|]&jK]<]^$B,y$")

 ## Initialize the MongoDB client and a headless firefox browser

 client = MongoClient()
 db = client.optima
 products = db.products
 opts = webdriver.FirefoxOptions()
 opts.headless = True

 # products.create_index([('name', pymongo.ASCENDING), ('category',
 #     pymongo.ASCENDING), ('by', pymongo.ASCENDING), ('supplied_by',
 #         pymongo.ASCENDING)], unique=True)

 def wait():
    """Sleep a random whole number of seconds from ``min_wait`` to ``max_wait`` inclusive."""
    pause = random.randint(min_wait, max_wait)
    time.sleep(pause)

 def start():
    """Open the RRP Canada login page and check that the sign-in screen loaded.

    Raises:
        AssertionError: If the page title does not contain "Sign In".
    """
    login_url = "https://www.rrpcanada.org/#/login"
    driver.get(login_url)
    wait()
    page_title = driver.title
    assert "Sign In" in page_title

 def login():
    """Fill in the sign-in form with the configured credentials and submit it.

    Both fields are located first; the form is submitted by sending Return to
    the password field, followed by a random pause.
    """
    email_field = driver.find_element_by_id("email")
    password_field = driver.find_element_by_id("password")

    email_field.send_keys(login_username)
    for keystrokes in (login_password, Keys.RETURN):
        password_field.send_keys(keystrokes)
    wait()

 def getProducts():
    """Navigate to the product list and store every product found there.

    Clicks through the user-type page to the available-products page, waits
    for the first line-item button to be present, then opens each product
    modal in turn, saves it with ``getProduct`` and closes the modal.
    """
    item_selector = "div.table div.line-item button"

    wait()
    driver.find_element_by_css_selector("a[href*='#/user-type']").click()
    wait()
    driver.find_element_by_css_selector("a[href*='#/available-products-services']").click()
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    wait()
    total = len(driver.find_elements_by_css_selector(item_selector))
    print(f'{total} results')

    #During account rotation, this could start with value of b not already stored
    for index in range(total):
        # Look the buttons up again on every pass and click the fresh element.
        # Previously the re-queried list was assigned but the loop kept using
        # the elements found before the first modal was opened, which can be
        # stale by then.
        buttons = driver.find_elements_by_css_selector(item_selector)
        if index >= len(buttons):
            # The list shrank while crawling; nothing left to open.
            break
        button = buttons[index]
        driver.execute_script("arguments[0].scrollIntoView();", button)
        button.click()
        wait()
        getProduct()
        driver.find_element_by_css_selector('div.close-modal-button').click()
        wait()

 # Modals with missing lines no longer crash the crawl: a modal that is too
 # short for the fixed fields is skipped, and a missing or incomplete
 # "Contact:" section simply contributes no (or fewer) detail fields.
 def getProduct():
    """Parse the open product modal and upsert the product into MongoDB.

    The modal text is split into lines. Lines 1-4, 7 and 8 hold the fixed
    fields (category, name, by, supplied_by, made_in, description). From the
    line "Contact:" up to, but excluding, the last line, keys and values
    alternate line by line; each key has its trailing character (the colon)
    removed.

    A modal with fewer than nine lines is reported and skipped. A trailing
    key without a value line is ignored.
    """
    #Grab all the text in the modal which are a line seperated list of keys and values
    info = driver.find_element_by_xpath("//div[@class='modal-content-container']").text.split("\n")

    # The fixed fields below index up to info[8].
    if len(info) < 9:
        print(f'Skipping product with incomplete details: {info}')
        return

    #Common information has keys and values on the same line
    product = {
            "category": info[1].strip(),
            "name": info[2].strip(),
            "by": info[3][4:].strip(),
            "supplied_by": info[4][14:].strip(),
            "made_in": info[7][8:].strip(),
            "description": info[8].strip(),
            }

    try:
        contact_index = info.index('Contact:')
    except ValueError:
        details = []
    else:
        #Splice for product details while removing the comment button's text
        details = info[contact_index:-1]

    #Product details have keys and values alternating lines
    for key_line, value_line in zip(details[::2], details[1::2]):
        product[key_line.strip()[:-1]] = value_line.strip()

    # Inserts the product if no identical document exists; an identical
    # document is left unchanged, so no separate find_one is needed.
    products.update_one(product, {"$set": product}, upsert=True)

 # An infinite loop that initializes the selenium driver, crawls, quits, then rests.
 while True:
    driver = webdriver.Firefox(options=opts)
    try:
        driver.implicitly_wait(random.randint(min_wait, max_wait))
        start()
        login()
        getProducts()
    finally:
        # quit() ends the whole WebDriver session, whereas close() only closes
        # the current window. Doing it in ``finally`` also cleans up when a
        # crawl step raises, so no driver is left behind between iterations.
        driver.quit()
    time.sleep(hit_speed)


--- a/python/scrapy-lua
+++ b/python/scrapy-lua
@@ -0,0 +1 @@
 Subproject commit 8bf9d46553f5064bd13d9fb6d85730056cf9f085