|
#!/usr/bin/python3
# A simple crawler for RRP Canada I made for a coding assessment
import os
import random
import time
from io import StringIO

import pymongo
from pymongo import MongoClient
from selenium import webdriver  # The selenium python3 module is required
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
-
# Scraping can become inefficient if frequent page hits cause the IP to be
# blocked. A good strategy is to mimic the browsing behaviour of a real user
# with random delays of 1-10 seconds between page actions, and a rest (hit
# speed) of a few hours between crawls. Rotation of IP, account, and user
# agent could also be added later to reduce suspicion.
-
# Seconds to rest between two complete crawls (three hours).
hit_speed = 60 * 60 * 3
# Upper and lower bounds, in seconds, for the random pause between page actions.
max_wait = 10
min_wait = 1
# Login credentials. The RRP_USERNAME / RRP_PASSWORD environment variables take
# precedence so the account can be rotated without editing this file; the
# literals remain only as a backward-compatible fallback.
login_username = os.environ.get("RRP_USERNAME", "etceterax+bot@protonmail.com")
login_password = os.environ.get("RRP_PASSWORD", "7}I8)dZ-GVv|]&jK]<]^$B,y$")
-
## Initialize the MongoDB client and a headless Firefox browser

client = MongoClient()
db = client.optima
products = db.products
opts = webdriver.FirefoxOptions()
# Pass the command-line flag directly: the `headless` property setter was
# deprecated in Selenium 4.8 and removed in later 4.x releases, whereas
# add_argument() is available on both old and new versions.
opts.add_argument("-headless")

# Disabled: unique compound index over the identifying product fields.
# products.create_index([('name', pymongo.ASCENDING), ('category',
# pymongo.ASCENDING), ('by', pymongo.ASCENDING), ('supplied_by',
# pymongo.ASCENDING)], unique=True)
-
def wait():
    """Sleep for a random whole number of seconds between min_wait and max_wait."""
    pause_seconds = random.randint(min_wait, max_wait)
    time.sleep(pause_seconds)
-
def start():
    """Open the RRP Canada login page and verify that it loaded.

    Raises:
        AssertionError: if the page title does not contain "Sign In".
    """
    driver.get("https://www.rrpcanada.org/#/login")
    wait()
    # Raised explicitly instead of via an `assert` statement so the check is
    # not stripped when Python runs with -O; the exception type is unchanged.
    title = driver.title
    if "Sign In" not in title:
        raise AssertionError(f"unexpected page title: {title!r}")
-
def login():
    """Type the configured credentials into the sign-in form and submit it.

    The form is submitted by sending the Return key to the password field,
    followed by a random pause.
    """
    # find_element(By.ID, ...) replaces find_element_by_id(), which was removed
    # in Selenium 4.3; the By-based form is also available on Selenium 3.
    email = driver.find_element(By.ID, "email")
    password = driver.find_element(By.ID, "password")
    email.send_keys(login_username)
    password.send_keys(login_password)
    password.send_keys(Keys.RETURN)
    wait()
-
def getProducts():
    """Navigate to the product listing and store every product shown there.

    Follows the "user-type" and "available-products-services" links, waits for
    the listing buttons to appear, then for each button: scrolls it into view,
    clicks it, hands the opened modal to getProduct(), and closes the modal.
    Random pauses are inserted between page actions.
    """
    button_selector = "div.table div.line-item button"

    wait()
    driver.find_element(By.CSS_SELECTOR, "a[href*='#/user-type']").click()
    wait()
    driver.find_element(
        By.CSS_SELECTOR, "a[href*='#/available-products-services']"
    ).click()
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, button_selector))
    )
    wait()
    total = len(driver.find_elements(By.CSS_SELECTOR, button_selector))
    print(f'{total} results')

    # During account rotation, this could start at an index not already stored
    for index in range(total):
        # Locate the buttons again on every pass and pick by position, so the
        # click never uses an element reference that went stale while the
        # previous modal was opened and closed.
        buttons = driver.find_elements(By.CSS_SELECTOR, button_selector)
        if index >= len(buttons):
            # The listing shrank since it was counted; nothing left to click.
            break
        button = buttons[index]
        # Scroll via JavaScript (used instead of ActionChains.move_to_element).
        driver.execute_script("arguments[0].scrollIntoView();", button)
        button.click()
        wait()
        getProduct()
        driver.find_element(By.CSS_SELECTOR, 'div.close-modal-button').click()
        wait()
-
def _parse_product_lines(info):
    """Build a product dict from the text lines of the product modal.

    Args:
        info: list of strings, one per line of the modal's text.

    Returns:
        A dict with the positional fields "category", "name", "by",
        "supplied_by", "made_in" and "description", plus one entry per
        key/value pair found from the 'Contact:' line onwards.

    Raises:
        IndexError: if ``info`` has fewer than nine lines. The common fields
            are read by fixed position, so an item with a missing description
            or location line is still not handled gracefully here.
    """
    # The detail section starts at the 'Contact:' line. If that line is absent
    # there are simply no details (previously this was a NameError).
    try:
        details_start = info.index('Contact:')
    except ValueError:
        details = []
    else:
        # Slice off the last line, which is the comment button's text.
        details = info[details_start:-1]

    # Common information has keys and values on the same line; the slices
    # drop the leading label text.
    product = {
        "category": info[1].strip(),
        "name": info[2].strip(),
        "by": info[3][4:].strip(),
        "supplied_by": info[4][14:].strip(),
        "made_in": info[7][8:].strip(),
        "description": info[8].strip(),
    }

    # Product details have keys and values on alternating lines. zip() pairs
    # them up and ignores a dangling key that has no value line, instead of
    # raising IndexError. The key's trailing character (the colon) is removed.
    for key_line, value_line in zip(details[::2], details[1::2]):
        product[key_line.strip()[:-1]] = value_line.strip()

    return product


def getProduct():
    """Read the currently open product modal and store it in MongoDB.

    Grabs all the text in the modal (a line-separated list of keys and
    values), parses it, and upserts the result into the products collection.
    """
    modal = driver.find_element(
        By.XPATH, "//div[@class='modal-content-container']"
    )
    product = _parse_product_lines(modal.text.split("\n"))

    # Inserts the product if no identical document exists. When one already
    # exists the $set writes the same values back, so no separate find_one()
    # lookup is needed beforehand.
    products.update_one(product, {"$set": product}, upsert=True)
-
# An infinite loop that initializes the selenium driver, crawls, quits, then rests.
while True:
    driver = webdriver.Firefox(options=opts)
    try:
        driver.implicitly_wait(random.randint(min_wait, max_wait))
        start()
        login()
        getProducts()
    finally:
        # quit() ends the whole WebDriver session, whereas close() only closes
        # the current window; since a new browser is started on every cycle,
        # the previous one must be fully shut down, including when a crawl
        # step raises.
        driver.quit()
    time.sleep(hit_speed)
-
|