#!/usr/bin/python3
"""A simple crawler for RRP Canada, written for a coding assessment.

The script logs in to https://www.rrpcanada.org with a headless Firefox
browser, opens every product listed on the "available products/services"
page, parses the text of each product modal and stores the result in the
``products`` collection of the local ``optima`` MongoDB database.  It then
quits the browser, rests for ``hit_speed`` seconds and starts over.

Scraping can become inefficient if frequent page hits cause the IP to be
blocked.  A good strategy is to mimic the browsing behaviour of a real user
with random delays between page actions of 1-10 seconds, and a rest (hit
speed) of a few hours.  Rotation of IP, account, and user agent could also be
added later to reduce suspicion.
"""
import os
import random
import time
from io import StringIO

import pymongo
from pymongo import MongoClient
from selenium import webdriver  # The selenium python3 module is required
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Seconds to rest between two full crawls (3 hours).
hit_speed = 60 * 60 * 3
# Bounds, in seconds, of the random pause inserted between page actions.
max_wait = 10
min_wait = 1

# NOTE(review): credentials should not live in source control.  They can now
# be supplied through the RRP_USERNAME / RRP_PASSWORD environment variables;
# the literals below are kept only as a backward-compatible fallback and
# should be rotated and removed.
login_username = os.environ.get("RRP_USERNAME", "etceterax+bot@protonmail.com")
login_password = os.environ.get("RRP_PASSWORD", "7}I8)dZ-GVv|]&jK]<]^$B,y$")

# Initialize the MongoDB client and the options for a headless firefox browser
client = MongoClient()
db = client.optima
products = db.products
opts = webdriver.FirefoxOptions()
# The ``headless`` attribute is deprecated in recent Selenium releases;
# passing the command line switch works with both old and new versions.
opts.add_argument("-headless")

# A unique index over the identity fields was considered but is not enabled:
# products.create_index([('name', pymongo.ASCENDING), ('category',
# pymongo.ASCENDING), ('by', pymongo.ASCENDING), ('supplied_by',
# pymongo.ASCENDING)], unique=True)

# Fields that together identify one product; used as the upsert filter.
_IDENTITY_FIELDS = ("name", "category", "by", "supplied_by")
# CSS selector of the per-product button that opens the detail modal.
_PRODUCT_BUTTON_SELECTOR = "div.table div.line-item button"
# The modal needs lines 0..8 for the fixed fields read by _parse_product.
_MIN_MODAL_LINES = 9

# The active selenium driver; (re)assigned by main() for every crawl and
# read as a global by the page helper functions below.
driver = None


def wait():
    """Sleep for a random whole number of seconds in [min_wait, max_wait]."""
    time.sleep(random.randint(min_wait, max_wait))


def start():
    """Open the login page and verify that it actually loaded.

    Raises:
        AssertionError: if the page title does not contain "Sign In".
    """
    driver.get("https://www.rrpcanada.org/#/login")
    wait()
    # Raised explicitly (rather than with ``assert``) so the check is not
    # stripped when Python runs with -O.
    if "Sign In" not in driver.title:
        raise AssertionError(
            f"unexpected page title on login page: {driver.title!r}"
        )


def login():
    """Fill in the login form with the configured credentials and submit."""
    email = driver.find_element(By.ID, "email")
    password = driver.find_element(By.ID, "password")
    email.send_keys(login_username)
    password.send_keys(login_password)
    password.send_keys(Keys.RETURN)
    wait()


def getProducts():
    """Navigate to the product list and store every product found there.

    Clicks through the user-type page to the available products/services
    page, then opens each product's modal in turn, hands it to
    getProduct() and closes the modal again.
    """
    wait()
    next_button = driver.find_element(By.CSS_SELECTOR, "a[href*='#/user-type']")
    next_button.click()
    wait()
    next_button = driver.find_element(
        By.CSS_SELECTOR, "a[href*='#/available-products-services']"
    )
    next_button.click()
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, _PRODUCT_BUTTON_SELECTOR)
        )
    )
    wait()
    buttons = driver.find_elements(By.CSS_SELECTOR, _PRODUCT_BUTTON_SELECTOR)
    print(f'{len(buttons)} results')
    # During account rotation, this could start with a value not already stored
    for index in range(len(buttons)):
        # Look the buttons up again on every pass: element handles captured
        # before a modal was opened and closed may have gone stale.
        buttons = driver.find_elements(By.CSS_SELECTOR, _PRODUCT_BUTTON_SELECTOR)
        if index >= len(buttons):
            break
        button = buttons[index]
        driver.execute_script("arguments[0].scrollIntoView();", button)
        button.click()
        wait()
        getProduct()
        driver.find_element(By.CSS_SELECTOR, 'div.close-modal-button').click()
        wait()


def _parse_product(info):
    """Turn the text lines of a product modal into a product document.

    Args:
        info: the modal text split into lines.

    Returns:
        A dict with the common fields plus one entry per detail key, or
        None when the modal has too few lines to hold the common fields.
    """
    if len(info) < _MIN_MODAL_LINES:
        return None

    # Common information has keys and values on the same line; the slices
    # drop the leading label (presumably "By: ", "Supplied by: ",
    # "Made in: " -- confirm against the live page).
    product = {
        "category": info[1].strip(),
        "name": info[2].strip(),
        "by": info[3][4:].strip(),
        "supplied_by": info[4][14:].strip(),
        "made_in": info[7][8:].strip(),
        "description": info[8].strip(),
    }

    # Product details start at the 'Contact:' line and alternate key line /
    # value line.  The last line is the comment button's text and is dropped.
    try:
        first_detail = info.index('Contact:')
    except ValueError:
        # No detail section at all: keep only the common fields.
        details = []
    else:
        details = info[first_detail:-1]

    # zip() silently ignores a trailing key that has no value line.
    for raw_key, raw_value in zip(details[::2], details[1::2]):
        key = raw_key.strip()[:-1]  # strip the trailing colon
        if key:
            product[key] = raw_value.strip()
    return product


def getProduct():
    """Parse the currently open product modal and upsert it into MongoDB.

    A modal that is missing the expected fields is reported and skipped
    instead of aborting the whole crawl.
    """
    # Grab all the text in the modal, which is a line separated list of keys
    # and values
    info = driver.find_element(
        By.XPATH, "//div[@class='modal-content-container']"
    ).text.split("\n")
    product = _parse_product(info)
    if product is None:
        print(f'skipping product with incomplete details ({len(info)} lines)')
        return
    # Match on the identity fields only, so a product whose details changed
    # is updated in place instead of being inserted a second time.
    identity = {field: product[field] for field in _IDENTITY_FIELDS}
    products.update_one(identity, {"$set": product}, upsert=True)


def main():
    """Crawl forever: start a browser, crawl, quit the browser, then rest."""
    global driver
    while True:
        driver = webdriver.Firefox(options=opts)
        try:
            driver.implicitly_wait(random.randint(min_wait, max_wait))
            start()
            login()
            getProducts()
        finally:
            # quit() ends the whole browser session (close() only closes the
            # window), and the finally makes sure it runs on errors too.
            driver.quit()
        time.sleep(hit_speed)


if __name__ == "__main__":
    main()