Examples of code I've written in PHP, Javascript, SCSS, etc.
Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

scrape.py 4.2 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
#!/usr/bin/python3
# A simple crawler for RRP Canada I made for a coding assessment
  3. from selenium import webdriver #The selenium python3 module is required
  4. from selenium.webdriver.common.keys import Keys
  5. from selenium.webdriver.common.by import By
  6. from selenium.webdriver.support import expected_conditions as EC
  7. from selenium.webdriver.support.ui import WebDriverWait
  8. from selenium.webdriver.common.action_chains import ActionChains
  9. import random
  10. import time
  11. import pymongo
  12. from pymongo import MongoClient
  13. from io import StringIO
  14. # Scraping can become inefficient if frequent page hits cause the IP to be
  15. # blocked. A good strategy is to mimick the browsing behaviour of a real user
  16. # with random delays between page actions of 1-10 seconds, and a rest (hit
  17. # speed) of a few hours. Rotation of IP, account, and user agent could also be
  18. # added later to reduce suspicion.
  19. hit_speed = 60*60*3
  20. max_wait = 10
  21. min_wait = 1
  22. login_username = "etceterax+bot@protonmail.com"
  23. login_password = "7}I8)dZ-GVv|]&jK]<]^$B,y$"
  24. ## Initialize the MongoDB client and a headless firefox browser
  25. client = MongoClient()
  26. db = client.optima
  27. products = db.products
  28. opts = webdriver.FirefoxOptions()
  29. opts.headless = True
  30. # products.create_index([('name', pymongo.ASCENDING), ('category',
  31. # pymongo.ASCENDING), ('by', pymongo.ASCENDING), ('supplied_by',
  32. # pymongo.ASCENDING)], unique=True)
  33. def wait():
  34. time.sleep(random.randint(min_wait, max_wait))
  35. def start():
  36. driver.get("https://www.rrpcanada.org/#/login")
  37. wait()
  38. assert "Sign In" in driver.title
  39. def login():
  40. email = driver.find_element_by_id("email")
  41. password = driver.find_element_by_id("password")
  42. email.send_keys(login_username)
  43. password.send_keys(login_password)
  44. password.send_keys(Keys.RETURN)
  45. wait()
  46. def getProducts():
  47. wait()
  48. next_button = driver.find_element_by_css_selector("a[href*='#/user-type']")
  49. next_button.click();
  50. wait()
  51. next_button = driver.find_element_by_css_selector("a[href*='#/available-products-services']")
  52. next_button.click();
  53. WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.table div.line-item button")))
  54. wait()
  55. buttons = driver.find_elements_by_css_selector("div.table div.line-item button")
  56. print(f'{len(buttons)} results')
  57. #During account rotation, this could start with value of b not already stored
  58. for b in buttons:
  59. # ActionChains(driver).move_to_element(b).perform()
  60. driver.execute_script("arguments[0].scrollIntoView();", b)
  61. buttons = driver.find_elements_by_css_selector("div.table div.line-item button")
  62. b.click()
  63. wait()
  64. getProduct()
  65. driver.find_element_by_css_selector('div.close-modal-button').click()
  66. wait()
  67. # There should be some error handling here in case an item has a missing
  68. # description or location field
  69. def getProduct():
  70. #Grab all the text in the modal which are a line seperated list of keys and values
  71. info = driver.find_element_by_xpath("//div[@class='modal-content-container']").text.split("\n")
  72. for i in range(0, len(info)):
  73. if info[i] == 'Contact:':
  74. x = i
  75. break
  76. details = info[x:-1] #Splice for product details while removing the comment button's text
  77. #Common information has keys and values on the same line
  78. product = {
  79. "category": info[1].strip(),
  80. "name": info[2].strip(),
  81. "by": info[3][4:].strip(),
  82. "supplied_by": info[4][14:].strip(),
  83. "made_in": info[7][8:].strip(),
  84. "description": info[8].strip(),
  85. }
  86. #Product details have keys and values alternating lines
  87. for i in range(0, len(details), 2):
  88. product[details[i].strip()[:-1]] = details[i+1].strip()
  89. if products.find_one(product):
  90. return
  91. #Updates product information and inserts if not already set
  92. products.update_one(product, {"$set": product}, upsert=True)
  93. # An infinite loop that initializes the selenium driver, crawls, quits, then rests.
  94. while True:
  95. driver = webdriver.Firefox(options=opts)
  96. driver.implicitly_wait(random.randint(min_wait, max_wait))
  97. start()
  98. login()
  99. getProducts()
  100. driver.close()
  101. time.sleep(hit_speed)