Examples of code I've written in PHP, Javascript, SCSS, etc.
Você não pode selecionar mais de 25 tópicos. Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 
 
 
 

122 linhas
4.2 KiB

  1. #!/usr/bin/python3
  2. #A simple crawler for RRP Canada I made for a coding assessment
  3. from selenium import webdriver #The selenium python3 module is required
  4. from selenium.webdriver.common.keys import Keys
  5. from selenium.webdriver.common.by import By
  6. from selenium.webdriver.support import expected_conditions as EC
  7. from selenium.webdriver.support.ui import WebDriverWait
  8. from selenium.webdriver.common.action_chains import ActionChains
  9. import random
  10. import time
  11. import pymongo
  12. from pymongo import MongoClient
  13. from io import StringIO
  # Scraping can become inefficient if frequent page hits cause the IP to be
  # blocked. A good strategy is to mimic the browsing behaviour of a real user
  # with random delays between page actions of 1-10 seconds, and a rest (hit
  # speed) of a few hours. Rotation of IP, account, and user agent could also be
  # added later to reduce suspicion.
  hit_speed = 60 * 60 * 3  # seconds to rest between two crawls (3 hours)
  max_wait = 10  # upper bound, in seconds, of the random delay between actions
  min_wait = 1  # lower bound, in seconds, of the random delay between actions

  # NOTE(review): these credentials are hard-coded in the source file. Consider
  # loading them from the environment or a secrets store instead.
  login_username = "etceterax+bot@protonmail.com"
  login_password = "7}I8)dZ-GVv|]&jK]<]^$B,y$"

  # Initialize the MongoDB client (default connection arguments) and the
  # options for a headless Firefox browser.
  client = MongoClient()
  db = client.optima
  products = db.products
  opts = webdriver.FirefoxOptions()
  # Pass the flag explicitly: the `headless` property setter was deprecated and
  # later removed in Selenium 4, while the command-line argument keeps working.
  opts.add_argument("-headless")

  # Optional unique index over the identifying product fields (left disabled):
  # products.create_index([('name', pymongo.ASCENDING), ('category',
  # pymongo.ASCENDING), ('by', pymongo.ASCENDING), ('supplied_by',
  # pymongo.ASCENDING)], unique=True)
  def wait():
      """Pause for a random whole number of seconds in [min_wait, max_wait].

      Used between page actions so the crawl does not hit the site at a fixed,
      machine-like rhythm.
      """
      delay_seconds = random.randint(min_wait, max_wait)
      time.sleep(delay_seconds)
  def start():
      """Open the RRP Canada login page and verify that it loaded.

      Uses the module-level ``driver`` and pauses with ``wait()`` after the
      page request.

      Raises:
          RuntimeError: if the page title does not contain "Sign In".
      """
      driver.get("https://www.rrpcanada.org/#/login")
      wait()
      # Explicit check instead of `assert`: assertions are stripped when Python
      # runs with -O, which would silently skip this validation.
      page_title = driver.title
      if "Sign In" not in page_title:
          raise RuntimeError(
              f"Login page did not load as expected (title: {page_title!r})"
          )
  def login():
      """Fill in the login form with the configured credentials and submit it.

      Types ``login_username`` and ``login_password`` into the elements with
      ids "email" and "password", presses RETURN in the password field, then
      pauses with ``wait()``.
      """
      # find_element(By.ID, ...) replaces find_element_by_id, which was removed
      # in Selenium 4.3; this form works on both old and new releases.
      email = driver.find_element(By.ID, "email")
      password = driver.find_element(By.ID, "password")
      email.send_keys(login_username)
      password.send_keys(login_password)
      password.send_keys(Keys.RETURN)
      wait()
  def getProducts():
      """Navigate to the product listing and store every product shown there.

      Clicks through the "user-type" and "available-products-services" links,
      waits for the line-item buttons to appear, then opens each item's modal
      in turn, hands it to ``getProduct()`` and closes the modal again.
      """
      line_item_buttons = "div.table div.line-item button"

      wait()
      driver.find_element(By.CSS_SELECTOR, "a[href*='#/user-type']").click()
      wait()
      driver.find_element(
          By.CSS_SELECTOR, "a[href*='#/available-products-services']"
      ).click()
      WebDriverWait(driver, 10).until(
          EC.presence_of_element_located((By.CSS_SELECTOR, line_item_buttons))
      )
      wait()

      total = len(driver.find_elements(By.CSS_SELECTOR, line_item_buttons))
      print(f'{total} results')

      # During account rotation, this could start at the first index that is
      # not already stored.
      for index in range(total):
          # Re-locate the buttons on every pass and pick by position, so a
          # reference obtained before the previous modal was opened and closed
          # is never reused (the original re-queried but kept iterating the
          # first list, so the fresh lookup had no effect).
          buttons = driver.find_elements(By.CSS_SELECTOR, line_item_buttons)
          if index >= len(buttons):
              # The list shrank since it was counted; nothing left to open.
              break
          button = buttons[index]
          driver.execute_script("arguments[0].scrollIntoView();", button)
          button.click()
          wait()
          getProduct()
          driver.find_element(By.CSS_SELECTOR, 'div.close-modal-button').click()
          wait()
  def _parse_product(info):
      """Turn the modal's text lines into a product dict, or None if too short.

      ``info`` is the modal text split on newlines. Common fields sit at fixed
      line positions with the key and value on the same line; everything from
      the 'Contact:' line up to (but excluding) the last line is read as
      alternating "Key:" / value lines.
      """
      # The fixed positions below reach up to index 8.
      if len(info) < 9:
          return None

      product = {
          "category": info[1].strip(),
          "name": info[2].strip(),
          "by": info[3][4:].strip(),
          "supplied_by": info[4][14:].strip(),
          "made_in": info[7][8:].strip(),
          "description": info[8].strip(),
      }

      try:
          details_start = info.index('Contact:')
      except ValueError:
          # No contact section: keep only the common fields instead of failing
          # on an unbound index.
          details = []
      else:
          # Drop the final line, which is the comment button's text.
          details = info[details_start:-1]

      # Pair each key line with the value line after it; an unpaired trailing
      # key is ignored rather than raising IndexError.
      for key_line, value_line in zip(details[::2], details[1::2]):
          product[key_line.strip()[:-1]] = value_line.strip()  # [:-1] drops ':'
      return product


  def getProduct():
      """Read the currently open product modal and store it in MongoDB.

      Grabs all text inside the modal content container, parses it into a
      product document and upserts it into the ``products`` collection. A modal
      with too few lines to hold the common fields is skipped with a message.
      """
      # Grab all the text in the modal, which is a line-separated list of keys
      # and values.
      modal = driver.find_element(
          By.XPATH, "//div[@class='modal-content-container']"
      )
      product = _parse_product(modal.text.split("\n"))
      if product is None:
          print('Skipping product: unexpected modal layout')
          return

      if products.find_one(product):
          return
      # Inserts the product if no identical document is stored yet.
      # NOTE(review): the filter is the whole document, so a product whose
      # details changed is stored as a new document rather than updated;
      # confirm whether matching on identifying fields only was intended.
      products.update_one(product, {"$set": product}, upsert=True)
  # An infinite loop that initializes the selenium driver, crawls, quits, then
  # rests for `hit_speed` seconds.
  while True:
      driver = webdriver.Firefox(options=opts)
      try:
          driver.implicitly_wait(random.randint(min_wait, max_wait))
          start()
          login()
          getProducts()
      finally:
          # quit() ends the whole browser session, whereas close() only closes
          # the current window and would leave a browser/driver process behind
          # on every cycle. Running it in `finally` also cleans up when a crawl
          # step raises.
          driver.quit()
      time.sleep(hit_speed)