Examples of code I've written in PHP, JavaScript, SCSS, etc.
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 
 
 
 

332 lignes
13 KiB

  1. #!/usr/bin/python3
  2. # My first attempt at building a Selenium crawler. It works, but many parts of the code are unnecessary or
  3. # could have been written better.
  4. from selenium import webdriver
  5. from selenium.webdriver.support.ui import WebDriverWait
  6. from selenium.webdriver.common.action_chains import ActionChains
  7. from selenium.webdriver.common.keys import Keys
  8. from selenium.webdriver.support import expected_conditions as EC
  9. from selenium.webdriver.common.by import By
  10. from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException
  11. from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
  12. from PIL import Image
  13. import base64
  14. import selenium
  15. import os, traceback, sys
  16. import random
  17. import json
  18. import time
  19. import requests
  20. class Spider():
  21. def __init__(self):
  22. self.keys = webdriver.common.keys
  23. self.settings = self.getConfig('login.json')
  24. fp = self.getDriverPrefs()
  25. self.driver = webdriver.Firefox(firefox_profile=fp)
  26. self.driver.implicitly_wait(10)
  27. self.waitTime = random.randint(4, 10)
  28. self.start()
  29. return
  30. def getDriverPrefs(self):
  31. fp = webdriver.FirefoxProfile()
  32. fp.set_preference("browser.download.folderList", 2)
  33. fp.set_preference("browser.download.manager.showWhenStarting", False)
  34. fp.set_preference("browser.download.dir", os.getcwd()+'/Results/')
  35. fp.set_preference("browser.helperApps.neverAsk.saveToDisk", 'text/csv')
  36. #fp.set_preference("browser.download.manager.focusWhenStarting", False)
  37. #fp.set_preference("browser.download.useDownloadDir", True)
  38. #fp.set_preference("browser.helperApps.alwaysAsk.force", False)
  39. #fp.set_preference("browser.download.manager.alertOnEXEOpen", False)
  40. #fp.set_preference("browser.download.manager.closeWhenDone", True)
  41. #fp.set_preference("browser.download.manager.showAlertOnComplete", False)
  42. #fp.set_preference("browser.download.manager.useWindow", False)
  43. #fp.set_preference("services.sync.prefs.sync.browser.download.manager.showWhenStarting", False)
  44. #fp.set_preference("pdfjs.disabled", True)
  45. #fp.set_preference('profile.default_content_setting_values.automatic_downloads', 1)
  46. return fp
  47. def getConfig(self, fpath):
  48. with open(fpath, 'r', encoding='utf-8') as login_file:
  49. return json.loads(login_file.read())
  50. def start(self):
  51. self.driver.get(self.settings["entry"][0])
  52. assert "Authorization" in self.driver.title
  53. self.loginClicks() #Logs in, navigates through prompts and terms pages
  54. #Put exception handler here for updating json file. Should work for all exceptions including C-c
  55. try:
  56. self.cleanSelects()
  57. self.getData()
  58. self.gracefulQuit()
  59. except KeyboardInterrupt:
  60. print ("Shutdown requested...exiting")
  61. self.gracefulQuit()
  62. except TimeoutException:
  63. self.__init__()
  64. except Exception:
  65. traceback.print_exc(file=sys.stdout)
  66. return
  67. def loginClicks(self):
  68. e = self.driver.find_element_by_id("userid")
  69. e.send_keys(self.settings["username"])
  70. e = self.driver.find_element_by_id("pin")
  71. e.send_keys(self.settings["password"])
  72. self.handleCaptcha(self.driver.find_element_by_css_selector, 'input.btn-primary').click()
  73. self.driver.find_element_by_id("chkAgree").click()
  74. time.sleep(self.settings['waitTime'])
  75. self.driver.find_element_by_class_name("action-agree").click()
  76. req = self.driver.find_element_by_css_selector('.CaBusiness')
  77. self.realClick(self.driver.find_element_by_css_selector('.CaBusiness a'), req)
  78. return
  79. def handleCaptcha(self, x, *args):
  80. try:
  81. x = x(*args)
  82. return x
  83. except (selenium.common.exceptions.TimeoutException, selenium.common.exceptions.NoSuchElementException):
  84. self.solveCaptcha()
  85. return x(*args)
  86. def solveCaptcha(self):
  87. #pulls image
  88. print('handling captcha')
  89. time.sleep(self.settings['waitTime'])
  90. body = self.driver.find_element_by_css_selector('body')
  91. self.driver.switch_to.default_content()
  92. body.click()
  93. self.wait()
  94. e = self.driver.find_element_by_css_selector('#captcha-image')
  95. imgName = "captcha.png"
  96. image = self.pullCaptcha(e, imgName)
  97. key = self.postCaptcha(image)
  98. time.sleep(self.settings['waitTime'])
  99. answer = self.getCaptcha(key)
  100. if answer == None:
  101. return
  102. self.captchaClicks(answer)
  103. if len(self.driver.find_elements_by_css_selector('#captcha-image')) != 0:
  104. print ('captcha was wrong')
  105. self.solveCaptcha()
  106. return
  107. def pullCaptcha(self, element, path):
  108. # Gets captcha image
  109. location = element.location
  110. size = element.size
  111. # saves screenshot of entire page
  112. self.driver.save_screenshot(path)
  113. left = location['x']
  114. top = location['y'] + 140
  115. right = location['x'] + size['width']
  116. bottom = location['y'] + size['height'] + 140
  117. image = Image.open(path)
  118. image = image.crop((530, 405, 635, 440)) # defines crop points
  119. image.save(path, format='png') # saves new cropped image
  120. with open(path, "rb") as image_file:
  121. im = image_file.read()
  122. return im
  123. def postCaptcha(self, image):
  124. params = {'json': 1, 'key': self.settings["captchaKey"],
  125. 'method': 'post', 'calc': 1}
  126. files = {'file': ('file.png', image, 'image/png', {'Expires': '0'})}
  127. response = requests.post(self.settings["captchaPOST"], files=files,
  128. timeout=10, params=params)
  129. print(response.text)
  130. if 'ERROR' in response.text:
  131. raise Exception(f'Captcha Error: {response.text}')
  132. return response.json()['request']
  133. def getCaptcha(self, i):
  134. print(i)
  135. params = {'json': 1, 'key': self.settings["captchaKey"], 'action':'get', 'id':i}
  136. while True:
  137. response = requests.get(self.settings["captchaGET"], timeout=10, params=params)
  138. print(response.text)
  139. if 'UNSOLVABLE' in response.text or '+' in response.text or '-' in response.text:
  140. self.selectAndClick('#captcha-refresh')
  141. self.solveCaptcha()
  142. return
  143. if 'ERROR' in response.text:
  144. raise Exception(f'Captcha Error: {response.text}')
  145. if '=' in response.text:
  146. self.selectAndClick('#captcha-refresh')
  147. self.solveCaptcha()
  148. return
  149. if 'NOT_READY' in response.text:
  150. time.sleep(10)
  151. continue
  152. else:
  153. break
  154. a = response.json()['request']
  155. print(a)
  156. return a
  157. def selectAndClick(self, s):
  158. e = self.handleCaptcha(self.driver.find_element_by_css_selector, s)
  159. self.handleCaptcha(e.click)
  160. self.wait()
  161. return
  162. def captchaClicks(self, answer):
  163. #Last wrappers is needed in case the solution is wrong, other wrappers are extra
  164. inputBox = self.handleCaptcha(self.driver.find_element_by_css_selector, '#Attempt')
  165. self.handleCaptcha(inputBox.clear)
  166. self.handleCaptcha(inputBox.send_keys, answer)
  167. time.sleep(self.settings['waitTime'])
  168. btn = self.handleCaptcha(self.driver.find_element_by_css_selector, '.action-validate-captcha.originButtonCompact.ui-priority-primary')
  169. self.handleCaptcha(btn.click)
  170. return
  171. def realClick(self, target, req):
  172. #Used for when site needs special click event combinations
  173. a = webdriver.common.action_chains.ActionChains(self.driver)
  174. a.move_to_element(req).perform()
  175. chain = webdriver.common.action_chains.ActionChains(self.driver)
  176. chain.move_to_element(target).click(target).perform()
  177. return
  178. def getData(self):
  179. #Used for organizing sequence of bot commands according to configs
  180. if self.settings["completed"] >= self.settings["endPosition"]:
  181. print("Already at end position")
  182. self.gracefulQuit()
  183. return
  184. #Clear selects that may have been left by the last instance
  185. self.driver.get(self.settings['entry'][1])
  186. self.insertPosition(self.settings['completed'])
  187. self.selectRecords(False)
  188. p = 0
  189. while (self.settings["completed"] < self.settings["endPosition"]
  190. and p < self.settings["interval"]):
  191. self.clickSequence()
  192. self.saveCompleted()
  193. time.sleep(self.waitTime)
  194. p = p + 1
  195. return
  196. def insertPosition(self, p):
  197. try:
  198. inputBox = self.handleCaptcha(self.driver.find_element_by_css_selector, 'div.page.click-enterkey:not(.disabled)')
  199. self.wait()
  200. inputBox.click()
  201. self.wait()
  202. inputBox = self.handleCaptcha(self.driver.find_element_by_css_selector, 'input:not(#Attempt)')
  203. self.wait()
  204. #ActionChains(self.driver).move_to_element(inputBox).click().send_keys(startPosition).send_keys(Keys.RETURN).perform()
  205. self.handleCaptcha(inputBox.send_keys, p)
  206. self.handleCaptcha(inputBox.send_keys, Keys.RETURN)
  207. return
  208. except (StaleElementReferenceException, ElementNotInteractableException):
  209. self.wait()
  210. self.wait()
  211. self.insertPosition(p)
  212. return
  213. return
  214. def wait(self):
  215. time.sleep(self.settings['waitTime'])
  216. return
  217. def selectRecords(self, ifSelect):
  218. #This may create a bug where some records are missed because of captcha prompt
  219. for x in range(0,10):
  220. self.wait()
  221. #b = self.handleCaptcha(EC.visibility_of_element_located, (By.CSS_SELECTOR, '#checkboxCol.action-tag-all'))
  222. #w = self.handleCaptcha(WebDriverWait, self.driver, 10)
  223. #u = self.handleCaptcha(w.until, b)
  224. u = self.handleCaptcha(self.driver.find_element_by_css_selector, '#checkboxCol.action-tag-all')
  225. #allButton = self.handleCaptcha(WebDriverWait(self.driver, 10).until(
  226. #)
  227. allButton = u
  228. checked = allButton.get_attribute('checked')
  229. if (checked == None) == ifSelect:
  230. self.handleCaptcha(allButton.click)
  231. time.sleep(self.settings['waitTime'])
  232. nextButton = self.handleCaptcha(self.driver.find_element_by_css_selector, ".next.button")
  233. self.handleCaptcha(nextButton.click)
  234. return
  235. def clickSequence(self):
  236. #Decides which clicks should be done next to collect records
  237. url = self.settings["entry"][1]
  238. self.driver.get(url)
  239. startPosition = str(self.settings["completed"] + 1)
  240. self.handleCaptcha(self.insertPosition, startPosition)
  241. time.sleep(3)
  242. self.selectRecords(True)
  243. e = self.handleCaptcha(self.driver.find_element_by_css_selector, "a.download.action-download")
  244. e.click()
  245. self.wait()
  246. e = self.handleCaptcha(self.driver.find_element_by_css_selector, "#detailDetail")
  247. e.click()
  248. e = self.handleCaptcha(self.driver.find_element_by_css_selector,
  249. '.originButton.ui-priority-primary.view-on-exportlimit.view-on-randomsample.action-download')
  250. e.click()
  251. time.sleep(self.settings['waitTime'])
  252. #Records need to be deselected to select new ones
  253. self.driver.get(url)
  254. self.handleCaptcha(self.insertPosition, startPosition)
  255. self.selectRecords(False)
  256. self.settings["completed"] = self.settings["completed"] + 10
  257. print(f'completed: {self.settings["completed"]}')
  258. return
  259. def cleanSelects(self):
  260. if self.settings["cleanMode"][0]:
  261. url = self.settings["entry"][1]
  262. cleanMode = self.settings["cleanMode"]
  263. start = cleanMode[1]
  264. stop = cleanMode[2]
  265. else:
  266. return
  267. self.driver.get(url)
  268. self.handleCaptcha(self.insertPosition, str(start))
  269. #This will have side effects
  270. dif = int((start - stop)/10)
  271. for a in range(0, dif):
  272. self.selectRecords(False)
  273. return
  274. def saveCompleted(self):
  275. with open('login.json', 'w', encoding='utf-8') as login_file:
  276. login_file.write(json.dumps(self.settings, indent=4, sort_keys=True))
  277. return
  278. return
  279. def gracefulQuit(self):
  280. self.saveCompleted()
  281. return
  282. if __name__ == "__main__":
  283. spider = Spider()