-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathscraper.py
79 lines (67 loc) · 3.25 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import time,utils,string
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from utils import config
def scrape(post_url):
bot = utils.create_bot(headless=False)
data = {}
try:
# Load cookies to prevent cookie overlay & other issues
bot.get('https://www.reddit.com')
for cookie in config['reddit_cookies'].split('; '):
cookie_data = cookie.split('=')
bot.add_cookie({'name':cookie_data[0],'value':cookie_data[1],'domain':'reddit.com'})
bot.get(post_url)
# Fetching the post itself, text & screenshot
post = WebDriverWait(bot, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.Post')))
post_text = post.find_element(By.CSS_SELECTOR, 'h1').text
data['post'] = post_text
post.screenshot('output/post.png')
# Let comments load
bot.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3)
# Fetching comments & top level comment determinator
comments = bot.find_elements(By.CSS_SELECTOR, 'div[id^=t1_][tabindex]')
allowed_style = comments[0].get_attribute("style")
# Filter for top only comments
NUMBER_OF_COMMENTS = 5
comments = [comment for comment in comments if comment.get_attribute("style") == allowed_style][:NUMBER_OF_COMMENTS]
print('💬 Scraping comments...',end="",flush=True)
# Save time & resources by only fetching X content
for i in range(len(comments)):
try:
print('.',end="",flush=True)
# Filter out locked comments (AutoMod)
try:
comments[i].find_element(By.CSS_SELECTOR, 'icon.icon-lock_fill')
continue
except:
pass
# Scrolling to the comment ensures that the profile picture loads
# Credits: https://stackoverflow.com/a/57630350
desired_y = (comments[i].size['height'] / 2) + comments[i].location['y']
window_h = bot.execute_script('return window.innerHeight')
window_y = bot.execute_script('return window.pageYOffset')
current_y = (window_h / 2) + window_y
scroll_y_by = desired_y - current_y
bot.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by)
time.sleep(0.2)
# Getting comment into string
text = "\n".join([element.text for element in comments[i].find_elements(By.CSS_SELECTOR, '.RichTextJSON-root')])
# Screenshot & save text
comments[i].screenshot(f'output/{i}.png')
data[str(i)] = ''.join(filter(lambda c: c in string.printable, text))
except Exception as e:
if config['debug']:
raise e
pass
if bot.session_id:
bot.quit()
return data
except Exception as e:
if bot.session_id:
bot.quit()
if config['debug']:
raise e
return False