# scrapepages.py
# pip install -r /path/to/requirements.txt
import mechanize
from bs4 import BeautifulSoup
# from tkinter import *
# master = Tk()
# Label(master, text="First Name").grid(row=0)
# Label(master, text="Last Name").grid(row=1)
# e1 = Entry(master)
# e2 = Entry(master)
# e1.grid(row=0, column=1)
# e2.grid(row=1, column=1)
print('made it here')
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [('user-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3'),
                 ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]
# setting debug to True will print the HTTP headers
br.set_debug_http(False)
# Begin compiling a list of all available links to do searching from.
# A hard-coded page count (15) is not correct :( We need a better way to do
# this. The CORRECT way is to inspect the original number of results shown:
# the moment that number is no longer the same, we have come across the last
# page of results and can flag the program to stop and execute the next line
# of code (roughly: while the number of results stays the same, keep going).
# Or we just throw an exception and the rest of the program continues! YAY.
# We'll go that route for now. The while loop below begins the Google search
# and populates the source_pages list with sites we need to visit for
# contact info.
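# A minimal sketch of the stop condition described above, assuming Google
# still renders an "About N results" element with id="resultStats"; both
# helpers are hypothetical and not yet wired into the loop below.
def result_count(soup):
    # read the reported result count off the parsed page, if present
    stats = soup.find(id="resultStats")
    return stats.get_text() if stats else None

def reached_last_page(previous_count, soup):
    # stop the moment the reported result count changes or disappears
    current = result_count(soup)
    return current is None or current != previous_count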
def startTheShow():
    # sources of potential contact information (currently unused here;
    # inspect_pages carries its own inline list)
    hard_search_terms = ["facebook", "twitter", "email", "e-mail", "linkedin", "linked-in", "telephone", "contact", "Contact", "CONTACT", "Telephone"]
    # If we find another link to search, we change the url to the base url plus
    # the new search-query link, then execute the function again and push to
    # the same array. Afterwards we verify that the array contains all of the
    # found results, not just the ones that were pushed last.
    source_pages = []
    # TODO: separate this into two parts and URL-encode the search query (a
    # sketch follows the url-building block below); for now it stays like this
    # for ease of testing.
    baseurl = 'https://www.google.com'
    print("@@@@@Use quotes to do an exact query search.")
    query_string = input("What query string would you like to use? ").split(" ")
    exclude_terms = input("Would you like to exclude any terms from your search? (Comma separated) (none to skip) ").split(",")
    # build a search query string
    if exclude_terms[0] == 'none':
        print('using if')
        url = baseurl + "/search?q=" + "+".join(query_string)
    else:
        print('using else')
        # one url, with every excluded term appended as +-term
        url = baseurl + "/search?q=" + "+".join(query_string)
        for term in exclude_terms:
            url += "+-" + term.strip()
    print("What is url? ", url)
    tryNextPage = True
    # build a list of urls so we can visit and scrape those pages for contact info.
    while tryNextPage:
        text = br.open(url).read()
        soup = BeautifulSoup(text, 'html.parser')
        test = soup.find_all(attrs={'class': 'g'})
        totalResultsFound = soup.find_all(id="resultStats")
        # TODO: Google uses infinite scroll now, so resultStats may no longer
        # be present on the page.
        print(totalResultsFound)
        # In JS it looks like this (kinda):
        """
        combine = link_headers.reduce( (acc, val, idx) => {
            acc.push(
                {
                    val: val.innerText.split('\n')[0],
                    url: val.querySelector("cite")
                }
            )
            return acc;
        }, [] )
        """
        for i in range(len(test)):
            # get the url from the list of links we are presented on a page.
            findSearchableLinks = test[i].find_all('a')[0].get('href').split('&sa')[0].split('q=')[1]
            print(findSearchableLinks)
            # line.decode('utf-8').strip()
            source_pages += [str(findSearchableLinks)]
        # return our results so they may be passed to our inspect_pages function
        # in main (single pass for now; see the pagination notes above)
        print('found search pages urls ', source_pages)
        return source_pages
def eval_page():
    print('some awesome function to pull some data from a webpage')
    # (one possible direction is sketched just below)
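# A sketch of what eval_page might grow into: pulling email addresses out of a
# page with a simple regex. The pattern, parameter, and behavior are
# assumptions, not part of the original script, so it stays commented out.
# import re
# EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
# def eval_page(html_text):
#     return EMAIL_RE.findall(html_text)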
def inspect_pages(linksToSearch):
    print('...execute inspect_pages')
    links_found = []
    # loop through the list of links we found via our search
    for i in range(len(linksToSearch)):
        text = br.open(linksToSearch[i]).read()
        soup = BeautifulSoup(text, 'html.parser')
        # (a regex-based alternative to this literal list is sketched after
        # this function)
        foundText = soup.find_all(string=["Contact", "CONTACT US", "contact us", "Contact Us", "Telephone", "Email", "email", 'telephone', 'e-mail', 'contact', 'CONTACT'])
        links_found.append(foundText)
    if len(links_found) > 0:
        print(len(links_found), "links found")
        return links_found
    else:
        print("No links found")
        return []
def main():
    usefulLinks = startTheShow()
    listOfUsefulPages = inspect_pages(usefulLinks)
    print(listOfUsefulPages)

if __name__ == '__main__':
    main()