# Source: PythonWork/scraping_linkedin.py at master · hesithaw/PythonWork · GitHub

"""-----------------------------------------------------------
    Python Selenium Script for LinkedIn Scrape
   ----------------------------------------------------------"""

#FILE IMPORTS
from   selenium import webdriver
from   os       import remove
from selenium.webdriver.common.keys import Keys
import utils
import time
import sys
import argparse
from bs4 import BeautifulSoup, SoupStrainer
import requests

# SCRAPER GLOBALS
# Base site URL: opened for login and prefixed to every scraped profile href.
_url       = 'https://www.linkedin.com'
# Temporary snapshot of the rendered results page (written by
# save_current_html, removed by teardown).
_file_path = 'temp.html'
# Output file that scraped links are appended to.
_out_file_path = 'out.txt'

# INIT THE DRIVER
# NOTE: this launches a Firefox session as soon as the module is loaded,
# i.e. also on a plain import, not only when main() runs.
_driver = webdriver.Firefox()

#LOGIN AND ADVANCED SEARCH FOR CL ARGS
def login_and_search(args):
    """Log in to the site, run a keyword search and snapshot the results page.

    Steps, in order: open the base URL, fill in and submit the login form,
    open the search box, submit ``args.keyword``, scroll down the page and
    save the rendered HTML via ``save_current_html()``.

    Args:
        args: parsed command-line namespace providing ``user``, ``password``
            and ``keyword`` attributes.

    Elements are located with ``find_element(By.XPATH, ...)`` rather than
    ``find_element_by_xpath``: the latter was removed from Selenium 4.3+,
    while the former is available in both old and new releases.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this file.
    from selenium.webdriver.common.by import By

    def find(xpath):
        # Single place that knows how elements are looked up.
        return _driver.find_element(By.XPATH, xpath)

    _driver.get(_url)
    email_box = find('//*[@id="login-email"]')
    password_box = find('//*[@id="login-password"]')

    email_box.send_keys(args.user)
    password_box.send_keys(args.password)
    find('//*[@id="login-submit"]').click()

    # Fixed pause after submitting the login form before touching the page.
    time.sleep(20)

    find('//*[@id="nav-search-controls-wormhole"]/button').click()
    time.sleep(5)
    search_box = find('//*[@class="ember-text-field ember-view"]')
    search_box.send_keys(args.keyword + Keys.RETURN)

    time.sleep(5)
    # Scroll far down the page (presumably so lazily rendered results are
    # present in the DOM before the snapshot is taken).
    _driver.execute_script("window.scrollTo(0, 10000);")
    time.sleep(5)
    save_current_html()

def save_current_html():
    """Write the driver's current page source to ``_file_path`` as UTF-8.

    The page source is encoded to bytes, so the file is opened in binary
    mode: writing bytes to a text-mode handle raises ``TypeError`` on
    Python 3. The ``with`` block closes the handle even if the write fails.
    """
    html_bytes = _driver.page_source.encode('utf-8')
    with open(_file_path, 'wb') as html_file:
        html_file.write(html_bytes)

#Page parsing functions
def parse_page(file_path, out_file_path):
    """Extract search-result links from a saved HTML page.

    Only ``div`` elements with the search-result info class are parsed; each
    matching result anchor's ``href`` is prefixed with the site URL and
    appended to ``out_file_path`` on its own line, followed by a dashed
    separator line once all links are written.

    Args:
        file_path: path of the saved HTML snapshot to read.
        out_file_path: path of the text file the links are appended to.
    """
    result_cards = SoupStrainer('div', {"class": "search-result__info pt3 pb4 ph0"})

    # Read raw bytes and tell BeautifulSoup the snapshot is UTF-8 (that is how
    # save_current_html encodes it) instead of relying on the locale's default
    # text encoding. The document is fully read by the constructor, so the
    # handle can be closed right after.
    with open(file_path, 'rb') as html_file:
        soup = BeautifulSoup(html_file, "html.parser",
                             parse_only=result_cards, from_encoding='utf-8')

    anchors = soup.find_all('a', {"class": 'search-result__result-link ember-view'})
    for anchor in anchors:
        href = anchor.get('href')
        if href is None:
            # An anchor without href would make the concatenation below fail.
            continue
        write_results(_url + href + '\n', out_file_path)
    write_results('----------------------------------------------\n', out_file_path)

def write_results(results, out_file_path):
    """Append ``results`` to the file at ``out_file_path``.

    Args:
        results: text to append; written as-is (no newline is added).
        out_file_path: path of the output file; created if it does not exist.
    """
    # Context manager guarantees the handle is closed even if the write fails.
    with open(out_file_path, 'a') as out_file:
        out_file.write(results)

def teardown():
    """Delete the temporary HTML snapshot at ``_file_path`` (best effort).

    A missing file is reported with the original message; any other OS-level
    failure (e.g. permissions) is reported as such instead of being mislabelled.
    Nothing is raised for OS errors, so cleanup never aborts the run, but
    unrelated exceptions such as KeyboardInterrupt are no longer swallowed.
    """
    import errno  # local import: only needed for the ENOENT check below

    try:
        remove(_file_path)
    except OSError as err:
        if err.errno == errno.ENOENT:
            print('Temp file does not exist')
        else:
            print('Could not remove temp file: %s' % err)

def init_arguments():
    """Build the command-line parser for the scraper.

    Registers three optional string options:
    ``-u`` (stored as ``user``), ``-p`` (stored as ``password``) and
    ``-key`` (stored as ``keyword``).

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    # (flag, destination attribute, help text) for every supported option.
    option_specs = (
        ("-u", "user", " User Name  "),
        ("-p", "password", " Password "),
        ("-key", "keyword", " Key Words "),
    )

    cli = argparse.ArgumentParser()
    for flag, dest_name, help_text in option_specs:
        cli.add_argument(flag, dest=dest_name, help=help_text)
    return cli

def parse_arguments(parser):
    """Parse the process command line and attach the site URL.

    Args:
        parser: an argument parser exposing ``parse_args()``.

    Returns:
        The parsed namespace with an extra ``url`` attribute set to ``_url``.
    """
    namespace = parser.parse_args()
    setattr(namespace, 'url', _url)
    return namespace

def main():
    """Run the scraper end to end: parse args, search, extract links, clean up.

    Cleanup (temp-file removal and closing the browser) runs in ``finally``
    blocks so that a failure anywhere in argument parsing, scraping or page
    parsing does not leave the temp file or the Firefox session behind.
    """
    try:
        parser = init_arguments()
        args = parse_arguments(parser)
        login_and_search(args)
        parse_page(_file_path, _out_file_path)
    finally:
        try:
            teardown()
        finally:
            # Quit the browser even if temp-file cleanup itself blows up.
            _driver.quit()

# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
	main()