Skip to content

Commit 2848374

Browse files
committed
find lost files
1 parent e5ef009 commit 2848374

File tree

6 files changed

+340
-0
lines changed

6 files changed

+340
-0
lines changed

猎聘网/ExeclUtils.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# coding=utf-8
2+
"""
3+
@author:SoS
4+
@data:2018/3/19
5+
@version:Python3.6
6+
"""
7+
import xlwt
8+
9+
class ExeclUtils():
    """Helpers for creating and writing xlwt Excel workbooks."""

    @staticmethod
    def create_execl(sheet_name, row_titles):
        """Create a workbook with one sheet whose row 0 holds the column titles.

        sheet_name: name of the sheet to create.
        row_titles: list of column-title strings written into row 0.
        Returns (workbook, sheet).
        """
        workbook = xlwt.Workbook()
        sheet = workbook.add_sheet(sheet_name, cell_overwrite_ok=True)
        # Row 0 is the header row; data rows start at 1.
        for col, title in enumerate(row_titles):
            sheet.write(0, col, title)
        return workbook, sheet

    @staticmethod
    def write_execl(execl_file, execl_sheet, count, data, execl_name):
        """Write one row of cells and save the workbook to disk.

        execl_file: workbook object returned by create_execl.
        execl_sheet: sheet object returned by create_execl.
        count: row index to write into (1-based; row 0 is the header).
        data: list of cell values for this row.
        execl_name: output file name.

        NOTE(review): xlwt only produces the legacy .xls format, but some
        callers save with an .xlsx name — confirm the files open correctly.
        """
        for col, value in enumerate(data):
            execl_sheet.write(count, col, value)
        # Saved after every row so partial results survive a crash,
        # at the cost of rewriting the file each time.
        execl_file.save(execl_name)

猎聘网/Main.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# coding=utf-8
2+
"""
3+
@author:SoS
4+
@data:2018/3/19
5+
@version:Python3.6
6+
"""
7+
from cooperated_re import JobRe
8+
from cooperated_bs import JobBs
9+
from cooperated_xpath import JobXpath
10+
11+
class Main():
    """Console entry point: let the user pick a crawler implementation."""

    @staticmethod
    def select_type():
        """Prompt for a crawler type and run the matching spider.

        1 -> xpath, 2 -> BeautifulSoup4, any other number -> re.
        Non-numeric input prints a message and returns without crawling.
        """
        choice = input('请输入爬虫类型:\n1.xpath\n2.BeatuifulSoup4\n3.re\n')
        try:
            choice = int(choice)
        except ValueError:
            # BUG FIX: the original bare `except` printed this message but
            # then fell through with a string value, which always matched
            # the `else` branch and ran the re crawler on invalid input.
            print("请您正确输入")
            return
        print("您已输入 ", choice)
        if choice == 1:
            print("开始xpath爬取数据....")
            xpath = JobXpath()
            xpath.crawler_data()
        elif choice == 2:
            print("开始bs4爬取数据....")
            bs = JobBs()
            bs.crawler_data()
        else:
            print("开始re爬取数据")
            re = JobRe()
            re.crawler_data()
        print("爬取完毕")


if __name__ == '__main__':
    Main.select_type()

猎聘网/Spider.py

+96
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# coding=utf-8
2+
"""
3+
@author:SoS
4+
@data:2018/3/19
5+
@version:Python3.6
6+
"""
7+
import abc
8+
import time
9+
import requests
10+
from ExeclUtils import ExeclUtils
11+
12+
# abstract class
13+
# Abstract base class shared by the xpath / bs4 / re crawler implementations.
class Spider(metaclass=abc.ABCMeta):
    """Abstract base crawler for liepin.com Python job listings.

    Subclasses implement parse_job_list / parse_job_details; this base
    class handles the HTTP requests, the shared row buffer and the Excel
    workbook setup.

    BUG FIX: the original used the Python 2 idiom `__metaclass__ =
    abc.ABCMeta`, which is ignored in Python 3 (the file targets 3.6),
    so the abstract methods were never actually enforced.
    """

    def __init__(self):
        # Column titles for row 0 of the output sheet.
        self.row_title = ['标题','待遇','地区','学历要求','经验','公司名称','所属行业','职位描述']
        sheet_name = "猎聘网"
        self.execl_f, self.sheet_info = ExeclUtils.create_execl(sheet_name, self.row_title)
        # Cells accumulated for the row currently being built.
        self.job_data = []
        # Rows written so far; data rows start at 1 (row 0 is the header).
        self.count = 0

    def crawler_data(self):
        """Crawl the first 5 result pages of the Python job search."""
        for page in range(0, 5):
            url = 'https://www.liepin.com/zhaopin/?industryType=&jobKind=&sortFlag=15&degradeFlag=0&industries=&salary=&compscale=&key=Python&clean_condition=&headckid=4a4adb68b22970bd&d_pageSize=40&siTag=p_XzVCa5J0EfySMbVjghcw~fA9rXquZc5IkJpXC-Ycixw&d_headId=62ac45351cdd7a103ac7d50e1142b2a0&d_ckId=62ac45351cdd7a103ac7d50e1142b2a0&d_sfrom=search_fp&d_curPage=0&curPage={}'.format(page)
            self.request_job_list(url)
            time.sleep(2)  # throttle requests between pages

    def request_job_list(self, url):
        """Fetch one listing page and hand the HTML to parse_job_list.

        Non-200 responses are silently skipped; network/parsing errors
        are printed rather than raised so one bad page does not stop
        the crawl.
        """
        try:
            headers = {
                'Referer':'https://www.liepin.com/',
                'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
            }
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                return
            self.parse_job_list(response.text)
        except Exception as e:
            print('request_job_list error : {}'.format(e))

    @abc.abstractmethod
    def parse_job_list(self, text):
        """Parse the listing-page HTML; implemented by subclasses."""

    def request_job_details(self, url):
        """Fetch one job-detail page and hand the HTML to parse_job_details."""
        try:
            headers = {
                'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
            }
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                return
            self.parse_job_details(response.text)
        except Exception as e:
            print('request_job_details error : {}'.format(e))

    @abc.abstractmethod
    def parse_job_details(self, text):
        """Parse the detail-page HTML; implemented by subclasses."""

    def append(self, title, salary, region, degree, experience, name, industry):
        """Append the 7 listing-page cells of one row to the buffer.

        The 8th column (职位描述) is appended later by parse_job_details.
        """
        self.job_data.append(title)
        self.job_data.append(salary)
        self.job_data.append(region)
        self.job_data.append(degree)
        self.job_data.append(experience)
        self.job_data.append(name)
        self.job_data.append(industry)

    def data_clear(self):
        """Reset the row buffer after a row has been written out."""
        self.job_data = []

    def extract(self, data):
        """Return the first element of a result list, or "" if it is empty."""
        return data[0] if len(data) > 0 else ""

猎聘网/cooperated_bs.py

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# coding=utf-8
2+
"""
3+
@author:SoS
4+
@data:2018/3/20
5+
@version:Python3.6
6+
"""
7+
import time
8+
from urllib import parse
9+
from Spider import Spider
10+
from bs4 import BeautifulSoup
11+
from ExeclUtils import ExeclUtils
12+
13+
14+
class JobBs(Spider):
    """Crawler implementation that parses pages with BeautifulSoup."""

    def __init__(self):
        super(JobBs, self).__init__()

    def parse_job_list(self, text):
        """Extract job rows from a listing page and fetch each detail page."""
        try:
            soup = BeautifulSoup(text, 'lxml')
            divs = soup.select('.sojob-item-main.clearfix')
            for div in divs:
                title = self.extract(div.select('.job-info > h3'))['title']
                href = self.extract(div.select('.job-info > h3 a'))['href']

                result = self.extract(div.select('.job-info > p'))
                if hasattr(result, 'title'):
                    # title attribute looks like "salary_region_degree_experience"
                    result = result['title'].split('_')
                else:
                    # BUG FIX: the fallback previously had only 3 elements,
                    # so reading result[3] below raised IndexError whenever
                    # this branch was taken.
                    result = ['', '', '', '']
                salary = result[0]
                region = result[1]
                degree = result[2]
                experience = result[3]
                # NOTE(review): extract() returns "" on an empty selection,
                # and "".string raises — relies on the outer except; confirm.
                name = self.extract(div.select('.company-info.nohover > p a')).string
                industry = self.extract(div.select('.company-info.nohover .field-financing span a')).string
                self.append(title, salary, region, degree, experience, name, industry)
                print(self.job_data)
                self.request_job_details(parse.urljoin('https://www.liepin.com', href))
                time.sleep(1)  # throttle detail requests
        except Exception as e:
            print("parse_job_list error :", str(e))

    def parse_job_details(self, text):
        """Extract the job description from a detail page and persist the row."""
        try:
            soup2 = BeautifulSoup(text, 'lxml')
            detail = soup2.select('.content.content-word')
            if detail:
                self.job_data.append(detail[0].get_text())
            else:
                self.job_data.append("暂无信息")
            self.count += 1
            ExeclUtils.write_execl(self.execl_f, self.sheet_info, self.count, self.job_data, "猎聘网_bs.xlsx")
            print("crawel ", self.count, "条数据")
            self.data_clear()
        except Exception as e:
            print("parse_job_details error : ", str(e))

猎聘网/cooperated_re.py

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# coding=utf-8
2+
"""
3+
@author:SoS
4+
@data:2018/3/20
5+
@version:Python3.6
6+
"""
7+
import re
8+
import time
9+
from urllib import parse
10+
from Spider import Spider
11+
from ExeclUtils import ExeclUtils
12+
13+
14+
class JobRe(Spider):
    """Crawler implementation that parses pages with regular expressions."""

    def __init__(self):
        super(JobRe, self).__init__()

    def parse_job_list(self, text):
        """Extract job rows from a listing page and fetch each detail page."""
        try:
            pattern = re.compile('<div class="job-info">'
                                 '.*?<h3.*?title="(.*?)">.*?<a href="(.*?)".*?title="(.*?)">.*?<p class="company-name">.*?>(.*?)</a>.*?<p class="field-financing">.*?target="_blank">(.*?)</a>.*?</span>', re.S)
            datas = re.findall(pattern, text)
            for data in datas:
                title = data[0]
                href = data[1]
                # title attribute looks like "salary_region_degree_experience"
                result = data[2].split('_')
                salary = result[0]
                region = result[1]
                degree = result[2]
                experience = result[3]
                name = data[3]
                industry = data[4]
                self.append(title, salary, region, degree,
                            experience, name, industry)
                print(self.job_data)
                self.request_job_details(parse.urljoin(
                    'https://www.liepin.com', href))
                time.sleep(1)  # throttle detail requests
        except Exception as e:
            print("re parse_job_list error : ", str(e))

    def parse_job_details(self, text):
        """Extract the job description from a detail page and persist the row."""
        try:
            pattern = re.compile(
                '<div class="content content-word">(.*?)</div>.*?<div class="job-item main.*?">', re.S)
            match = re.search(pattern, text)
            # BUG FIX: re.search returns None when the page has no
            # description block; calling .group(1) on it raised
            # AttributeError and the row was never written.
            if match:
                detail = re.sub(re.compile('<[^>]+>', re.S), '', match.group(1))
            else:
                detail = ''
            if detail:
                self.job_data.append(detail)
            else:
                self.job_data.append("暂无职位信息")
            self.count += 1
            ExeclUtils.write_execl(self.execl_f, self.sheet_info, self.count, self.job_data, "猎聘网_re.xlsx")
            print("crawel ", self.count, "条数据")
            self.data_clear()
        except Exception as e:
            # BUG FIX: message previously said "parse_job_list" (copy-paste).
            print("re parse_job_details error : ", str(e))

猎聘网/cooperated_xpath.py

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# coding=utf-8
2+
"""
3+
@author:SoS
4+
@data:2018/3/19
5+
@version:Python3.6
6+
"""
7+
import time
8+
from lxml import etree
9+
from urllib import parse
10+
from Spider import Spider
11+
from ExeclUtils import ExeclUtils
12+
13+
14+
class JobXpath(Spider):
    """Crawler implementation that parses pages with lxml XPath."""

    def __init__(self):
        super(JobXpath, self).__init__()

    def parse_job_list(self, text):
        """Extract job rows from a listing page and fetch each detail page."""
        try:
            selector = etree.HTML(text)
            divs = selector.xpath('//div[@class="sojob-item-main clearfix"]')
            for div in divs:
                title = self.extract(div.xpath('./div[1]/h3/@title'))
                data = self.extract(div.xpath('./div[1]/p[1]/@title'))
                # title attribute looks like "salary_region_degree_experience"
                data = data.split("_")
                salary = data[0]
                region = data[1]
                degree = data[2]
                experience = data[3]
                name = self.extract(div.xpath('./div[2]/p[1]/a/text()'))
                industry = self.extract(
                    div.xpath('./div[2]/p[2]/span/a/text()'))
                href = self.extract(div.xpath('./div[1]/h3/a/@href'))

                self.append(title, salary, region, degree,
                            experience, name, industry)
                print(self.job_data)
                self.request_job_details(parse.urljoin(
                    'https://www.liepin.com', href))
                time.sleep(1)  # throttle detail requests
        except Exception as e:
            print('parse_job_list error : {}'.format(e))

    def parse_job_details(self, text):
        """Extract the job description from a detail page and persist the row."""
        try:
            selector = etree.HTML(text)
            data = selector.xpath('//div[@class="about-position"]/div[3]')
            # string(.) flattens all descendant text; spaces are stripped
            # throughout because strip() only trims the ends.
            detail = data[0].xpath('string(.)').replace(" ", "")
            # BUG FIX: the original used `detail is ""` — identity
            # comparison with a string literal is unreliable (and a
            # SyntaxWarning on modern Python); test the value instead.
            if not detail:
                self.job_data.append("职位无介绍")
            else:
                self.job_data.append(detail)
            self.count += 1
            ExeclUtils.write_execl(
                self.execl_f, self.sheet_info, self.count, self.job_data, "猎聘网_xpath.xlsx")
            print("crawel ", self.count, "条数据")
            self.data_clear()
        except Exception as e:
            print('parse_job_details error : {}'.format(e))

0 commit comments

Comments
 (0)