Skip to content

Commit 8c2ee5a

Browse files
committed
豆瓣电影top250抓取
1 parent 8cda58d commit 8c2ee5a

File tree

2 files changed

+41
-7
lines changed

2 files changed

+41
-7
lines changed

爬虫/douban_top250.py

+31
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,31 @@
1+
#!/usr/bin/python2.7
2+
# encoding=utf-8
3+
4+
import codecs
5+
from pyquery import PyQuery as pq
6+
DOWNLOAD_URL = 'https://movie.douban.com/top250'
7+
8+
def _merge_alias_titles(texts):
    """Fold alias fragments (texts starting with '/') into the preceding title."""
    merged = []
    for text in texts:
        # startswith() is safe on empty strings, unlike text[0]; the
        # `merged` guard avoids popping from an empty list when the very
        # first fragment happens to be an alias.
        if text.startswith('/') and merged:
            merged[-1] = merged[-1] + ' ' + text
        else:
            merged.append(text)
    return merged


def parse_html(url):
    """Parse one chart page and return its titles plus the next page URL.

    Args:
        url: address of the page to load through PyQuery.

    Returns:
        A ``(titles, next_url)`` tuple.  ``titles`` is a list with one
        string per movie, where fragments beginning with '/' have been
        appended (space-separated) to the title that precedes them.
        ``next_url`` is ``DOWNLOAD_URL`` plus the paginator's "next" href,
        or ``None`` when the page has no such link.
    """
    html = pq(url=url)
    # NOTE(review): the comma makes '.other' an independent selector that is
    # not scoped to 'ol.grid_view' -- confirm this is intended.
    top_name_items = html('ol.grid_view li div.hd span.title,.other')
    top_name_list = _merge_alias_titles(
        [item.text() for item in top_name_items.items()])

    next_page = html('div.paginator span.next a').attr('href')
    if next_page:
        return top_name_list, DOWNLOAD_URL + next_page
    return top_name_list, None
22+
23+
if __name__ == '__main__':
    # Follow the pagination chain starting at DOWNLOAD_URL and dump every
    # page's titles, one per line, into the UTF-8 file 'top250'.
    with codecs.open('top250', 'wb', encoding='utf-8') as out:
        page_url = DOWNLOAD_URL
        while page_url:
            titles, page_url = parse_html(page_url)
            joined = '\n'.join(titles)
            # Each page's batch is terminated by a newline so that
            # consecutive pages do not run together.
            out.write(u'{movies}\n'.format(movies=joined))
29+
30+
31+
#print html('ol.grid_view li div.hd span.title').text()

爬虫/test_bs.py

+10-7
Original file line number | Diff line number | Diff line change
@@ -3,25 +3,28 @@
33
import re
44
from pyquery import PyQuery as pq
55
from pybloom import BloomFilter
6-
#bf = BloomFilter()
6+
bf = BloomFilter(capacity=1000,error_rate=0.001)
77
url = 'http://www.baidu.com'
8+
bf.add(url)
9+
level = 0
810

911
#html = urllib.urlopen(url)
1012
#soup = bs(html,'lxml')
1113
#init_html = soup.prettify()
1214
#doc = pq(init_html)
1315
#print doc
14-
def search_url(url, max_level, level=0):
    """Recursively print the tree of links reachable from *url*.

    Each visited URL is printed as '|' followed by one '-' per nesting
    level.  Links are followed only while ``level < max_level`` and only
    if they are absolute http(s) URLs not yet recorded in the module-level
    Bloom filter ``bf``.

    Args:
        url: page to visit.
        max_level: depth at which pages are printed but no longer expanded.
        level: current depth; callers normally leave this at 0.

    Returns:
        ``False`` if an error interrupted processing of this page,
        otherwise ``None``.
    """
    try:
        print('|' + '-' * level + url)
        # The depth limit does not change inside the loop, so check it once
        # here and skip downloading a page whose links would all be ignored.
        if level >= max_level:
            return None
        doc = pq(url=url)
        for item in doc('[href]').items():
            url_new = item.attr('href')
            # Anchor the scheme test at the start of the href; an
            # unanchored search also accepts relative or javascript: links
            # that merely contain "http" somewhere.
            if not url_new or not re.match(r'https?://', url_new):
                continue
            if url_new in bf:
                continue
            bf.add(url_new)
            search_url(url_new, max_level=max_level, level=level + 1)
    except Exception:
        # Best-effort crawl: a failing page (or a full Bloom filter) ends
        # this branch only.  Catching Exception rather than everything lets
        # KeyboardInterrupt/SystemExit propagate so the crawl can be stopped.
        return False
26-
search_url(url)
29+
search_url(url,max_level=2)
2730
# print item.attr('href') if _re

0 commit comments

Comments
 (0)