2 files changed: +41 -7 lines changed

New file:

+ #!/usr/bin/python2.7
+ # encoding=utf-8
+
+ import codecs
+ from pyquery import PyQuery as pq
+ DOWNLOAD_URL = 'https://movie.douban.com/top250'
+
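+ # Parse one page of the Top 250 list: collect the movie titles, merging each
+ # alternate title (which starts with '/') into the preceding entry, and return
+ # the list together with the next-page URL, or None on the last page.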
+ def parse_html(url):
+     html = pq(url=url)
+     top_name_items = html('ol.grid_view li div.hd span.title,.other')
+     top_name_list = []
+     for item in top_name_items.items():
+         if item.text()[0] == '/':
+             top_name = top_name_list.pop() + ' ' + item.text()
+             top_name_list.append(top_name)
+         else:
+             top_name_list.append(item.text())
+     next_page = html('div.paginator span.next a').attr('href')
+     if next_page:
+         return top_name_list, DOWNLOAD_URL + next_page
+     return top_name_list, None
+
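+ # Follow the pagination until there is no next page, writing all titles to a
+ # UTF-8 file named 'top250'.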
+ if __name__ == '__main__':
+     url = DOWNLOAD_URL
+     with codecs.open('top250', 'wb', encoding='utf-8') as fp:
+         while url:
+             movies, url = parse_html(url)
+             fp.write(u'{movies}\n'.format(movies='\n'.join(movies)))
+
+
+ #print html('ol.grid_view li div.hd span.title').text()
Changed file:

  import re
  from pyquery import PyQuery as pq
  from pybloom import BloomFilter
- # bf = BloomFilter()
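+ # Bloom filter used to remember URLs that have already been visited.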
+ bf = BloomFilter(capacity=1000, error_rate=0.001)
  url = 'http://www.baidu.com'
+ bf.add(url)
+ level = 0


  #html = urllib.urlopen(url)
  #soup = bs(html,'lxml')
  #init_html = soup.prettify()
  #doc = pq(init_html)
  #print doc
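+ # Crawl outgoing links recursively: print each URL indented by its depth,
+ # skip links already recorded in the Bloom filter, and stop descending once
+ # level reaches max_level.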
- def search_url(url):
+ def search_url(url, max_level, level=0):
      try:
-
+         print '|' + '-' * level + url
          doc = pq(url=url)
          pq_items = doc('[href]').items()
          for item in pq_items:
              url_new = item.attr('href')
-             if re.findall(r'https?.+',url_new):
-                 print url_new
-                 search_url(url_new)
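+             # Follow only http(s) links that are not yet in the Bloom filter,
+             # and only while the current depth is below max_level.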
+             if re.findall(r'https?.+',url_new) and url_new not in bf and level < max_level:
+                 new_level = level + 1
+                 bf.add(url_new)
+                 search_url(url_new, max_level=max_level, level=new_level)
      except:
          return False
- search_url(url)
+ search_url(url, max_level=2)
  # print item.attr('href') if _re