豆瓣电影top250抓取

ClessLi · ClessLi · commit 8c2ee5a8cb13 · 2018-04-10T14:53:40.000+08:00
diff --git a/爬虫/douban_top250.py b/爬虫/douban_top250.py
@@ -0,0 +1,31 @@
+#!/usr/bin/python2.7
+# encoding=utf-8
+
+import codecs
+from pyquery import PyQuery as pq
+DOWNLOAD_URL = 'https://movie.douban.com/top250'
+
+def parse_html(url):
+    """Fetch one Top-250 page; return (movie_names, next_page_url or None)."""
+    html = pq(url=url)
+    top_name_list = []
+    for item in html('ol.grid_view li div.hd span.title,.other').items():
+        text = item.text()
+        if not text:
+            continue  # empty span: nothing to record, and indexing text[0] would raise
+        if text.startswith('/') and top_name_list:
+            top_name_list[-1] += '  ' + text  # "/ alias" text belongs to the previous title
+        else:
+            top_name_list.append(text)
+    next_page = html('div.paginator span.next a').attr('href')  # None when there is no next link
+    return top_name_list, (DOWNLOAD_URL + next_page if next_page else None)
+
+if __name__ == '__main__':  # script entry: write every page's titles to the file 'top250'
+    with codecs.open('top250', 'wb', encoding='utf-8') as out_file:
+        next_url = DOWNLOAD_URL
+        while next_url:  # parse_html hands back None as the URL when a page has no next link
+            titles, next_url = parse_html(next_url)
+            out_file.write(u'{movies}\n'.format(movies='\n'.join(titles)))
+
+
+#print html('ol.grid_view li div.hd span.title').text()
diff --git a/爬虫/test_bs.py b/爬虫/test_bs.py
@@ -3,25 +3,28 @@
 import re
 from pyquery import PyQuery as pq
 from pybloom import BloomFilter
-#bf = BloomFilter()
+bf = BloomFilter(capacity=1000,error_rate=0.001)  # module-wide record of URLs already queued; NOTE(review): behaviour past 1000 entries depends on pybloom -- confirm
 url = 'http://www.baidu.com'
+bf.add(url)  # mark the seed as seen so links pointing back to it are skipped
+level = 0  # NOTE(review): search_url takes its own `level` argument; this global looks unused -- confirm
 
 #html = urllib.urlopen(url)
 #soup = bs(html,'lxml')
 #init_html = soup.prettify()
 #doc = pq(init_html)
 #print doc
-def search_url(url):
+def search_url(url,max_level,level=0):  # recursive crawl: fetch url, then follow unseen links while level < max_level; any error is swallowed by the bare except and yields False
     try:
-
+        print '|'+'-'*level+url  # trace line; the number of dashes equals the current recursion level
         doc = pq(url = url)
         pq_items = doc('[href]').items()
         for item in pq_items:
             url_new = item.attr('href')
-            if re.findall(r'https?.+',url_new):
-                print url_new
-                search_url(url_new)
+            if re.findall(r'https?.+',url_new) and url_new not in bf and level < max_level:  # NOTE(review): pattern is unanchored, so "http" anywhere in the href matches
+                new_level = level + 1
+                bf.add(url_new)  # record before descending so the same URL is not crawled again
+                search_url(url_new,max_level=max_level,level=new_level)
     except:
         return False
-search_url(url)
+search_url(url,max_level=2)  # start at the seed URL; links are followed at most two hops away from it
 #    print item.attr('href') if _re