from concurrent import futures
import multiprocessing as mp
import os
+ import json
import uuid

+ from bs4 import BeautifulSoup
+ from markdown import markdown
import requests
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Avoid rate limiting (tcp)
- _URL_BOT_ID = 'Bot {id}'.format(id=str(uuid.uuid4()))
- URL_HEADERS = {'User-Agent': _URL_BOT_ID}
- URL_TIMEOUT = 10.0
-
- # Sources of data (file)
- IN_PATH = os.path.join(os.getcwd(), 'urlin.txt')
- OUT_PATH = os.path.join(os.getcwd(), 'urlout.txt')
-
- # Collect repository URLs (bash)
- _URL_RE = 'https?:\/\/[=a-zA-Z0-9\_\/\?\&\.\-]+'  # proto://host+path+params
- _FIND_URLS = "find . -type f | xargs grep -hEo '{regex}'".format(regex=_URL_RE)
- _FILTER_URLS = "sed '/Binary/d' | sort | uniq > {urlin}".format(urlin=IN_PATH)
- COMMAND = '{find} | {filter}'.format(find=_FIND_URLS, filter=_FILTER_URLS)
+ URL_BOT_ID = f'Bot {str(uuid.uuid4())}'
+
+
+ def extract_urls_from_html(content, all_urls):
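+     # Collect absolute http(s) links from <a href> tags into all_urls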
+     soup = BeautifulSoup(content, 'html.parser')
+     for a in soup.find_all('a', href=True):
+         url = a['href']
+         if url.startswith('http'):
+             all_urls.add(url)
+
+
+ def extract_urls(discover_path):
+     exclude = ['.git', '.vscode']
+     all_urls = set()
+     max_strlen = -1
+     for root, dirs, files in os.walk(discover_path, topdown=True):
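+         # Prune excluded directories in place so os.walk skips them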
+         dirs[:] = [d for d in dirs if d not in exclude]
+         for file in files:
+             output = f'Currently checking: file={file}'
+             file_path = os.path.join(root, file)
+             if max_strlen < len(output):
+                 max_strlen = len(output)
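+             # Pad to the longest line printed so far, so the '\r' overwrite leaves no stray characters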
+             print(output.ljust(max_strlen), end='\r')
+             if file_path.endswith('.html'):
+                 content = open(file_path)
+                 extract_urls_from_html(content, all_urls)
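+             # Render Markdown to HTML first so the same link extractor applies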
+             elif file_path.endswith('.markdown'):
+                 content = markdown(open(file_path).read())
+                 extract_urls_from_html(content, all_urls)
+     return all_urls


def run_workers(work, data, worker_threads=mp.cpu_count() * 4):
@@ -42,13 +62,15 @@ def get_url_status(url):
    clean_url = url.strip('?.')
    try:
        response = requests.get(
-             clean_url, verify=False, timeout=URL_TIMEOUT,
-             headers=URL_HEADERS)
+             clean_url, verify=False, timeout=10.0,
+             headers={'User-Agent': URL_BOT_ID})
        return (clean_url, response.status_code)
    except requests.exceptions.Timeout:
        return (clean_url, 504)
    except requests.exceptions.ConnectionError:
        return (clean_url, -1)
+     except requests.exceptions.TooManyRedirects:
+         return (clean_url, -1)


def bad_url(url_status):
@@ -65,22 +87,20 @@ def bad_url(url_status):

def main():
    print('Extract urls...')
-     os.system(COMMAND)
-     with open(IN_PATH, 'r') as fr:
-         urls = map(lambda l: l.strip('\n'), fr.readlines())
-     with open(OUT_PATH, 'w') as fw:
-         url_id = 1
-         max_strlen = -1
-         for url_path, url_status in run_workers(get_url_status, urls):
-             output = 'Currently checking: id={uid} host={uhost}'.format(
-                 uid=url_id, uhost=urllib3.util.parse_url(url_path).host)
-             if max_strlen < len(output):
-                 max_strlen = len(output)
-             print(output.ljust(max_strlen), end='\r')
-             if bad_url(url_status) is True:
-                 fw.write('{}: {}\n'.format(url_path, url_status))
-             url_id += 1
-         print('\nDone.')
+     all_urls = extract_urls(os.getcwd())
+     print('\nCheck urls...')
+     bad_urls = {}
+     url_id = 1
+     max_strlen = -1
+     for url_path, url_status in run_workers(get_url_status, all_urls):
+         output = f'Currently checking: id={url_id} host={urllib3.util.parse_url(url_path).host}'
+         if max_strlen < len(output):
+             max_strlen = len(output)
+         print(output.ljust(max_strlen), end='\r')
+         if bad_url(url_status) is True:
+             bad_urls[url_path] = url_status
+         url_id += 1
+     print(f'\nBad urls: {json.dumps(bad_urls, indent=4)}')


if __name__ == '__main__':