
Commit a5274df

Merge pull request #166 from huangsam/bugfix/url-discovery
Improve url detection
2 parents ff1b657 + 938b504

File tree

2 files changed: +51 -35 lines changed

.gitignore (-4)

@@ -18,10 +18,6 @@ develop-eggs
 # Installer logs
 pip-log.txt
 
-# URL logs
-urlin.txt
-urlout.txt
-
 # Unit test / coverage reports
 .coverage
 .tox

check_urls.py (+51 -31)
@@ -2,8 +2,11 @@
 from concurrent import futures
 import multiprocessing as mp
 import os
+import json
 import uuid
 
+from bs4 import BeautifulSoup
+from markdown import markdown
 import requests
 import urllib3
 
@@ -12,19 +15,36 @@
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 # Avoid rate limiting (tcp)
-_URL_BOT_ID = 'Bot {id}'.format(id=str(uuid.uuid4()))
-URL_HEADERS = {'User-Agent': _URL_BOT_ID}
-URL_TIMEOUT = 10.0
-
-# Sources of data (file)
-IN_PATH = os.path.join(os.getcwd(), 'urlin.txt')
-OUT_PATH = os.path.join(os.getcwd(), 'urlout.txt')
-
-# Collect repository URLs (bash)
-_URL_RE = 'https?:\/\/[=a-zA-Z0-9\_\/\?\&\.\-]+' # proto://host+path+params
-_FIND_URLS = "find . -type f | xargs grep -hEo '{regex}'".format(regex=_URL_RE)
-_FILTER_URLS = "sed '/Binary/d' | sort | uniq > {urlin}".format(urlin=IN_PATH)
-COMMAND = '{find} | {filter}'.format(find=_FIND_URLS, filter=_FILTER_URLS)
+URL_BOT_ID = f'Bot {str(uuid.uuid4())}'
+
+
+def extract_urls_from_html(content, all_urls):
+    soup = BeautifulSoup(content, 'html.parser')
+    for a in soup.find_all('a', href=True):
+        url = a['href']
+        if url.startswith('http'):
+            all_urls.add(url)
+
+
+def extract_urls(discover_path):
+    exclude = ['.git', '.vscode']
+    all_urls = set()
+    max_strlen = -1
+    for root, dirs, files in os.walk(discover_path, topdown=True):
+        dirs[:] = [d for d in dirs if d not in exclude]
+        for file in files:
+            output = f'Currently checking: file={file}'
+            file_path = os.path.join(root, file)
+            if max_strlen < len(output):
+                max_strlen = len(output)
+            print(output.ljust(max_strlen), end='\r')
+            if file_path.endswith('.html'):
+                content = open(file_path)
+                extract_urls_from_html(content, all_urls)
+            elif file_path.endswith('.markdown'):
+                content = markdown(open(file_path).read())
+                extract_urls_from_html(content, all_urls)
+    return all_urls
 
 
 def run_workers(work, data, worker_threads=mp.cpu_count()*4):
@@ -42,13 +62,15 @@ def get_url_status(url):
     clean_url = url.strip('?.')
     try:
         response = requests.get(
-            clean_url, verify=False, timeout=URL_TIMEOUT,
-            headers=URL_HEADERS)
+            clean_url, verify=False, timeout=10.0,
+            headers={'User-Agent': URL_BOT_ID})
         return (clean_url, response.status_code)
     except requests.exceptions.Timeout:
         return (clean_url, 504)
     except requests.exceptions.ConnectionError:
         return (clean_url, -1)
+    except requests.exceptions.TooManyRedirects:
+        return (clean_url, -1)
 
 
 def bad_url(url_status):
@@ -65,22 +87,20 @@ def bad_url(url_status):
 
 def main():
     print('Extract urls...')
-    os.system(COMMAND)
-    with open(IN_PATH, 'r') as fr:
-        urls = map(lambda l: l.strip('\n'), fr.readlines())
-    with open(OUT_PATH, 'w') as fw:
-        url_id = 1
-        max_strlen = -1
-        for url_path, url_status in run_workers(get_url_status, urls):
-            output = 'Currently checking: id={uid} host={uhost}'.format(
-                uid=url_id, uhost=urllib3.util.parse_url(url_path).host)
-            if max_strlen < len(output):
-                max_strlen = len(output)
-            print(output.ljust(max_strlen), end='\r')
-            if bad_url(url_status) is True:
-                fw.write('{}: {}\n'.format(url_path, url_status))
-            url_id += 1
-    print('\nDone.')
+    all_urls = extract_urls(os.getcwd())
+    print('\nCheck urls...')
+    bad_urls = {}
+    url_id = 1
+    max_strlen = -1
+    for url_path, url_status in run_workers(get_url_status, all_urls):
+        output = f'Currently checking: id={url_id} host={urllib3.util.parse_url(url_path).host}'
+        if max_strlen < len(output):
+            max_strlen = len(output)
+        print(output.ljust(max_strlen), end='\r')
+        if bad_url(url_status) is True:
+            bad_urls[url_path] = url_status
+        url_id += 1
+    print(f'\nBad urls: {json.dumps(bad_urls, indent=4)}')
 
 
 if __name__ == '__main__':
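
As a quick illustration of the new BeautifulSoup-based extraction path added in this commit, the following minimal sketch exercises extract_urls_from_html directly. It assumes check_urls.py is importable from the repository root; the sample HTML string is purely illustrative and not part of the commit.

# Minimal sketch, assuming check_urls.py is importable from the repo root.
# The sample HTML below is illustrative only and not part of the commit.
from check_urls import extract_urls_from_html

sample_html = (
    '<a href="https://example.com/guide">guide</a>'
    '<a href="/relative/path">ignored</a>'
)
found = set()
extract_urls_from_html(sample_html, found)
print(found)  # {'https://example.com/guide'} -- only absolute http(s) links are collected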
