1
0
mirror of https://github.com/deadc0de6/dotdrop.git synced 2026-02-04 21:29:43 +00:00
Files
dotdrop/scripts/check_links.py
2023-01-19 23:02:31 +01:00

98 lines
2.3 KiB
Python
Executable File

#!/usr/bin/env python3
"""
author: deadc0de6 (https://github.com/deadc0de6)
Copyright (c) 2023, deadc0de6
URL checking script
"""
import sys
import re
from urllib.parse import urlparse
import requests
TIMEOUT = 3
VALID_RET = [
200,
302,
]
IGNORES = [
'badgen.net',
]
USER_AGENT = (
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/58.0.3029.110 Safari/537.36'
)
HEADERS = {
'User-Agent': USER_AGENT,
}
PATTERN = (
r"https?://[a-zA-Z0-9][a-zA-Z0-9-]{1,61}"
r"[a-zA-Z0-9]\.[=a-zA-Z0-9\_\/\?\&\%\+\#\.\-]+"
)
def get_links(path):
"""get a list of URLS"""
with open(path, encoding='utf-8') as file:
content = file.read()
entries = re.findall(PATTERN, content)
urls = list(set(entries))
return urls
def check_links(urls):
"""check urls"""
cnt = 0
ign = 0
for url in urls:
cnt += 1
hostname = urlparse(url).hostname
if hostname in IGNORES:
print(f' [IGN] {url}')
ign += 1
continue
verb = 'head'
ret = requests.head(url,
timeout=TIMEOUT,
allow_redirects=True,
headers=HEADERS).status_code
if ret not in VALID_RET:
verb = 'get'
ret = requests.get(url,
timeout=TIMEOUT,
allow_redirects=True,
headers=HEADERS).status_code
if ret not in VALID_RET:
print(f' [ERROR] {url} returned {ret}')
return False
print(f' [OK-{verb}-{ret}] {url}')
print(f'OK - total {cnt} links checked ({ign} ignored)')
return True
if __name__ == '__main__':
if len(sys.argv) < 2:
print(f'usage: {sys.argv[0]} <path>')
sys.exit(1)
print(f'checking {sys.argv[1]} for links...')
links = get_links(sys.argv[1])
print(f' found {len(links)} links')
try:
if not check_links(links):
sys.exit(1)
except ValueError as exc:
print(f'error {exc}')
sys.exit(1)
except urlparse.URLError as exc:
print(f'urlparse error {exc}')
sys.exit(1)
except requests.exceptions.RequestException as exc:
print(f'requests error {exc}')
sys.exit(1)
sys.exit(0)