Scanning a Website for External Links
Sometimes a client suddenly sends over a scan report full of external links that need to be dealt with, but the report never says which page or which JS file each link actually appears in, which makes tracking them down frustrating.
The steps below assume a Linux machine with a Python environment (Windows works too; Linux is used as the example here).
1. Prepare the environment
pip3 install requests bs4 beautifulsoup4 --break-system-packages
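Optionally, a quick import check (just a sketch, independent of the script below) confirms the packages are available:

# Optional sanity check: requests and BeautifulSoup should import cleanly
import requests, bs4
print("requests", requests.__version__)
print("beautifulsoup4", bs4.__version__)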
2. Run the script
On Linux, copy the code block below to generate site_scanner.py, then run:
python3 site_scanner.py
MAX_THREADS is the maximum number of threads.
MAX_DEPTH is the maximum crawl depth.
BASE_URL is the URL to scan.
When the scan finishes, scan_result.txt is generated in the current directory.
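For reference, the configuration block near the top of the script might look like this once edited (the URL below is only a placeholder):

BASE_URL = "https://example.com"   # placeholder: the site to scan
MAX_THREADS = 50                   # maximum number of worker threads
MAX_DEPTH = 3                      # maximum crawl depth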
cat << 'EOF' > site_scanner.py
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor
import re
import urllib3
# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# --- Configuration ---
BASE_URL = "https://example.com"  # replace with the URL to scan
MAX_THREADS = 50 # Optimized for N100
MAX_DEPTH = 3
# ---------------------
TARGET_DOMAIN = urlparse(BASE_URL).netloc
visited_urls = set()
external_found = set()
scanned_assets = set()
URL_PATTERN = re.compile(r'https?://[^\s\'"<>]+')
def analyze_js(js_url):
    if js_url in scanned_assets:
        return
    scanned_assets.add(js_url)
    try:
        print(f"  [+] Analyzing JS: {js_url}")
        res = requests.get(js_url, timeout=3, verify=False)
        # Pull every absolute URL embedded in the JS source
        found = URL_PATTERN.findall(res.text)
        for link in found:
            link_domain = urlparse(link).netloc
            if link_domain and TARGET_DOMAIN not in link_domain:
                # Skip common spec/namespace URLs that are not real external references
                if not any(x in link for x in ['w3.org', 'schema.org']):
                    external_found.add(link)
    except Exception:
        pass
def scan_page(url, depth=1):
    if depth > MAX_DEPTH or url in visited_urls:
        return
    # Check whether the URL belongs to the target domain; record externals and stop
    parsed_url = urlparse(url)
    if TARGET_DOMAIN not in parsed_url.netloc and parsed_url.netloc != "":
        external_found.add(url)
        return
    visited_urls.add(url)
    try:
        print(f"[*] Scanning ({depth}/{MAX_DEPTH}): {url}")
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        res = requests.get(url, timeout=5, headers=headers, verify=False)
        soup = BeautifulSoup(res.text, 'html.parser')
        # 1. Collect links and JS assets on this page
        local_links = []
        for a in soup.find_all('a', href=True):
            full_url = urljoin(url, a['href'])
            local_links.append(full_url)
        for script in soup.find_all('script', src=True):
            js_url = urljoin(url, script['src'])
            analyze_js(js_url)
        # 2. Crawl the collected links with a thread pool
        if depth < MAX_DEPTH:
            with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
                for link in local_links:
                    executor.submit(scan_page, link, depth + 1)
    except Exception:
        pass
if __name__ == "__main__":
    print(f"Starting Fast Scanner on N100 (Threads: {MAX_THREADS})")
    print(f"Target: {BASE_URL}\n")
    scan_page(BASE_URL)
    print("\n" + "="*50)
    print("Scan Finished!")
    print(f"Pages visited: {len(visited_urls)}")
    print(f"JS analyzed: {len(scanned_assets)}")
    print(f"External links found: {len(external_found)}")
    print("="*50)
    with open("scan_result.txt", "w") as f:
        for link in sorted(external_found):
            print(link)
            f.write(link + "\n")
    print("\nResults saved to scan_result.txt")
EOF
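If the raw list is long, an optional post-processing sketch like the one below (it only assumes the one-URL-per-line format that the script writes to scan_result.txt) can group the external links by domain, so you can see at a glance which third-party hosts are referenced:

# Optional: summarize scan_result.txt by external domain
from collections import Counter
from urllib.parse import urlparse

with open("scan_result.txt") as f:
    domains = Counter(urlparse(line.strip()).netloc for line in f if line.strip())

for domain, count in domains.most_common():
    print(f"{count:4d}  {domain}")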