extractSplunkPDFLinks.py
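# Fetches a Splunk documentation page, extracts links that match the
# /Documentation/Splunk/<version>/<section>/<docname> manual pattern, and
# prints each matching href together with a candidate "pdfbook" PDF URL.
# Requires: requests, beautifulsoup4 (pip install requests beautifulsoup4)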
import requests
from bs4 import BeautifulSoup
import re
import sys
# Regex to find links matching the documentation structure
manual_link_regex = re.compile(r"^/Documentation/Splunk/([\d.]+)/([^/]+)/(.+)$")
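# Example (hypothetical path): "/Documentation/Splunk/9.4/Alert/AlertWorkflow"
# would match with version="9.4", section="Alert", docname="AlertWorkflow".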
# Target URL for the specific Splunk version
target_url = "https://help.splunk.com/en/splunk-enterprise/alert-and-respond/alerting-manual/9.4/alerting-overview/"
extracted_links = []
print(f"Attempting to fetch and parse: {target_url}")
try:
    response = requests.get(
        target_url, timeout=30, headers={"User-Agent": "Link Extraction Script"}
    )
    response.raise_for_status()  # Check if the request was successful
    page = BeautifulSoup(response.text, "html.parser")
    # Find all anchor (<a>) tags with an href attribute
    all_links = page.find_all("a", href=True)
    print(
        f"Found {len(all_links)} total links. Filtering for documentation manual links..."
    )
    for link in all_links:
        href = link["href"]
        match = manual_link_regex.match(href)
        if match:
            # Extract the parts needed for the PDF URL construction
            version, section, docname = match.groups()
            # Store the matched relative href and the extracted parts;
            # prepend https://docs.splunk.com to relative_href if an
            # absolute URL is needed
            extracted_links.append(
                {
                    "relative_href": href,
                    "version": version,
                    "section": section,
                    "docname": docname,
                    "pdf_url_pattern": f"https://docs.splunk.com/index.php?title=Documentation:Splunk:{section}:{docname}:{version}&action=pdfbook",
                }
            )
            print(f" -> Matched: {href} (v={version}, s={section}, d={docname})")
except requests.exceptions.RequestException as e:
    print(f"Error fetching URL: {e}", file=sys.stderr)
except Exception as e:
    print(f"An error occurred during parsing: {e}", file=sys.stderr)
# --- Output the results ---
if not extracted_links:
    print("\nNo links matching the required pattern were found.")
    print(
        "The website structure might have changed, or the target page doesn't contain such links."
    )
else:
    print("\n--- Extracted Relative Links Matching Pattern ---")
    unique_hrefs = sorted(set(link["relative_href"] for link in extracted_links))
    for href in unique_hrefs:
        print(href)
    print("\n--- Corresponding PDF URL Patterns ---")
    unique_pdf_urls = sorted(set(link["pdf_url_pattern"] for link in extracted_links))
    for url in unique_pdf_urls:
        print(url)
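# --- Optional extension: a minimal download sketch, not part of the original
# script. It assumes the pdfbook endpoint returns a PDF without authentication
# and that a flattened "<docname>.pdf" is an acceptable local filename; both
# are unverified assumptions. Disabled by default so the script's behavior and
# output above are unchanged.
DOWNLOAD_PDFS = False
if DOWNLOAD_PDFS:
    for link in extracted_links:
        pdf_url = link["pdf_url_pattern"]
        # docname may itself contain "/" (the regex's last group matches the
        # rest of the path), so flatten it before using it as a filename
        filename = link["docname"].replace("/", "_") + ".pdf"
        print(f"Downloading {pdf_url} -> {filename}")
        try:
            pdf_response = requests.get(
                pdf_url, timeout=60, headers={"User-Agent": "Link Extraction Script"}
            )
            pdf_response.raise_for_status()
            with open(filename, "wb") as out:
                out.write(pdf_response.content)
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {pdf_url}: {e}", file=sys.stderr)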