From 23422259c7436a409e6aa5d640f3fdc87e06a64e Mon Sep 17 00:00:00 2001 From: Allison Reinheimer Moore Date: Tue, 25 Jul 2023 16:41:23 -0400 Subject: [PATCH 1/9] add sitemap index generator --- .gitignore | 1 + sitemap-index/.python-version | 1 + sitemap-index/sitemap-index-generator.py | 58 ++++++++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 sitemap-index/.python-version create mode 100644 sitemap-index/sitemap-index-generator.py diff --git a/.gitignore b/.gitignore index 4cb7f9c..deb9500 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ manifestseption/manifests* manifestseption/search-manifests* manifestseption/sites-temp* snooty/docs-master.zip +sitemap-index/sitemap-index.xml diff --git a/sitemap-index/.python-version b/sitemap-index/.python-version new file mode 100644 index 0000000..1e33456 --- /dev/null +++ b/sitemap-index/.python-version @@ -0,0 +1 @@ +3.11.2 diff --git a/sitemap-index/sitemap-index-generator.py b/sitemap-index/sitemap-index-generator.py new file mode 100644 index 0000000..651409a --- /dev/null +++ b/sitemap-index/sitemap-index-generator.py @@ -0,0 +1,58 @@ +import pymongo +import pandas as pd + + +repos_branches = pymongo.MongoClient()["pool"].repos_branches + +repos_branches_data = repos_branches.find() +sitemap_urls = [] + +url = "https://www.mongodb.com/" + +for repo in repos_branches_data: + print(repo["repoName"]) + sitemap_extension = "/sitemap-0.xml" + # Exclude repos we don't care about + if repo["repoName"] in ["docs-404", "docs-meta", "devhub-content", "docs-mongodb-internal", "docs-mongodb-internal-base", "docs-csfle-merge", "docs-k8s-operator", "docs-php-library", "docs-ruby", "docs-mongoid", "mms-docs"]: + continue + if not repo["branches"]: + continue + for branch in repo["branches"]: + if branch["buildsWithSnooty"] == False: #this will be useful once we fix the drivers, mms-docs, k8s maps + sitemap_extension = "/sitemap.xml.gz" + print(branch) + print("branchName: " + branch["gitBranchName"]) + if not branch["active"]: + continue + branch_url_base = url + repo["prefix"]["dotcomprd"] + print("URL BASE:" + branch_url_base) + if "urlSlug" in branch and branch["urlSlug"] is not None: + print("Using urlSlug for the slug") + sitemap_urls.append(branch_url_base + "/" + branch["urlSlug"] + sitemap_extension) + continue + if branch["publishOriginalBranchName"] == True: + print("Using gitBranchName for the slug") + sitemap_urls.append(branch_url_base + "/" + branch["gitBranchName"] + sitemap_extension) + continue + print("I guess this isn't versioned?") + sitemap_urls.append(branch_url_base + sitemap_extension) + + +print(sitemap_urls) + +# Set up DataFrame from the list of URLs + +df = pd.DataFrame(sitemap_urls, columns=["loc"]) + +xml_data = df.to_xml(root_name="sitemapindex", row_name="sitemap", xml_declaration=True) +print(xml_data) + +# Save the XML data to a file +with open("sitemap-index.xml", "w") as file: + file.write(xml_data) + + +## TODO: +# - rewrite v6.0 branch of manual to be manual ratther than v6.0, +# - confirm with ElizabethB how to handle 'aliases' +# - figure out where to put this and how to handle credentials properly From 0be5890e9d48f73c0649bf15ffe84bfea46fb1f7 Mon Sep 17 00:00:00 2001 From: Allison Reinheimer Moore Date: Wed, 26 Jul 2023 15:01:07 -0400 Subject: [PATCH 2/9] now with types and validation and stuff --- sitemap-index/sitemap-index-generator.py | 169 ++++++++++++++++++----- 1 file changed, 134 insertions(+), 35 deletions(-) diff --git a/sitemap-index/sitemap-index-generator.py b/sitemap-index/sitemap-index-generator.py index 651409a..8873322 100644 --- a/sitemap-index/sitemap-index-generator.py +++ b/sitemap-index/sitemap-index-generator.py @@ -1,42 +1,147 @@ import pymongo import pandas as pd +import os +from flutter import check_type, checked +from dataclasses import dataclass +from typing import Optional +@checked +@dataclass +class SitemapUrlSuffix(): + gitBranchName: str + urlSuffix: str + extension: str -repos_branches = pymongo.MongoClient()["pool"].repos_branches +@checked +@dataclass +class Branch(): + gitBranchName: str + active: bool + publishOriginalBranchName: bool + urlSlug: Optional[str] + buildsWithSnooty: bool -repos_branches_data = repos_branches.find() -sitemap_urls = [] +@checked +@dataclass +class Repo(): + repoName: str + branches: list[Branch] | None + prefix: str + baseUrl: str + +class ConstructRepo: + def __init__(self, data) -> None: + self.data = data + + self.repoName: str = data["repoName"] + self.branches = self.get_branches() + self.prefix = self.get_prefix() + self.baseUrl = self.derive_url() + + def get_prefix(self) -> str: + if not check_type(str, self.data["prefix"]["dotcomprd"]): + raise TypeError + return self.data["prefix"]["dotcomprd"] + + def derive_url(self) -> str: + url = "https://www.mongodb.com/" + self.prefix + "/" + return url + + def get_branches(self) -> list[Branch] | None: + if not self.data["branches"]: + self.wonky = True + return None + branch_list: list[Branch] = [] + for branch in self.data["branches"]: + new_branch = Branch(branch["gitBranchName"], + branch.get("active", False), + branch.get("publishOriginalBranchName", False), + branch.get("urlSlug", None), + branch.get("buildsWithSnooty", True)) + branch_list.append(new_branch) + return branch_list + + def export(self) -> Repo: + repo = Repo( + repoName=self.repoName, + branches=self.branches, + prefix=self.prefix, + baseUrl=self.baseUrl + ) + return repo + + +class ConstructSitemapEntry: + def __init__(self, data: Branch) -> None: + self.data = data -url = "https://www.mongodb.com/" + self.gitBranchName: str = data.gitBranchName + self.urlSuffix = self.derive_url_suffix() + self.extension = self.derive_extension() -for repo in repos_branches_data: - print(repo["repoName"]) - sitemap_extension = "/sitemap-0.xml" - # Exclude repos we don't care about - if repo["repoName"] in ["docs-404", "docs-meta", "devhub-content", "docs-mongodb-internal", "docs-mongodb-internal-base", "docs-csfle-merge", "docs-k8s-operator", "docs-php-library", "docs-ruby", "docs-mongoid", "mms-docs"]: + def derive_extension(self) -> str: + if self.data.buildsWithSnooty: + return "/sitemap-0.xml" + return "/sitemap.xml.gz" + + def derive_url_suffix(self) -> str: + urlSuffix: str = "" + if self.data.urlSlug: + urlSuffix = self.data.urlSlug + return urlSuffix + if self.data.publishOriginalBranchName: + urlSuffix = self.gitBranchName + return urlSuffix + return urlSuffix + + def export(self) -> SitemapUrlSuffix: + suffix = SitemapUrlSuffix( + gitBranchName=self.gitBranchName, + urlSuffix=self.urlSuffix, + extension=self.extension + ) + return suffix + +def run_validation(data) -> tuple[bool, str]: + valid = True + if not check_type(str, data["repoName"]): + valid = False + return valid, "No repo name?!" + if not data.get("branches"): + valid = False + return valid, "No branch entry" + if not (data.get("prefix") and data["prefix"].get("dotcomprd")): + valid = False + return valid, "No dotcomprd prefix entry" + return valid, "" + +repos_branches = pymongo.MongoClient(os.environ.get('SNOOTY_CONN_STRING'))["pool"].repos_branches + +repos_branches_data = repos_branches.find() +sitemap_urls: list[str] = [] + +for r in repos_branches_data: + print(r["repoName"]) + validity, message = run_validation(r) + if not validity: + print(message) continue - if not repo["branches"]: + # Skip repos that do not need sitemaps or whose sitemaps are horribly broken because built by legacy tooling + if r["repoName"] in ["docs-404", "docs-meta", "devhub-content", "docs-mongodb-internal", "docs-mongodb-internal-base", "docs-csfle-merge", "docs-k8s-operator", "docs-php-library", "docs-ruby", "docs-mongoid", "mms-docs"]: + print("Skipping") continue - for branch in repo["branches"]: - if branch["buildsWithSnooty"] == False: #this will be useful once we fix the drivers, mms-docs, k8s maps - sitemap_extension = "/sitemap.xml.gz" - print(branch) - print("branchName: " + branch["gitBranchName"]) - if not branch["active"]: - continue - branch_url_base = url + repo["prefix"]["dotcomprd"] - print("URL BASE:" + branch_url_base) - if "urlSlug" in branch and branch["urlSlug"] is not None: - print("Using urlSlug for the slug") - sitemap_urls.append(branch_url_base + "/" + branch["urlSlug"] + sitemap_extension) - continue - if branch["publishOriginalBranchName"] == True: - print("Using gitBranchName for the slug") - sitemap_urls.append(branch_url_base + "/" + branch["gitBranchName"] + sitemap_extension) - continue - print("I guess this isn't versioned?") - sitemap_urls.append(branch_url_base + sitemap_extension) + repo = ConstructRepo(r).export() + if repo.branches: + for b in repo.branches: + if b.active: + print(b.gitBranchName) + sitemap_suffix: SitemapUrlSuffix = ConstructSitemapEntry(b).export() + sitemap_url: str = repo.baseUrl + sitemap_suffix.urlSuffix + sitemap_suffix.extension + print(sitemap_url) + sitemap_urls.append(sitemap_url) + else: + print("Repo has no branches.") print(sitemap_urls) @@ -50,9 +155,3 @@ # Save the XML data to a file with open("sitemap-index.xml", "w") as file: file.write(xml_data) - - -## TODO: -# - rewrite v6.0 branch of manual to be manual ratther than v6.0, -# - confirm with ElizabethB how to handle 'aliases' -# - figure out where to put this and how to handle credentials properly From 5391f2147c4324af7618cd17a3cf0e99aaf2b922 Mon Sep 17 00:00:00 2001 From: Allison Reinheimer Moore Date: Wed, 26 Jul 2023 15:13:27 -0400 Subject: [PATCH 3/9] move excluded list to top of file --- sitemap-index/sitemap-index-generator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sitemap-index/sitemap-index-generator.py b/sitemap-index/sitemap-index-generator.py index 8873322..66633c1 100644 --- a/sitemap-index/sitemap-index-generator.py +++ b/sitemap-index/sitemap-index-generator.py @@ -5,6 +5,10 @@ from dataclasses import dataclass from typing import Optional +excluded_repos = ["docs-404", "docs-meta", "devhub-content", "docs-mongodb-internal", + "docs-mongodb-internal-base", "docs-csfle-merge", "docs-k8s-operator", + "docs-php-library", "docs-ruby", "docs-mongoid", "mms-docs"] + @checked @dataclass class SitemapUrlSuffix(): @@ -127,7 +131,7 @@ def run_validation(data) -> tuple[bool, str]: print(message) continue # Skip repos that do not need sitemaps or whose sitemaps are horribly broken because built by legacy tooling - if r["repoName"] in ["docs-404", "docs-meta", "devhub-content", "docs-mongodb-internal", "docs-mongodb-internal-base", "docs-csfle-merge", "docs-k8s-operator", "docs-php-library", "docs-ruby", "docs-mongoid", "mms-docs"]: + if r["repoName"] in excluded_repos: print("Skipping") continue repo = ConstructRepo(r).export() From 85dee956fe89571f6cfe28b82889c10d64febe52 Mon Sep 17 00:00:00 2001 From: Allison Reinheimer Moore Date: Wed, 26 Jul 2023 15:18:14 -0400 Subject: [PATCH 4/9] add todo --- sitemap-index/sitemap-index-generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sitemap-index/sitemap-index-generator.py b/sitemap-index/sitemap-index-generator.py index 66633c1..805e318 100644 --- a/sitemap-index/sitemap-index-generator.py +++ b/sitemap-index/sitemap-index-generator.py @@ -5,6 +5,7 @@ from dataclasses import dataclass from typing import Optional +# TODO: replace this with a flag in the DB like excludeFromSitemapIndex or the like excluded_repos = ["docs-404", "docs-meta", "devhub-content", "docs-mongodb-internal", "docs-mongodb-internal-base", "docs-csfle-merge", "docs-k8s-operator", "docs-php-library", "docs-ruby", "docs-mongoid", "mms-docs"] From 4edec2ba96e4cae198188a000197d73f5f8f04bf Mon Sep 17 00:00:00 2001 From: Allison Reinheimer Moore Date: Thu, 27 Jul 2023 14:16:06 -0400 Subject: [PATCH 5/9] updated with helis feedback --- sitemap-index/sitemap-index-generator.py | 112 ++++++++++++----------- 1 file changed, 57 insertions(+), 55 deletions(-) diff --git a/sitemap-index/sitemap-index-generator.py b/sitemap-index/sitemap-index-generator.py index 805e318..57e4d6b 100644 --- a/sitemap-index/sitemap-index-generator.py +++ b/sitemap-index/sitemap-index-generator.py @@ -4,6 +4,7 @@ from flutter import check_type, checked from dataclasses import dataclass from typing import Optional +from posixpath import join # TODO: replace this with a flag in the DB like excludeFromSitemapIndex or the like excluded_repos = ["docs-404", "docs-meta", "devhub-content", "docs-mongodb-internal", @@ -12,14 +13,14 @@ @checked @dataclass -class SitemapUrlSuffix(): +class SitemapUrlSuffix: gitBranchName: str urlSuffix: str extension: str @checked @dataclass -class Branch(): +class Branch: gitBranchName: str active: bool publishOriginalBranchName: bool @@ -28,7 +29,7 @@ class Branch(): @checked @dataclass -class Repo(): +class Repo: repoName: str branches: list[Branch] | None prefix: str @@ -49,7 +50,7 @@ def get_prefix(self) -> str: return self.data["prefix"]["dotcomprd"] def derive_url(self) -> str: - url = "https://www.mongodb.com/" + self.prefix + "/" + url = join("https://www.mongodb.com", self.prefix) return url def get_branches(self) -> list[Branch] | None: @@ -86,8 +87,8 @@ def __init__(self, data: Branch) -> None: def derive_extension(self) -> str: if self.data.buildsWithSnooty: - return "/sitemap-0.xml" - return "/sitemap.xml.gz" + return "sitemap-0.xml" + return "sitemap.xml.gz" def derive_url_suffix(self) -> str: urlSuffix: str = "" @@ -108,55 +109,56 @@ def export(self) -> SitemapUrlSuffix: return suffix def run_validation(data) -> tuple[bool, str]: - valid = True if not check_type(str, data["repoName"]): - valid = False - return valid, "No repo name?!" + raise ValueError("No repo name?!") if not data.get("branches"): - valid = False - return valid, "No branch entry" + raise ValueError("No branch entry.") if not (data.get("prefix") and data["prefix"].get("dotcomprd")): - valid = False - return valid, "No dotcomprd prefix entry" - return valid, "" - -repos_branches = pymongo.MongoClient(os.environ.get('SNOOTY_CONN_STRING'))["pool"].repos_branches - -repos_branches_data = repos_branches.find() -sitemap_urls: list[str] = [] - -for r in repos_branches_data: - print(r["repoName"]) - validity, message = run_validation(r) - if not validity: - print(message) - continue - # Skip repos that do not need sitemaps or whose sitemaps are horribly broken because built by legacy tooling - if r["repoName"] in excluded_repos: - print("Skipping") - continue - repo = ConstructRepo(r).export() - - if repo.branches: - for b in repo.branches: - if b.active: - print(b.gitBranchName) - sitemap_suffix: SitemapUrlSuffix = ConstructSitemapEntry(b).export() - sitemap_url: str = repo.baseUrl + sitemap_suffix.urlSuffix + sitemap_suffix.extension - print(sitemap_url) - sitemap_urls.append(sitemap_url) - else: - print("Repo has no branches.") - -print(sitemap_urls) - -# Set up DataFrame from the list of URLs - -df = pd.DataFrame(sitemap_urls, columns=["loc"]) - -xml_data = df.to_xml(root_name="sitemapindex", row_name="sitemap", xml_declaration=True) -print(xml_data) - -# Save the XML data to a file -with open("sitemap-index.xml", "w") as file: - file.write(xml_data) + raise ValueError("No dotcomprd prefix entry") + return + + +def main() -> None: + repos_branches = pymongo.MongoClient(os.environ.get('SNOOTY_CONN_STRING'))["pool"].repos_branches + + repos_branches_data = repos_branches.find() + sitemap_urls: list[str] = [] + + for r in repos_branches_data: + try: + run_validation(r) + except Exception as e: + print(e.args) + + # Skip repos that do not need sitemaps or whose sitemaps are horribly broken because built by legacy tooling + if r["repoName"] in excluded_repos: + print("Skipping") + continue + repo = ConstructRepo(r).export() + + if repo.branches: + for b in repo.branches: + if b.active: + print(b.gitBranchName) + sitemap_suffix = ConstructSitemapEntry(b).export() + sitemap_url = join(repo.baseUrl, sitemap_suffix.urlSuffix, sitemap_suffix.extension) + print(sitemap_url) + sitemap_urls.append(sitemap_url) + else: + print("Repo has no branches.") + + print(sitemap_urls) + + # Set up DataFrame from the list of URLs + + df = pd.DataFrame(sitemap_urls, columns=["loc"]) + + xml_data = df.to_xml(root_name="sitemapindex", row_name="sitemap", xml_declaration=True) + print(xml_data) + + # Save the XML data to a file + with open("sitemap-index.xml", "w") as file: + file.write(xml_data) + +if __name__ == "__main__": + main() From 86bbcf545a7206ac63be702ea8dd9641cf5074cb Mon Sep 17 00:00:00 2001 From: Allison Reinheimer Moore Date: Thu, 27 Jul 2023 16:48:12 -0400 Subject: [PATCH 6/9] add namespace --- sitemap-index/sitemap-index-generator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sitemap-index/sitemap-index-generator.py b/sitemap-index/sitemap-index-generator.py index 57e4d6b..f7cbd48 100644 --- a/sitemap-index/sitemap-index-generator.py +++ b/sitemap-index/sitemap-index-generator.py @@ -117,7 +117,6 @@ def run_validation(data) -> tuple[bool, str]: raise ValueError("No dotcomprd prefix entry") return - def main() -> None: repos_branches = pymongo.MongoClient(os.environ.get('SNOOTY_CONN_STRING'))["pool"].repos_branches @@ -153,7 +152,7 @@ def main() -> None: df = pd.DataFrame(sitemap_urls, columns=["loc"]) - xml_data = df.to_xml(root_name="sitemapindex", row_name="sitemap", xml_declaration=True) + xml_data = df.to_xml(root_name="sitemapindex", namespaces={"": "http://www.sitemaps.org/schemas/sitemap/0.9"}, row_name="sitemap", xml_declaration=True) print(xml_data) # Save the XML data to a file From 1a2a787e3ac80b84dfd427a222a5ffd67bd45d14 Mon Sep 17 00:00:00 2001 From: Allison Reinheimer Moore Date: Thu, 27 Jul 2023 16:53:33 -0400 Subject: [PATCH 7/9] noir --- sitemap-index/sitemap-index-generator.py | 67 +++++++++++++++++------- 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/sitemap-index/sitemap-index-generator.py b/sitemap-index/sitemap-index-generator.py index f7cbd48..f2d43eb 100644 --- a/sitemap-index/sitemap-index-generator.py +++ b/sitemap-index/sitemap-index-generator.py @@ -7,9 +7,20 @@ from posixpath import join # TODO: replace this with a flag in the DB like excludeFromSitemapIndex or the like -excluded_repos = ["docs-404", "docs-meta", "devhub-content", "docs-mongodb-internal", - "docs-mongodb-internal-base", "docs-csfle-merge", "docs-k8s-operator", - "docs-php-library", "docs-ruby", "docs-mongoid", "mms-docs"] +excluded_repos = [ + "docs-404", + "docs-meta", + "devhub-content", + "docs-mongodb-internal", + "docs-mongodb-internal-base", + "docs-csfle-merge", + "docs-k8s-operator", + "docs-php-library", + "docs-ruby", + "docs-mongoid", + "mms-docs", +] + @checked @dataclass @@ -18,6 +29,7 @@ class SitemapUrlSuffix: urlSuffix: str extension: str + @checked @dataclass class Branch: @@ -27,6 +39,7 @@ class Branch: urlSlug: Optional[str] buildsWithSnooty: bool + @checked @dataclass class Repo: @@ -35,10 +48,11 @@ class Repo: prefix: str baseUrl: str + class ConstructRepo: def __init__(self, data) -> None: self.data = data - + self.repoName: str = data["repoName"] self.branches = self.get_branches() self.prefix = self.get_prefix() @@ -59,11 +73,13 @@ def get_branches(self) -> list[Branch] | None: return None branch_list: list[Branch] = [] for branch in self.data["branches"]: - new_branch = Branch(branch["gitBranchName"], - branch.get("active", False), - branch.get("publishOriginalBranchName", False), - branch.get("urlSlug", None), - branch.get("buildsWithSnooty", True)) + new_branch = Branch( + branch["gitBranchName"], + branch.get("active", False), + branch.get("publishOriginalBranchName", False), + branch.get("urlSlug", None), + branch.get("buildsWithSnooty", True), + ) branch_list.append(new_branch) return branch_list @@ -72,7 +88,7 @@ def export(self) -> Repo: repoName=self.repoName, branches=self.branches, prefix=self.prefix, - baseUrl=self.baseUrl + baseUrl=self.baseUrl, ) return repo @@ -99,15 +115,16 @@ def derive_url_suffix(self) -> str: urlSuffix = self.gitBranchName return urlSuffix return urlSuffix - + def export(self) -> SitemapUrlSuffix: suffix = SitemapUrlSuffix( - gitBranchName=self.gitBranchName, - urlSuffix=self.urlSuffix, - extension=self.extension - ) + gitBranchName=self.gitBranchName, + urlSuffix=self.urlSuffix, + extension=self.extension, + ) return suffix - + + def run_validation(data) -> tuple[bool, str]: if not check_type(str, data["repoName"]): raise ValueError("No repo name?!") @@ -117,8 +134,11 @@ def run_validation(data) -> tuple[bool, str]: raise ValueError("No dotcomprd prefix entry") return + def main() -> None: - repos_branches = pymongo.MongoClient(os.environ.get('SNOOTY_CONN_STRING'))["pool"].repos_branches + repos_branches = pymongo.MongoClient(os.environ.get("SNOOTY_CONN_STRING"))[ + "pool" + ].repos_branches repos_branches_data = repos_branches.find() sitemap_urls: list[str] = [] @@ -140,7 +160,9 @@ def main() -> None: if b.active: print(b.gitBranchName) sitemap_suffix = ConstructSitemapEntry(b).export() - sitemap_url = join(repo.baseUrl, sitemap_suffix.urlSuffix, sitemap_suffix.extension) + sitemap_url = join( + repo.baseUrl, sitemap_suffix.urlSuffix, sitemap_suffix.extension + ) print(sitemap_url) sitemap_urls.append(sitemap_url) else: @@ -152,12 +174,19 @@ def main() -> None: df = pd.DataFrame(sitemap_urls, columns=["loc"]) - xml_data = df.to_xml(root_name="sitemapindex", namespaces={"": "http://www.sitemaps.org/schemas/sitemap/0.9"}, row_name="sitemap", xml_declaration=True) + xml_data = df.to_xml( + root_name="sitemapindex", + index=False, + namespaces={"": "http://www.sitemaps.org/schemas/sitemap/0.9"}, + row_name="sitemap", + xml_declaration=True, + ) print(xml_data) # Save the XML data to a file with open("sitemap-index.xml", "w") as file: file.write(xml_data) + if __name__ == "__main__": main() From 5eeab8470cc0deb0f8edd535ae43c85138f13a8f Mon Sep 17 00:00:00 2001 From: Allison Reinheimer Moore Date: Mon, 31 Jul 2023 09:49:51 -0400 Subject: [PATCH 8/9] add comment --- sitemap-index/sitemap-index-generator.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sitemap-index/sitemap-index-generator.py b/sitemap-index/sitemap-index-generator.py index f2d43eb..432addd 100644 --- a/sitemap-index/sitemap-index-generator.py +++ b/sitemap-index/sitemap-index-generator.py @@ -14,11 +14,12 @@ "docs-mongodb-internal", "docs-mongodb-internal-base", "docs-csfle-merge", - "docs-k8s-operator", - "docs-php-library", - "docs-ruby", - "docs-mongoid", - "mms-docs", + "docs-k8s-operator", #broken + "docs-php-library", #broken + "docs-ruby", #broken + "docs-mongoid", #broken + "mms-docs", #broken + "docs-rust" #not yet published ] From 80514203f7ee142cac3c1f038e01a7928fcc499a Mon Sep 17 00:00:00 2001 From: Allison Reinheimer Moore Date: Tue, 20 Feb 2024 14:33:12 -0500 Subject: [PATCH 9/9] bump for updates --- sitemap-index/sitemap-index-generator.py | 93 +++++++++++++++++------- 1 file changed, 65 insertions(+), 28 deletions(-) diff --git a/sitemap-index/sitemap-index-generator.py b/sitemap-index/sitemap-index-generator.py index 432addd..c6f0b4b 100644 --- a/sitemap-index/sitemap-index-generator.py +++ b/sitemap-index/sitemap-index-generator.py @@ -6,23 +6,6 @@ from typing import Optional from posixpath import join -# TODO: replace this with a flag in the DB like excludeFromSitemapIndex or the like -excluded_repos = [ - "docs-404", - "docs-meta", - "devhub-content", - "docs-mongodb-internal", - "docs-mongodb-internal-base", - "docs-csfle-merge", - "docs-k8s-operator", #broken - "docs-php-library", #broken - "docs-ruby", #broken - "docs-mongoid", #broken - "mms-docs", #broken - "docs-rust" #not yet published -] - - @checked @dataclass class SitemapUrlSuffix: @@ -39,6 +22,7 @@ class Branch: publishOriginalBranchName: bool urlSlug: Optional[str] buildsWithSnooty: bool + eolType: Optional[str] @checked @@ -49,6 +33,45 @@ class Repo: prefix: str baseUrl: str +@checked +@dataclass +class DBBranchObj: + """Define the branches object in repos_branches""" + id: any #ObjectId + gitBranchName: str + active: bool + urlAliases: Optional[list[str]] + publishOriginalBranchName: bool + urlSlug: str + versionSelectorLabel: str + isStableBranch: bool + buildsWithSnooty: bool + aliases: Optional[any] + name: Optional[str] + + +@checked +@dataclass +class DBPrefixObj: + """Define the prefixes object in repos_branches""" + stg: str + prd: str + dotcomstg: str + dotcomprd: str + +@checked +@dataclass +class DBRepoObj: + repoName: str + branches: list[DBBranchObj] + prefix: list[DBPrefixObj] + bucket: list[any] #don't care + url: list[any] #don't care + project: str #don't care + search: Optional[list[any]] #don't care + groups: Optional[list[any]] #don't care + displayName: Optional[str] #don't care + _id: any #don't care class ConstructRepo: def __init__(self, data) -> None: @@ -60,9 +83,9 @@ def __init__(self, data) -> None: self.baseUrl = self.derive_url() def get_prefix(self) -> str: - if not check_type(str, self.data["prefix"]["dotcomprd"]): + if not check_type(str, self.data["docset"][0]["prefix"]["dotcomprd"]): raise TypeError - return self.data["prefix"]["dotcomprd"] + return self.data["docset"][0]["prefix"]["dotcomprd"] def derive_url(self) -> str: url = join("https://www.mongodb.com", self.prefix) @@ -80,6 +103,7 @@ def get_branches(self) -> list[Branch] | None: branch.get("publishOriginalBranchName", False), branch.get("urlSlug", None), branch.get("buildsWithSnooty", True), + branch.get("eol_type", None) ) branch_list.append(new_branch) return branch_list @@ -130,9 +154,11 @@ def run_validation(data) -> tuple[bool, str]: if not check_type(str, data["repoName"]): raise ValueError("No repo name?!") if not data.get("branches"): - raise ValueError("No branch entry.") - if not (data.get("prefix") and data["prefix"].get("dotcomprd")): - raise ValueError("No dotcomprd prefix entry") + raise ValueError(f"No branch entry for {data['repoName']}.") + if not (data["docset"].get("prefix") and data["docset"]["prefix"].get("dotcomprd")): + raise ValueError(f"No dotcomprd prefix entry for {data['repoName']}") + if not (data.get("prodDeployable")): + raise ValueError(f"Cannot determine prod deployablility for {data['repoName']}") return @@ -141,7 +167,16 @@ def main() -> None: "pool" ].repos_branches - repos_branches_data = repos_branches.find() + lookup_pipeline = [ + {"$lookup": { + "from": "docsets", + "localField": "_id", + "foreignField": "repos", + "as": "docset" + }} + ] + + repos_branches_data = repos_branches.aggregate(lookup_pipeline) sitemap_urls: list[str] = [] for r in repos_branches_data: @@ -150,15 +185,17 @@ def main() -> None: except Exception as e: print(e.args) - # Skip repos that do not need sitemaps or whose sitemaps are horribly broken because built by legacy tooling - if r["repoName"] in excluded_repos: - print("Skipping") + print(r) + + # Skip repos that do not need sitemaps and br + if r["internalOnly"] or not r["prodDeployable"]: + print(f"Skipping {r['repoName']}") continue repo = ConstructRepo(r).export() if repo.branches: for b in repo.branches: - if b.active: + if b.active and not (b.eolType or b.urlSlug == "upcoming" or b.urlSlug == "beta" or (b.gitBranchName == "master" and b.urlSlug == "master")): print(b.gitBranchName) sitemap_suffix = ConstructSitemapEntry(b).export() sitemap_url = join( @@ -185,7 +222,7 @@ def main() -> None: print(xml_data) # Save the XML data to a file - with open("sitemap-index.xml", "w") as file: + with open("sitemap-index-full.xml", "w") as file: file.write(xml_data)