Link checker: display original href rather than resolved URL

This commit is contained in:
rubenwardy
2025-10-05 16:29:53 +01:00
parent a604b3cd09
commit 8bafaed671
2 changed files with 8 additions and 7 deletions

View File

@@ -15,7 +15,6 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from typing import Sequence
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from jinja2.utils import markupsafe
from markdown_it import MarkdownIt
@@ -107,7 +106,7 @@ def get_user_mentions(html: str) -> set:
return set([x.get("data-username") for x in links])
def get_links(html: str, url: str) -> set:
def get_links(html: str) -> set:
soup = BeautifulSoup(html, "html.parser")
links = soup.select("a[href]")
return set([urljoin(url, x.get("href")) for x in links])
return set([x.get("href") for x in links])

View File

@@ -19,7 +19,7 @@ import random
import re
import sys
from time import sleep
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin
from typing import Optional
import requests
@@ -131,6 +131,7 @@ def _url_exists(url: str) -> str:
def _check_for_dead_links(package: Package) -> dict[str, str]:
ignored_urls = set(app.config.get("LINK_CHECKER_IGNORED_URLS", ""))
base_url = package.get_url("packages.view", absolute=True)
links: set[Optional[str]] = {
package.repo,
package.website,
@@ -142,7 +143,7 @@ def _check_for_dead_links(package: Package) -> dict[str, str]:
}
if package.desc:
links.update(get_links(render_markdown(package.desc), package.get_url("packages.view", absolute=True)))
links.update(get_links(render_markdown(package.desc)))
print(f"Checking {package.title} ({len(links)} links) for broken links", file=sys.stderr)
@@ -152,14 +153,15 @@ def _check_for_dead_links(package: Package) -> dict[str, str]:
if link is None:
continue
url = urlparse(link)
abs_link = urljoin(base_url, link)
url = urlparse(abs_link)
if url.scheme != "http" and url.scheme != "https":
continue
if url.hostname in ignored_urls:
continue
res = _url_exists(link)
res = _url_exists(abs_link)
if res != "":
bad_urls[link] = res