hn_links.py

#
r"""
Grab links from HN Post and generate Markdown post with image thumbnails
It also creates a Hugo blog post from Markdown and images generated

SUPPORT: To regenerate thumbnail, just delete the image file under thumbnails folder inside the post directory.
SUPPORT: To remove any link from the blog post, delete the entry after the post is created **in the blog directory**
Note down all the links somewhere then run the following command from blog directory to delete them
E.g. Image links will be like

![](/images/2021/12/21/httpsunixstackexchangecoma88682.png)
![](/images/2021/12/21/httpscleaveapp.png)

$ pbpaste | awk -F\/ '{print $6}' | tr -d ')' | while read img; do find . -name $img -delete; done # noqa: W605

Usage:
$ python hn-links.py -l https://news.ycombinator.com/item?id=25381191 -b <blog_directory> --open-in-editor
"""

import logging
import os
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from datetime import datetime
from pathlib import Path
from urllib.parse import parse_qs, urlparse

from bs4 import BeautifulSoup
from py_executable_checklist.workflow import WorkflowBase, run_command, run_workflow

from common_utils import fetch_html_page, html_parser_from

UTF_ENCODING = "utf-8"
#

Common functions

#
def fetch_html(url, post_html_page_file):
    logging.info(f"Fetching HTML title for {url}")
    if post_html_page_file.exists():
        logging.info(f"🌕 Loading page from cache {post_html_page_file}")
        return post_html_page_file.read_text(encoding=UTF_ENCODING)

    page_html = fetch_html_page(url)
    logging.info(f"Caching page {post_html_page_file}")
    post_html_page_file.write_text(page_html, encoding=UTF_ENCODING)
    return page_html
#
def relative_image_directory():
    now = datetime.now()
    year = now.strftime("%Y")
    month = now.strftime("%m")
    day = now.strftime("%d")
    return f"images/{year}/{month}/{day}"
#

Workflow steps

#

Create output folder using Post id in the temporary folder

class CreateOutputFolder(WorkflowBase):
#
    hn_link: str
#
    def run(self, context):
        hn_post_id = parse_qs(urlparse(self.hn_link).query).get("id")[0]
        download_folder = f"{os.getcwd()}/.temp"

        target_folder = Path(download_folder) / hn_post_id
        child_links_folder = target_folder / "links"
        thumbnails_folder = target_folder / "thumbnails"

        for f in [target_folder, child_links_folder, thumbnails_folder]:
            f.mkdir(parents=True, exist_ok=True)
#

output

        context["hn_post_id"] = hn_post_id
        context["target_folder"] = target_folder
        context["child_links_folder"] = child_links_folder
        context["thumbnails_folder"] = thumbnails_folder
#

Use requests to download HTML using a browser user agent

class GrabPostHtml(WorkflowBase):
#
    hn_link: str
    target_folder: str
#
    def run(self, context):
        post_html_page_file = Path(self.target_folder) / "hn_post.html"
        page_html = fetch_html(self.hn_link, post_html_page_file)
#

output

        context["page_html"] = page_html
#

Create BeautifulSoap parser from html

class ParsePostHtml(WorkflowBase):
#
    page_html: str
#
    def run(self, context):
#

output

        context["bs"] = html_parser_from(self.page_html)
#

Extract page title using BeautifulSoap HTML parser

class GrabPostTitle(WorkflowBase):
#
    bs: BeautifulSoup
#
    def run(self, context):
#

output

        context["hn_post_title"] = self.bs.title.string
#

Extract all links

class ExtractAllLinksFromPost(WorkflowBase):
#
    bs: BeautifulSoup
#
    def run(self, context):
        all_links = {link.get("href") for link in self.bs.find_all("a", href=True)}
#

output

        context["all_links"] = all_links
#

Write all links to file so that the next script can read them

class WriteLinksToFile(WorkflowBase):
#
    all_links: set
#
    def run(self, context):
        links_file = Path(context["target_folder"]) / "links.txt"
        links_file.write_text("\n".join(self.all_links), encoding=UTF_ENCODING)

        context["links_file"] = links_file
#

Call other script to download thumbnails and generate Hugo post

class CallLinksToHugoScript(WorkflowBase):
#
    links_file: Path
    hn_post_title: str
    blog_directory: str
#
    def run(self, _):
        cmd = (
            f"./venv/bin/python3 links_to_hugo.py "
            f'--links-file "{self.links_file}" '
            f'--post-title "{self.hn_post_title}" '
            f'--blog-directory "{self.blog_directory}"  '
            f"--open-in-editor"
        )
        run_command(cmd)
#

Workflow definition

#
def workflow_steps():
    return [
        CreateOutputFolder,
        GrabPostHtml,
        ParsePostHtml,
        GrabPostTitle,
        ExtractAllLinksFromPost,
        WriteLinksToFile,
        CallLinksToHugoScript,
    ]
#

Boilerplate

#
def setup_logging():
    logging.basicConfig(
        handlers=[logging.StreamHandler()],
        format="%(asctime)s - %(filename)s:%(lineno)d - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
    )
    logging.captureWarnings(capture=True)
#
def main(args):
    context = args.__dict__
    run_workflow(context, workflow_steps())
#
def parse_args():
    parser = ArgumentParser(description=__doc__, formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-l", "--hn-link", required=True, type=str, help="Link to HN Post")
    parser.add_argument(
        "-b",
        "--blog-directory",
        type=str,
        required=True,
        help="Full path to blog directory",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display context variables at each step",
    )
    return parser.parse_args()


if __name__ == "__main__":
    setup_logging()
    args = parse_args()
    main(args)