import logging
import os
import random
import time
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from pathlib import Path
from dotenv import load_dotenv
from playwright.sync_api import Playwright
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright
from slug import slug
load_dotenv()
def scroll_speed():
return random.randint(300, 500)
def scroll_to_end(page):
current_scroll_position, new_height = 0, 1
while current_scroll_position <= new_height:
page.evaluate(f"""() => {{ window.scrollTo(0, {current_scroll_position}); }} """)
new_height = page.evaluate("""() => { return document.body.scrollHeight; } """)
current_scroll_position += scroll_speed()
logging.info(f"current_scroll_position: {current_scroll_position}, new_height: {new_height}")
time.sleep(2)
def urls_to_fetch(input_url: str, input_file: Path) -> list[str]:
if not input_url and not input_file:
logging.error("No input URL or file provided")
raise ValueError("No input URL or file provided")
urls_from_file = input_file.read_text().splitlines() if input_file else []
urls_from_file.append(input_url) if input_url else None
return urls_from_file
def click_on_element(page_action):
try:
el = page_action()
if el:
el.click(timeout=5000)
except PlaywrightTimeoutError as e:
logging.debug(e)
def generate_output_file_name(output_dir, url):
url_as_path = Path(url)
return output_dir.joinpath(f"{slug(f'{url_as_path.parent.name}-{url_as_path.stem}')}.pdf")
def run(playwright: Playwright, args) -> None:
auth_session_file = args.auth_session_file
convert_to_pdf = args.convert_to_pdf
input_url = args.input_url
input_file = args.input_file
urls_from_file = urls_to_fetch(input_url, input_file)
browser = playwright.chromium.launch(headless=False)
for url in urls_from_file:
if not url:
continue
logging.info(f"Processing URL: {url}")
if auth_session_file and Path.cwd().joinpath(auth_session_file).exists():
logging.debug(f"Creating new context with authentication session: {auth_session_file}")
context = browser.new_context(storage_state=auth_session_file)
else:
logging.debug("Creating new context")
context = browser.new_context()
page = context.new_page()
page.goto(url)
try:
page.wait_for_load_state("networkidle")
except PlaywrightTimeoutError as e:
logging.error(f"Timeout waiting for page: {url} to load", e)
continue
click_on_element(lambda: page.get_by_test_id("close-button"))
click_on_element(lambda: page.get_by_role("button", name="Accept all cookies"))
click_on_element(lambda: page.get_by_role("button", name="Accept all"))
page.focus("body")
scroll_to_end(page)
if convert_to_pdf:
output_dir = Path.cwd().joinpath("target/pdfs")
output_dir.mkdir(parents=True, exist_ok=True)
output_file_path = generate_output_file_name(output_dir, url)
page.pdf(path=output_file_path.as_posix(), format="A4")
else:
page.pause()
if input_file:
urls_from_file.remove(url)
output_list = os.linesep.join([str(x) for x in urls_from_file])
input_file.write_text(output_list)
context.close()
def setup_logging(verbosity):
logging_level = logging.WARNING
if verbosity == 1:
logging_level = logging.INFO
elif verbosity >= 2:
logging_level = logging.DEBUG
logging.basicConfig(
handlers=[
logging.StreamHandler(),
],
format="%(asctime)s - %(filename)s:%(lineno)d - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=logging_level,
)
logging.captureWarnings(capture=True)
def parse_args():
parser = ArgumentParser(description=__doc__, formatter_class=RawDescriptionHelpFormatter)
parser.add_argument(
"-v",
"--verbose",
action="count",
default=0,
dest="verbose",
help="Increase verbosity of logging output",
)
parser.add_argument("-f", "--input-file", type=Path, required=False, help="Input file with URLs")
parser.add_argument("-i", "--input-url", type=str, required=False, help="Web Url")
parser.add_argument("-a", "--auth-session-file", type=str, help="Playwright authentication session")
parser.add_argument("-p", "--convert-to-pdf", action="store_true", help="Convert to PDF")
return parser.parse_args()
def main(args):
with sync_playwright() as playwright:
run(playwright, args)
if __name__ == "__main__":
args = parse_args()
setup_logging(args.verbose)
main(args)