# thermoprint-homework/jobs/joke_sources.py
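"""Joke sources for the print jobs.

Each JokeSource subclass scrapes a Czech joke site and returns one joke
string at random, or None when nothing usable is found.
"""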

import random

import requests
from bs4 import BeautifulSoup


class JokeSource:
    def get_name(self):
        return "Generic Source"

    def fetch_joke(self):
        """Return a single joke string, or None if no joke was found."""
        raise NotImplementedError
class JednorozecJokeSource(JokeSource):
    def get_name(self):
        return "vtipy.jednorozec.cz"

    def fetch_joke(self):
        url = "https://vtipy.jednorozec.cz/"
        # Send a User-Agent to be polite and avoid basic bot blocking.
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; PrintServer/1.0)'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        jokes = []
        # Strategy 1: look for CSS classes commonly used on blog/joke sites
        # for the container that holds the joke text.
        potential_classes = ['post', 'entry', 'hentry', 'joke', 'vtip']
        for class_name in potential_classes:
            elements = soup.find_all(class_=lambda x: x and class_name in x.split())
            for el in elements:
                # Preserve line breaks inside the joke, then normalise whitespace.
                for br in el.find_all("br"):
                    br.replace_with("\n")
                lines = [line.strip() for line in el.get_text().splitlines() if line.strip()]
                text = "\n".join(lines)
                # Filter out very short texts (titles, metadata) and cap the
                # line count as a safety limit.
                if len(text) > 20 and len(lines) <= 20:
                    jokes.append(text)
            if jokes:
                break

        # Strategy 2: fall back to all paragraphs if no specific container matched.
        if not jokes:
            for p in soup.find_all('p'):
                for br in p.find_all("br"):
                    br.replace_with("\n")
                lines = [line.strip() for line in p.get_text().splitlines() if line.strip()]
                text = "\n".join(lines)
                # Assume jokes are reasonably long paragraphs.
                if len(text) > 50 and len(lines) <= 20:
                    jokes.append(text)

        return random.choice(jokes) if jokes else None
class BestPageJokeSource(JokeSource):
    def get_name(self):
        return "bestpage.cz"

    def fetch_joke(self):
        url = "https://bestpage.cz/vtipy/"
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; PrintServer/1.0)'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Older sites often use windows-1250 or iso-8859-2; trust the detected
        # encoding and parse the decoded text rather than the raw bytes
        # (setting response.encoding only affects response.text).
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        jokes = []
        # bestpage.cz is an older site that lays jokes out in tables and
        # plain paragraphs.
        for el in soup.find_all(['p', 'div', 'td']):
            for br in el.find_all("br"):
                br.replace_with("\n")
            lines = [line.strip() for line in el.get_text().splitlines() if line.strip()]
            text = "\n".join(lines)
            if 50 < len(text) < 1000 and len(lines) <= 20:
                jokes.append(text)

        return random.choice(jokes) if jokes else None
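

# Minimal command-line smoke test (a sketch; the real print jobs presumably
# consume these sources elsewhere): fetch one joke from each source and
# print it, so the scrapers can be exercised by hand.
if __name__ == "__main__":
    for source in (JednorozecJokeSource(), BestPageJokeSource()):
        print(f"--- {source.get_name()} ---")
        try:
            joke = source.fetch_joke()
        except Exception as exc:  # network/parse errors propagate from fetch_joke()
            print(f"failed: {exc}")
            continue
        print(joke if joke else "(no joke found)")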