# thermoprint-homework/jobs/joke_sources.py
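"""Joke sources for the print jobs.

Each JokeSource subclass scrapes a Czech joke site and returns one joke
string at random, or None when nothing usable is found.
"""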

import random

import requests
from bs4 import BeautifulSoup


class JokeSource:
    def get_name(self):
        return "Generic Source"

    def fetch_joke(self):
        """Return a single joke string, or None if no joke was found."""
        raise NotImplementedError
class JednorozecJokeSource(JokeSource):
    def get_name(self):
        return "vtipy.jednorozec.cz"

    def fetch_joke(self):
        url = "https://vtipy.jednorozec.cz/"
        # Send a User-Agent to be polite and avoid basic bot blocking.
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; PrintServer/1.0)'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        jokes = []
        # Strategy 1: look for CSS classes commonly used on blog/joke sites
        # for the container that holds the joke text.
        potential_classes = ['post', 'entry', 'hentry', 'joke', 'vtip']
        for class_name in potential_classes:
            elements = soup.find_all(class_=lambda x: x and class_name in x.split())
            for el in elements:
                # Preserve line breaks inside the joke, then normalise whitespace.
                for br in el.find_all("br"):
                    br.replace_with("\n")
                lines = [line.strip() for line in el.get_text().splitlines() if line.strip()]
                text = "\n".join(lines)
                # Filter out very short texts (titles, metadata) and cap the
                # line count as a safety limit.
                if len(text) > 20 and len(lines) <= 20:
                    jokes.append(text)
            if jokes:
                break

        # Strategy 2: fall back to all paragraphs if no specific container matched.
        if not jokes:
            for p in soup.find_all('p'):
                for br in p.find_all("br"):
                    br.replace_with("\n")
                lines = [line.strip() for line in p.get_text().splitlines() if line.strip()]
                text = "\n".join(lines)
                # Assume jokes are reasonably long paragraphs.
                if len(text) > 50 and len(lines) <= 20:
                    jokes.append(text)

        return random.choice(jokes) if jokes else None
class BestPageJokeSource(JokeSource):
    def get_name(self):
        return "bestpage.cz"

    def fetch_joke(self):
        url = "https://bestpage.cz/vtipy/"
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; PrintServer/1.0)'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Older sites often use windows-1250 or iso-8859-2; trust the detected
        # encoding and parse the decoded text rather than the raw bytes
        # (setting response.encoding only affects response.text).
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        jokes = []
        # bestpage.cz is an older site that lays jokes out in tables and
        # plain paragraphs.
        for el in soup.find_all(['p', 'div', 'td']):
            for br in el.find_all("br"):
                br.replace_with("\n")
            lines = [line.strip() for line in el.get_text().splitlines() if line.strip()]
            text = "\n".join(lines)
            if 50 < len(text) < 1000 and len(lines) <= 20:
                jokes.append(text)

        return random.choice(jokes) if jokes else None
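

# Minimal command-line smoke test (a sketch; the real print jobs presumably
# consume these sources elsewhere): fetch one joke from each source and
# print it, so the scrapers can be exercised by hand.
if __name__ == "__main__":
    for source in (JednorozecJokeSource(), BestPageJokeSource()):
        print(f"--- {source.get_name()} ---")
        try:
            joke = source.fetch_joke()
        except Exception as exc:  # network/parse errors propagate from fetch_joke()
            print(f"failed: {exc}")
            continue
        print(joke if joke else "(no joke found)")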