97 lines
3.7 KiB
Python
97 lines
3.7 KiB
Python
import requests
|
|
from bs4 import BeautifulSoup
|
|
import random
|
|
|
|
class JokeSource:
    """Base interface for joke providers.

    Subclasses override :meth:`get_name` and :meth:`fetch_joke`.
    """

    def get_name(self):
        """Return the display name of this source."""
        name = "Generic Source"
        return name

    def fetch_joke(self):
        """Returns a single joke string or None.

        The base implementation is a placeholder and always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError()
|
|
|
|
class JednorozecJokeSource(JokeSource):
    """Joke source that scrapes the page at https://vtipy.jednorozec.cz/."""

    def get_name(self):
        """Return the display name of this source."""
        return "vtipy.jednorozec.cz"

    @staticmethod
    def _text_lines(element):
        """Return the stripped, non-empty text lines of *element*.

        ``<br>`` tags are replaced with newlines first (this mutates the
        parsed tree) so that visual line breaks survive ``get_text()``.
        """
        for br in element.find_all("br"):
            br.replace_with("\n")
        stripped = (line.strip() for line in element.get_text().splitlines())
        return [line for line in stripped if line]

    def fetch_joke(self):
        """Download the page and return one randomly chosen joke.

        Returns:
            A joke as a newline-joined string, or ``None`` when no suitable
            text was found on the page.

        Raises:
            requests.RequestException: on network failure, timeout, or an
                HTTP error status (via ``raise_for_status``).
        """
        url = "https://vtipy.jednorozec.cz/"
        # Add a User-Agent to be polite and avoid basic blocking
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; PrintServer/1.0)'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        jokes = []

        # Strategy 1: Look for specific classes often used in blogs/joke sites.
        # The first class name that yields at least one usable text wins.
        potential_classes = ['post', 'entry', 'hentry', 'joke', 'vtip']
        for class_name in potential_classes:
            elements = soup.find_all(
                class_=lambda x: x and class_name in x.split()
            )
            for el in elements:
                lines = self._text_lines(el)
                text = "\n".join(lines)
                # Filter out very short texts (titles, metadata) and cap the
                # number of lines as a safety limit.
                if len(text) > 20 and len(lines) <= 20:
                    jokes.append(text)
            if jokes:
                break

        # Strategy 2: Fallback to all paragraphs if no specific container found
        if not jokes:
            for p in soup.find_all('p'):
                lines = self._text_lines(p)
                text = "\n".join(lines)
                # Assume jokes are somewhat long paragraphs
                if len(text) > 50 and len(lines) <= 20:
                    jokes.append(text)

        if jokes:
            return random.choice(jokes)
        return None
|
|
|
|
class BestPageJokeSource(JokeSource):
    """Joke source that scrapes the page at https://bestpage.cz/vtipy/."""

    def get_name(self):
        """Return the display name of this source."""
        return "bestpage.cz"

    @staticmethod
    def _text_lines(element):
        """Return the stripped, non-empty text lines of *element*.

        ``<br>`` tags are replaced with newlines first (this mutates the
        parsed tree) so that visual line breaks survive ``get_text()``.
        """
        for br in element.find_all("br"):
            br.replace_with("\n")
        stripped = (line.strip() for line in element.get_text().splitlines())
        return [line for line in stripped if line]

    def fetch_joke(self):
        """Download the page and return one randomly chosen joke.

        Returns:
            A joke as a newline-joined string, or ``None`` when no suitable
            text was found on the page.

        Raises:
            requests.RequestException: on network failure, timeout, or an
                HTTP error status (via ``raise_for_status``).
        """
        url = "https://bestpage.cz/vtipy/"
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; PrintServer/1.0)'}
        response = requests.get(url, headers=headers, timeout=10)
        # Fail loudly on 4xx/5xx instead of scraping an error page and
        # returning its text as a "joke".
        response.raise_for_status()

        # The raw bytes are handed to BeautifulSoup, which performs its own
        # charset detection. Setting ``response.encoding`` would have no
        # effect here because it only influences ``response.text``.
        soup = BeautifulSoup(response.content, 'html.parser')

        jokes = []
        seen = set()

        # Bestpage is an older site, often using tables or simple paragraphs
        for el in soup.find_all(['p', 'div', 'td']):
            lines = self._text_lines(el)
            text = "\n".join(lines)

            if 50 < len(text) < 1000 and len(lines) <= 20:
                # Nested wrappers (e.g. a <td> around a single <p>) produce
                # the same text more than once; keep one copy so the random
                # pick is not biased towards deeply nested jokes.
                if text not in seen:
                    seen.add(text)
                    jokes.append(text)

        if jokes:
            return random.choice(jokes)
        return None