Files
msh-system/msh_crmeb_22/scraper/analyze_sources.py

53 lines
1.9 KiB
Python

import requests
from bs4 import BeautifulSoup
import os
# List-page URLs to analyze, keyed by a short source label.
# All sources live under the same site root, so build them from one base.
_BASE = "http://www.ishen365.com/index.php/"
SOURCES = {
    "nutrients": _BASE + "rsyys",
    "scienceteach": _BASE + "article/scienceteach",
    "wiki": _BASE + "wiki",
    "doctornews": _BASE + "article/doctornews",
    "recipe": _BASE + "article/godkitchen",
}
# Desktop-browser User-Agent so the site serves its regular HTML pages.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.114 Safari/537.36"
    ),
}
def analyze_source(name, url, out_dir="scraper"):
    """Fetch *url* and report a rough page structure for scraper development.

    Prints the count of ``<div>`` elements whose class contains ``'list'``
    (the container pattern used by the previous scraper) and dumps the raw
    HTML to ``<out_dir>/source_<name>.html`` for manual selector inspection.

    Args:
        name: Short label for the source; also used in the dump filename.
        url: Page URL to fetch.
        out_dir: Directory to write the HTML dump into (default ``"scraper"``,
            matching the directory created by the ``__main__`` section).
            The directory is assumed to exist.
    """
    print(f"Analyzing {name}: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        # Force UTF-8 before touching .text so the Chinese content decodes
        # correctly regardless of the server's declared charset.
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        # Based on the previous scraper, list items were often in
        # 'div.as_list' or similar — count divs with 'list' in any class.
        list_items = soup.find_all('div', class_=lambda x: x and 'list' in x)
        print(f" Found {len(list_items)} divs with 'list' in class.")
        # Save the full page so selectors can be worked out by hand.
        with open(os.path.join(out_dir, f"source_{name}.html"), "w", encoding="utf-8") as f:
            f.write(response.text)
    except Exception as e:
        # Best-effort analysis tool: report the failure and let the caller
        # continue with the remaining sources.
        print(f" Error: {e}")
if __name__ == "__main__":
if not os.path.exists("scraper"):
os.makedirs("scraper")
# Analyze Detail Pages
print("\nAnalyzing Detail Pages...")
details = {
"nutrient_detail": "http://www.ishen365.com/index.php/article/energy",
"article_detail": "http://www.ishen365.com/index.php/article/10/show/4330",
"recipe_detail": "http://www.ishen365.com/index.php/article/18/show/34"
}
for name, url in details.items():
analyze_source(name, url)