53 lines
1.9 KiB
Python
53 lines
1.9 KiB
Python
|
|
import requests
|
||
|
|
from bs4 import BeautifulSoup
|
||
|
|
import os
|
||
|
|
|
||
|
|
# Section name -> URL on ishen365.com.
# NOTE(review): these look like the site's listing/index pages (the detail
# pages are declared separately in the script entry point) -- confirm.
SOURCES = {
    "nutrients": "http://www.ishen365.com/index.php/rsyys",
    "scienceteach": "http://www.ishen365.com/index.php/article/scienceteach",
    "wiki": "http://www.ishen365.com/index.php/wiki",
    "doctornews": "http://www.ishen365.com/index.php/article/doctornews",
    "recipe": "http://www.ishen365.com/index.php/article/godkitchen",
}
|
||
|
|
|
||
|
|
# HTTP headers sent with every request; the User-Agent is a desktop Chrome
# browser string rather than the HTTP library's default.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
}
|
||
|
|
|
||
|
|
def analyze_source(name, url, output_dir="scraper"):
    """Fetch *url*, report candidate list containers, and save the raw HTML.

    The page is downloaded with the module-level ``HEADERS``, decoded as
    UTF-8 and parsed with BeautifulSoup. The number of ``<div>`` elements
    whose class contains the substring ``list`` is printed, and the full
    response text is written to ``<output_dir>/source_<name>.html`` for
    manual inspection.

    Args:
        name: Short label for the page; used in log output and in the
            name of the saved file.
        url: Address of the page to download.
        output_dir: Directory that receives the saved HTML. It is created
            if it does not exist yet.

    Returns:
        None. Network/HTTP failures and file-system failures are reported
        on stdout and do not propagate, so a caller looping over several
        pages keeps going after one of them fails.
    """
    print(f"Analyzing {name}: {url}")

    # Only the network call sits in this try block, so an HTTP failure is
    # reported as such and unrelated programming errors are not hidden.
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f" Error: {e}")
        return

    # Decode the body as UTF-8 regardless of the charset requests guessed.
    response.encoding = 'utf-8'
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    # Heuristic probe: count every <div> whose class contains "list".
    # The `x and ...` guard skips elements that have no class attribute.
    list_items = soup.find_all('div', class_=lambda x: x and 'list' in x)
    print(f" Found {len(list_items)} divs with 'list' in class.")

    # Save the whole page for manual inspection. The directory is created
    # here so the function also works when it is not run via the script
    # entry point.
    target = os.path.join(output_dir, f"source_{name}.html")
    try:
        os.makedirs(output_dir, exist_ok=True)
        with open(target, "w", encoding="utf-8") as f:
            f.write(html)
    except OSError as e:
        print(f" Error: {e}")
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Make sure the output directory used by analyze_source exists.
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs("scraper", exist_ok=True)

    # Analyze Detail Pages
    print("\nAnalyzing Detail Pages...")
    # Label -> URL of one sample page per label; the label becomes part of
    # the saved file name.
    details = {
        "nutrient_detail": "http://www.ishen365.com/index.php/article/energy",
        "article_detail": "http://www.ishen365.com/index.php/article/10/show/4330",
        "recipe_detail": "http://www.ishen365.com/index.php/article/18/show/34",
    }
    for name, url in details.items():
        analyze_source(name, url)
|