"""Reconnaissance script for ishen365.com: fetch pages, report candidate list
containers, and dump raw HTML into ./scraper/ for manual inspection."""

import os

import requests
from bs4 import BeautifulSoup

# Listing (index) pages for each content category on the site.
SOURCES = {
    "nutrients": "http://www.ishen365.com/index.php/rsyys",
    "scienceteach": "http://www.ishen365.com/index.php/article/scienceteach",
    "wiki": "http://www.ishen365.com/index.php/wiki",
    "doctornews": "http://www.ishen365.com/index.php/article/doctornews",
    "recipe": "http://www.ishen365.com/index.php/article/godkitchen",
}

# Browser-like User-Agent so the site doesn't reject the request as a bot.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}


def analyze_source(name, url):
    """Fetch *url*, print how many <div> elements have 'list' in their class
    (a heuristic for locating item containers), and save the raw HTML to
    scraper/source_<name>.html for manual inspection.

    Errors are reported to stdout rather than raised, so a single failing
    URL does not abort a batch of analyses.
    """
    print(f"Analyzing {name}: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        # Force UTF-8 decoding; the site's declared encoding is presumably
        # unreliable — TODO confirm against actual response headers.
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        # Heuristic from the previous scraper: list items were often in
        # divs with classes like 'as_list', so match any class containing
        # the substring 'list'.
        list_items = soup.find_all('div', class_=lambda x: x and 'list' in x)
        print(f" Found {len(list_items)} divs with 'list' in class.")
        # Ensure the output directory exists even when this function is
        # called directly (not via __main__), avoiding FileNotFoundError.
        os.makedirs("scraper", exist_ok=True)
        # Save the full page for manual inspection of its structure.
        with open(f"scraper/source_{name}.html", "w", encoding="utf-8") as f:
            f.write(response.text)
    except Exception as e:
        # Best-effort: report and continue with the next URL.
        print(f" Error: {e}")


if __name__ == "__main__":
    # exist_ok avoids the check-then-create race of the exists()/makedirs()
    # two-step idiom.
    os.makedirs("scraper", exist_ok=True)

    # Analyze List Pages (previously defined in SOURCES but never used).
    print("Analyzing List Pages...")
    for name, url in SOURCES.items():
        analyze_source(name, url)

    # Analyze Detail Pages
    print("\nAnalyzing Detail Pages...")
    details = {
        "nutrient_detail": "http://www.ishen365.com/index.php/article/energy",
        "article_detail": "http://www.ishen365.com/index.php/article/10/show/4330",
        "recipe_detail": "http://www.ishen365.com/index.php/article/18/show/34",
    }
    for name, url in details.items():
        analyze_source(name, url)