Files
msh-system/msh_crmeb_22/scraper/analyze_sources.py

53 lines
1.9 KiB
Python

import requests
from bs4 import BeautifulSoup
import os
# List-page URLs to analyze, keyed by a short source label.
# All sources live under the same site root, so build them from one base.
_BASE = "http://www.ishen365.com/index.php/"
SOURCES = {
    "nutrients": _BASE + "rsyys",
    "scienceteach": _BASE + "article/scienceteach",
    "wiki": _BASE + "wiki",
    "doctornews": _BASE + "article/doctornews",
    "recipe": _BASE + "article/godkitchen",
}
# Desktop-browser User-Agent so the site serves its regular HTML pages.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.114 Safari/537.36"
    ),
}
def analyze_source(name, url, out_dir="scraper"):
    """Fetch *url* and report a rough page structure for scraper development.

    Prints the count of ``<div>`` elements whose class contains ``'list'``
    (the container pattern used by the previous scraper) and dumps the raw
    HTML to ``<out_dir>/source_<name>.html`` for manual selector inspection.

    Args:
        name: Short label for the source; also used in the dump filename.
        url: Page URL to fetch.
        out_dir: Directory to write the HTML dump into (default ``"scraper"``,
            matching the directory created by the ``__main__`` section).
            The directory is assumed to exist.
    """
    print(f"Analyzing {name}: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        # Force UTF-8 before touching .text so the Chinese content decodes
        # correctly regardless of the server's declared charset.
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        # Based on the previous scraper, list items were often in
        # 'div.as_list' or similar — count divs with 'list' in any class.
        list_items = soup.find_all('div', class_=lambda x: x and 'list' in x)
        print(f" Found {len(list_items)} divs with 'list' in class.")
        # Save the full page so selectors can be worked out by hand.
        with open(os.path.join(out_dir, f"source_{name}.html"), "w", encoding="utf-8") as f:
            f.write(response.text)
    except Exception as e:
        # Best-effort analysis tool: report the failure and let the caller
        # continue with the remaining sources.
        print(f" Error: {e}")
if __name__ == "__main__":
if not os.path.exists("scraper"):
os.makedirs("scraper")
# Analyze Detail Pages
print("\nAnalyzing Detail Pages...")
details = {
"nutrient_detail": "http://www.ishen365.com/index.php/article/energy",
"article_detail": "http://www.ishen365.com/index.php/article/10/show/4330",
"recipe_detail": "http://www.ishen365.com/index.php/article/18/show/34"
}
for name, url in details.items():
analyze_source(name, url)