Initial commit: MSH System

- msh_single_uniapp: Vue 2 + UniApp 前端(微信小程序/H5/App/支付宝小程序)
- msh_crmeb_22: Spring Boot 2.2 后端(C端API/管理端/业务逻辑)
- models-integration: AI服务集成(Coze/KieAI/腾讯ASR)
- docs: 产品文档与设计稿
This commit is contained in:
52
msh_crmeb_22/scraper/analyze_sources.py
Normal file
52
msh_crmeb_22/scraper/analyze_sources.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
|
||||
# Listing pages on ishen365.com, keyed by a short label for each content
# category.
SOURCES = {
    "nutrients": "http://www.ishen365.com/index.php/rsyys",
    "scienceteach": "http://www.ishen365.com/index.php/article/scienceteach",
    "wiki": "http://www.ishen365.com/index.php/wiki",
    "doctornews": "http://www.ishen365.com/index.php/article/doctornews",
    "recipe": "http://www.ishen365.com/index.php/article/godkitchen",
}
|
||||
|
||||
# Request headers sent with every fetch; the User-Agent is a desktop Chrome
# browser string.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
}
|
||||
|
||||
def analyze_source(name, url):
    """Fetch *url*, report list-like ``div`` elements, and save the page HTML.

    Prints progress to stdout, counts the ``div`` elements whose class
    contains the substring ``list``, and writes the full response text to
    ``scraper/source_<name>.html`` (relative to the current working
    directory) for manual inspection.

    Args:
        name: Short label for the page; used in the log output and in the
            output file name.
        url: Address of the page to download.

    Returns:
        None. Any exception raised while fetching, parsing or saving is
        caught and reported on stdout, so one failing page does not stop
        the caller from processing the remaining ones.
    """
    print(f"Analyzing {name}: {url}")
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        # Decode the body as UTF-8 rather than the encoding requests inferred.
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        # An earlier scraper found list entries in containers such as
        # 'div.as_list', so count every div whose class mentions 'list'.
        # The `x and` guard skips falsy class values before the substring test.
        list_items = soup.find_all('div', class_=lambda x: x and 'list' in x)
        print(f" Found {len(list_items)} divs with 'list' in class.")

        # Make sure the output directory exists even when this function is
        # called without the script's __main__ setup having run.
        os.makedirs("scraper", exist_ok=True)

        # Save the complete page (not just a snippet) for manual inspection.
        with open(f"scraper/source_{name}.html", "w", encoding="utf-8") as f:
            f.write(response.text)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and carry on.
        print(f" Error: {e}")
|
||||
|
||||
if __name__ == "__main__":
    # Create the output directory up front; exist_ok avoids the
    # check-then-create race of testing os.path.exists() first.
    os.makedirs("scraper", exist_ok=True)

    # NOTE(review): only the detail pages below are analyzed; the SOURCES
    # listing pages are not iterated here - confirm that is intentional.

    # Analyze Detail Pages
    print("\nAnalyzing Detail Pages...")
    details = {
        "nutrient_detail": "http://www.ishen365.com/index.php/article/energy",
        "article_detail": "http://www.ishen365.com/index.php/article/10/show/4330",
        "recipe_detail": "http://www.ishen365.com/index.php/article/18/show/34",
    }
    for name, url in details.items():
        analyze_source(name, url)
|
||||
Reference in New Issue
Block a user