# File: msh-system/msh_crmeb_22/scraper/knowledge_scraper.py
# (Python, 381 lines, 14 KiB)
import requests
from bs4 import BeautifulSoup
import time
import random
import json
import re
import os
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
# Configuration
BASE_URL = "http://www.ishen365.com"        # root of the site being scraped
OUTPUT_FILE = "knowledge_data_insert.sql"   # generated bulk-insert SQL file
LOG_FILE = "knowledge_scraper.log"          # progress / error log
MAX_WORKERS = 10                            # threads per detail-scraping pool
# Lock for file writing (serializes appends to OUTPUT_FILE across worker threads)
file_lock = threading.Lock()
# Desktop-Chrome User-Agent so the site serves its regular HTML layout
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}
def log(msg):
    """Print *msg* with a timestamp prefix and append the same line to LOG_FILE."""
    line = f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {msg}"
    print(line)
    with open(LOG_FILE, "a", encoding="utf-8") as handle:
        handle.write(line + "\n")
def get_soup(url):
    """GET *url* and parse it; return a BeautifulSoup, or None on any failure."""
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
        # Force utf-8 before reading .text; the site's charset detection is unreliable.
        resp.encoding = 'utf-8'
        return BeautifulSoup(resp.text, 'html.parser')
    except Exception as e:
        # Best-effort fetch: log and signal failure with None so callers can skip.
        log(f"Error fetching {url}: {e}")
        return None
def clean_text(text):
    """Trim whitespace and escape quotes/backslashes for use in a SQL literal.

    Falsy input (None, "") yields the empty string.
    """
    if not text:
        return ""
    escaped = text.strip()
    escaped = escaped.replace("'", "''")
    return escaped.replace("\\", "\\\\")
def clean_html(html_content):
    """Stringify an HTML fragment (e.g. a BS4 Tag) and escape it for SQL.

    Falsy input yields the empty string; no whitespace trimming is done.
    """
    if not html_content:
        return ""
    markup = str(html_content)
    return markup.replace("'", "''").replace("\\", "\\\\")
def save_sql(sql):
    """Append one SQL statement to OUTPUT_FILE; the lock serializes writers."""
    with file_lock, open(OUTPUT_FILE, "a", encoding="utf-8") as out:
        out.write(sql + "\n")
def generate_sql(data, table_name='v2_knowledge'):
    """Render one INSERT statement for a scraped item.

    data: item dict carrying title/content/summary/cover_image plus, per
        table, extras such as type / category / nutrient_name.
    table_name: target table — 'v2_knowledge', 'v2_recipes', or
        'v2_community_posts'. Any other value returns "".
    Returns the INSERT statement as a string.
    """
    title = clean_text(data.get('title', ''))
    content = clean_html(data.get('content', ''))
    summary = clean_text(data.get('summary', ''))
    # BUG FIX: cover_image (and the per-table text fields below) were
    # interpolated into the SQL unescaped; a quote or backslash in a scraped
    # URL/value would corrupt the generated statement. Escape every
    # interpolated field consistently with the other text columns.
    cover_image = clean_text(data.get('cover_image', ''))
    if table_name == 'v2_knowledge':
        type_ = clean_text(data.get('type', 'article'))
        category = clean_text(data.get('category', ''))
        nutrient_name = clean_text(data.get('nutrient_name', ''))
        sql = f"""INSERT INTO `v2_knowledge`
(`title`, `content`, `summary`, `cover_image`, `type`, `category`, `nutrient_name`, `status`, `created_at`, `updated_at`)
VALUES
('{title}', '{content}', '{summary}', '{cover_image}', '{type_}', '{category}', '{nutrient_name}', 'published', NOW(), NOW());"""
        return sql
    elif table_name == 'v2_recipes':
        # v2_recipes fields: name, description, cover_image, category, meal_type, ingredients_json, steps_json, status, is_official
        # We map content to description; ingredients/steps are left as empty JSON arrays.
        sql = f"""INSERT INTO `v2_recipes`
(`name`, `description`, `cover_image`, `category`, `ingredients_json`, `steps_json`, `status`, `is_official`, `created_at`, `updated_at`)
VALUES
('{title}', '{content}', '{cover_image}', 'godkitchen', '[]', '[]', 'published', 1, NOW(), NOW());"""
        return sql
    elif table_name == 'v2_community_posts':
        # v2_community_posts fields: title, content, cover_image, user_id, status, audit_status, privacy
        # Posts are attributed to user_id = 1 (System).
        sql = f"""INSERT INTO `v2_community_posts`
(`title`, `content`, `cover_image`, `user_id`, `status`, `audit_status`, `privacy`, `created_at`, `updated_at`)
VALUES
('{title}', '{content}', '{cover_image}', 1, 'published', 'approved', 'public', NOW(), NOW());"""
        return sql
    return ""
def scrape_detail(url, item_data):
    """Fetch the detail page at *url* and attach its main content to *item_data*.

    Returns the enriched dict, or None when the page could not be fetched.
    """
    soup = get_soup(url)
    if not soup:
        return None
    # Articles keep their body in div.txt; recipe (godkitchen) pages and some
    # other layouts use div.content_left instead.
    body = soup.find('div', class_='txt')
    if not body:
        wrapper = soup.find('div', class_='content_left')
        if wrapper:
            # Strip the title/time/button bars so they are not duplicated in content.
            for junk in wrapper.find_all('div', class_=['tit', 'tit2', 'lst_btns']):
                junk.decompose()
            body = wrapper
    item_data['content'] = body if body else "<p>No content extracted.</p>"
    return item_data
def process_item(item, source_type, table_name='v2_knowledge', extra_info=None):
    """Scrape one list item's detail page and append its INSERT to the SQL file.

    Returns True on success, False when an exception was caught and logged.
    """
    try:
        detail = scrape_detail(item['url'], item)
        if detail:
            detail['type'] = source_type
            if extra_info:
                detail.update(extra_info)
            save_sql(generate_sql(detail, table_name))
            return True
    except Exception as e:
        # Worker-thread boundary: never let one bad page kill the pool.
        log(f"Error processing item {item.get('url')}: {e}")
        return False
# ---------------------------------------------------------------------------
# Task Specific Scrapers
# ---------------------------------------------------------------------------
def task2_nutrients():
    """Task 2: scrape nutrient-category articles into v2_knowledge.

    Walks the nutrient index page, collects one link per category
    (the <a> tiles wrapping div.tianchong), lists each category's
    articles, then scrapes every detail page in a thread pool.
    """
    log("Starting Task 2: Nutrients")
    url = "http://www.ishen365.com/index.php/rsyys"
    soup = get_soup(url)
    if not soup:
        return
    nutrients = []
    links = soup.find_all('a')
    for link in links:
        # Category tiles look like <a><div class="tianchong">NAME</div></a>
        div = link.find('div', class_='tianchong')
        if div:
            name = div.get_text(strip=True)
            href = link.get('href')
            if href:
                nutrients.append({'name': name, 'url': urljoin(url, href)})
    log(f"Found {len(nutrients)} nutrient categories.")
    all_items = []
    for nutrient in nutrients:
        log(f"Scanning Nutrient: {nutrient['name']}")
        n_soup = get_soup(nutrient['url'])
        if not n_soup:
            continue
        # Article rows share the Amaze-UI list markup: li.am-g > h3 > a
        list_items = n_soup.find_all('li', class_='am-g')
        for li in list_items:
            h3 = li.find('h3', class_='am-list-item-hd')
            if h3:
                a = h3.find('a')
                if a:
                    title = a.get_text(strip=True)
                    href = urljoin(nutrient['url'], a.get('href'))
                    summary_div = li.find('div', class_='am-list-item-text')
                    summary = summary_div.get_text(strip=True) if summary_div else ""
                    thumb = li.find('div', class_='am-list-item-thumb')
                    img = thumb.find('img') if thumb else None
                    img_src = urljoin(nutrient['url'], img.get('src')) if img else ""
                    all_items.append({
                        'title': title,
                        'url': href,
                        'summary': summary,
                        'cover_image': img_src,
                        'nutrient_name': nutrient['name'],
                        'category': 'nutrients'
                    })
    log(f"Total Nutrient Articles to scrape: {len(all_items)}")
    # Fan out detail-page fetches; each worker writes its own INSERT via save_sql.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(process_item, item, 'nutrients', 'v2_knowledge') for item in all_items]
        for _ in as_completed(futures):
            pass
def scrape_list_items(task):
    """Collect all list-page items for one task definition.

    task keys: 'name', 'url', 'limit' (int cap or None for unlimited),
    and optionally 'start_page' (1-based; >1 appends ?page=N to the URL).
    Returns a list of item dicts (title, url, summary, cover_image,
    category) capped at task['limit'] when set.
    """
    log(f"Processing {task['name']}...")
    all_items = []
    # Determine start page
    start_page = task.get('start_page', 1)
    page_url = f"{task['url']}?page={start_page}" if start_page > 1 else task['url']
    soup = get_soup(page_url)
    if not soup:
        return []
    current_soup = soup
    page_count = start_page
    while True:
        # Stop (and trim any overshoot) once the cap is reached.
        if task['limit'] and len(all_items) >= task['limit']:
            all_items = all_items[:task['limit']]
            break
        # Extract items — Amaze-UI list rows: li.am-g > h3.am-list-item-hd > a
        items_list = current_soup.find_all('li', class_='am-g')
        new_items_found = False
        for li in items_list:
            if task['limit'] and len(all_items) >= task['limit']:
                break
            h3 = li.find('h3', class_='am-list-item-hd')
            if h3:
                a = h3.find('a')
                if a:
                    title = a.get_text(strip=True)
                    href = urljoin(task['url'], a.get('href'))
                    summary_div = li.find('div', class_='am-list-item-text')
                    summary = summary_div.get_text(strip=True) if summary_div else ""
                    thumb = li.find('div', class_='am-list-item-thumb')
                    img = thumb.find('img') if thumb else None
                    img_src = urljoin(task['url'], img.get('src')) if img else ""
                    all_items.append({
                        'title': title,
                        'url': href,
                        'summary': summary,
                        'cover_image': img_src,
                        'category': task['name']
                    })
                    new_items_found = True
        if not new_items_found:
            # Maybe the page is empty, or structure changed
            # If start_page was high (e.g. 21), it might be empty if not that many items.
            log(f" No items found on page {page_count}. Stopping.")
            break
        # Next page
        next_link = current_soup.find('a', string=re.compile(r'下一页|Next'))
        if not next_link:
            break
        next_href = next_link.get('href')
        if not next_href or next_href == '#':
            break
        next_url = urljoin(task['url'], next_href)
        log(f" Fetching next page: {next_url}")
        current_soup = get_soup(next_url)
        if not current_soup:
            break
        page_count += 1
        # Safety break for unlimited tasks
        if not task['limit'] and page_count > 20:
            break
    log(f" Total items collected for {task['name']}: {len(all_items)}")
    return all_items
def task3_articles_guides():
    """Task 3: scrape science articles, wiki guides, and doctor news into v2_knowledge."""
    log("Starting Task 3: Articles & Guides")
    sources = (
        {'name': 'scienceteach', 'url': 'http://www.ishen365.com/index.php/article/scienceteach', 'type': 'article', 'table': 'v2_knowledge', 'limit': None},
        {'name': 'wiki', 'url': 'http://www.ishen365.com/index.php/wiki', 'type': 'guide', 'table': 'v2_knowledge', 'limit': 100},
        {'name': 'doctornews', 'url': 'http://www.ishen365.com/index.php/article/doctornews', 'type': 'article', 'table': 'v2_knowledge', 'limit': 100},
    )
    for source in sources:
        listing = scrape_list_items(source)
        # One pool per source: fetch every detail page concurrently.
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
            pending = [pool.submit(process_item, entry, source['type'], source['table']) for entry in listing]
            for _ in as_completed(pending):
                pass
def task4_recipes():
    """Task 4: scrape 'God Kitchen' recipe videos into v2_recipes.

    Paginates the godkitchen listing (ul.video_list), collecting the
    title, link, and thumbnail of each entry, then scrapes the detail
    pages in a thread pool.
    """
    log("Starting Task 4: Recipes")
    url = "http://www.ishen365.com/index.php/article/godkitchen"
    soup = get_soup(url)
    if not soup:
        return
    all_items = []
    current_soup = soup
    page_count = 1
    while True:
        # Recipe tiles: ul.video_list > li.f_l > a.video (img + span.video_title)
        video_list_ul = current_soup.find('ul', class_='video_list')
        if video_list_ul:
            lis = video_list_ul.find_all('li', class_='f_l')
            for li in lis:
                a = li.find('a', class_='video')
                if a:
                    href = urljoin(url, a.get('href'))
                    img = a.find('img')
                    img_src = urljoin(url, img.get('src')) if img else ""
                    title_span = a.find('span', class_='video_title')
                    title = title_span.get_text(strip=True) if title_span else ""
                    all_items.append({
                        'title': title,
                        'url': href,
                        'cover_image': img_src,
                        'summary': '',
                        'category': 'godkitchen'
                    })
        # Pagination: follow the "next page" link until it disappears or is inert.
        next_link = current_soup.find('a', string=re.compile(r'下一页|Next'))
        if not next_link:
            break
        next_href = next_link.get('href')
        if not next_href or next_href == '#':
            break
        next_url = urljoin(url, next_href)
        log(f" Fetching next page: {next_url}")
        current_soup = get_soup(next_url)
        if not current_soup:
            break
        page_count += 1
        # Safety valve against pagination loops.
        if page_count > 20:
            break
    log(f"Total Recipes to scrape: {len(all_items)}")
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(process_item, item, 'recipe', 'v2_recipes') for item in all_items]
        for _ in as_completed(futures):
            pass
def task5_community():
    """Task 5: repurpose wiki items 201-500 as community posts (user_id = 1)."""
    log("Starting Task 5: Community Posts")
    # The wiki lists 10 items per page, so page 21 starts at item 201;
    # limit 300 then covers the 201-500 range.
    wiki_task = {
        'name': 'community_wiki',
        'url': 'http://www.ishen365.com/index.php/wiki',
        'type': 'article',  # ignored for the community table
        'table': 'v2_community_posts',
        'limit': 300,
        'start_page': 21
    }
    posts = scrape_list_items(wiki_task)
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        jobs = [pool.submit(process_item, post, 'article', 'v2_community_posts') for post in posts]
        for _ in as_completed(jobs):
            pass
def main():
    """Reset the output SQL file, then run every scraping task in order."""
    # Truncate OUTPUT_FILE and write the header + DELETE preamble so the
    # generated script replaces (not appends to) existing rows.
    preamble = [
        "-- Bulk Insert SQL for Knowledge, Recipes, and Community\n",
        "DELETE FROM `v2_knowledge`;\n",
        "DELETE FROM `v2_recipes`;\n",
        "DELETE FROM `v2_community_posts`;\n",
    ]
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.writelines(preamble)
    task2_nutrients()
    task3_articles_guides()
    task4_recipes()
    task5_community()
    log("All tasks completed!")
# Script entry point: run the full scrape only when executed directly.
if __name__ == "__main__":
    main()