docs: record sqszx202 data cleanup

This commit is contained in:
danaisuiyuan
2026-06-14 09:21:04 +08:00
parent 01e373faf6
commit c3f2494243
3 changed files with 473 additions and 0 deletions

5
.gitignore vendored
View File

@@ -37,3 +37,8 @@ deploy/docker/scripts/server.env
deploy/docker/step1-integral/.env
deploy/docker/step1-integral/houtai.env
deploy/docker/step2-single-shop/.env
# Local migration backups and generated Python cache
docs/sql/backups/
__pycache__/
**/__pycache__/

View File

@@ -0,0 +1,80 @@
# 公司名称:宿迁盛泽鑫商贸
- host ip: 59.110.91.202
## mysql数据库配置信息
- datasource:
rds: rm-bp1a178eq62lxba9xbo.mysql.rds.aliyuncs.com
name: sqszx202
username: yangtangyoupin
password: 5Fn8eWrbYFtAhCZw
## 数据清理任务
- **用户数据范围**`wa_users.id` / `eb_user.uid` 保留名单:
`93164, 93133, 93132, 93113, 93216, 93230, 93238, 93258, 93277, 93283, 93284, 93291, 93293, 93308, 93299, 93290, 93280, 93279, 93255, 93252, 93251, 93250, 93249, 93248, 93223, 93221, 93111, 93099, 93263, 93265, 93272, 93270, 93269, 93268, 93267, 93266, 93276, 93278, 93286, 93296, 93309, 93281, 93275, 93271, 93139, 93264, 93247, 93218, 93217, 93194, 93142`
- 保留wa_users表中id在用户id数据范围的 ,删除其余用户数据
- 保留eb_user表中uid在用户id数据范围的 ,删除其余用户数据
- wa_order
清空wa_order表中数据
- wa_merchandise
保留数据库中“created_at >= 2026-06-12”并且seller_id或buyer_id在用户id数据范围的寄售商品删除其余数据
(当前库表字段为 `user_id` 表示卖家,实现时按 `user_id` 与日期条件过滤。)
- wa_selfbonus_log
只保留 `user_id` 在用户id数据范围内的记录删除其余数据
- wa_sharebonus_log
只保留 `user_id` 在用户id数据范围内的记录删除其余数据
- wa_coupon_log
只保留 `user_id` 在用户id数据范围内的记录删除其余数据
- wa_withdraw
清空wa_withdraw表中数据
- eb_store_order
清空eb_store_order表中数据
- eb_user_integral_record
只保留用户在名单内的记录;表字段为 `uid`(与 `wa_users.id` / `eb_user.uid` 对应),实现按 `uid` 过滤。
## 执行结果
- 已于 **2026-06-14** 按当前保留名单执行清理并 `COMMIT`
- 执行脚本:`docs/sql/run_com_sqszx202_cleanup.py`
- 执行前备份:`docs/sql/backups/sqszx202_cleanup_before_20260614_090658.sql.gz`(已通过 `gzip -t` 校验)
- `wa_merchandise` 从源 dump 解析结果dump 中共 3156 行,满足 `created_at >= 2026-06-12``user_id` 在名单内的保留商品为 54 行。
- 删除行数:
- `wa_order`3168
- `wa_withdraw`171
- `eb_store_order`348
- `wa_merchandise`3102
- `wa_selfbonus_log`2255
- `wa_sharebonus_log`2514
- `wa_coupon_log`208
- `eb_user_integral_record`2343
- `eb_user`79
- `wa_users`79
- 保留后行数:
- `wa_order`0
- `wa_withdraw`0
- `eb_store_order`0
- `wa_merchandise`54
- `wa_selfbonus_log`1644
- `wa_sharebonus_log`1791
- `wa_coupon_log`8
- `eb_user_integral_record`1993
- `eb_user`51
- `wa_users`51
- 复核:清理后再次 dry-run以上表剩余待删除行数均为 0。
## 相关文件
- 新公司初始会员信息: '/Users/mac/Works26/miao-july/宿迁盛泽鑫/盛泽鑫团队成员信息表.xlsx'
- 源数据dump文件 '/Users/mac/Works26/miao-july/宿迁盛泽鑫/anpengran-yangtangyoupin_2026-06-14_02-15-02_mysql_data.sql'

View File

@@ -0,0 +1,388 @@
#!/usr/bin/env python3
"""Run data cleanup for docs/com-sqszx202-data-imgration.md.
Default mode is a read-only dry run. Use --execute to create a local SQL backup,
delete rows according to the migration document, and commit the transaction.
"""
from __future__ import annotations
import argparse
import gzip
import json
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Any
import pymysql
from pymysql.cursors import SSCursor
ROOT = Path(__file__).resolve().parents[2]
DOC = ROOT / "docs" / "com-sqszx202-data-imgration.md"
DEFAULT_DUMP = Path("/Users/mac/Works26/miao-july/宿迁盛泽鑫/anpengran-yangtangyoupin_2026-06-14_02-15-02_mysql_data.sql")
CUTOFF = "2026-06-12 00:00:00"
TABLES = [
"wa_order",
"wa_withdraw",
"eb_store_order",
"wa_merchandise",
"wa_selfbonus_log",
"wa_sharebonus_log",
"wa_coupon_log",
"eb_user_integral_record",
"eb_user",
"wa_users",
]
FILTERS = {
"wa_users": "id",
"eb_user": "uid",
"wa_selfbonus_log": "user_id",
"wa_sharebonus_log": "user_id",
"wa_coupon_log": "user_id",
"eb_user_integral_record": "uid",
}
CLEAR_TABLES = ["wa_order", "wa_withdraw", "eb_store_order"]
def parse_doc() -> tuple[dict[str, str], list[int]]:
text = DOC.read_text(encoding="utf-8")
def grab(name: str) -> str:
m = re.search(rf"^\s*{name}:\s*(.+?)\s*$", text, flags=re.M)
if not m:
raise ValueError(f"missing datasource {name} in {DOC}")
return m.group(1).strip()
ids_match = re.search(r"保留名单:\s*\n\s*`([^`]+)`", text)
if not ids_match:
raise ValueError(f"missing user id keep list in {DOC}")
ids = [int(x.strip()) for x in ids_match.group(1).split(",") if x.strip()]
if len(ids) != len(set(ids)):
raise ValueError("duplicate ids in keep list")
config = {
"host": grab("rds"),
"database": grab("name"),
"user": grab("username"),
"password": grab("password"),
}
return config, ids
def split_top_level_tuples(values_blob: str) -> list[str]:
out: list[str] = []
i = 0
n = len(values_blob)
while i < n:
if values_blob[i] != "(":
i += 1
continue
depth = 0
in_quote = False
start = i
j = i
while j < n:
c = values_blob[j]
if in_quote:
if c == "\\":
j += 2
continue
if c == "'":
if j + 1 < n and values_blob[j + 1] == "'":
j += 2
continue
in_quote = False
j += 1
continue
if c == "'":
in_quote = True
elif c == "(":
depth += 1
elif c == ")":
depth -= 1
if depth == 0:
out.append(values_blob[start : j + 1])
j += 1
break
j += 1
i = j
return out
def split_mysql_fields(inner: str) -> list[str]:
out: list[str] = []
cur: list[str] = []
i = 0
n = len(inner)
while i < n:
c = inner[i]
if c == "'":
cur.append(c)
i += 1
while i < n:
c = inner[i]
cur.append(c)
if c == "\\":
if i + 1 < n:
cur.append(inner[i + 1])
i += 2
continue
if c == "'":
if i + 1 < n and inner[i + 1] == "'":
cur.append(inner[i + 1])
i += 2
continue
i += 1
break
i += 1
continue
if c == ",":
out.append("".join(cur).strip())
cur = []
i += 1
continue
cur.append(c)
i += 1
out.append("".join(cur).strip())
return out
def unquote_sql_string(raw: str) -> str:
raw = raw.strip()
if raw.startswith("'") and raw.endswith("'"):
body = raw[1:-1]
body = body.replace("''", "'")
body = body.replace("\\'", "'").replace("\\\\", "\\")
return body
return raw
def extract_wa_merchandise_keep_ids(dump: Path, keep_users: set[int]) -> list[int]:
if not dump.is_file():
raise FileNotFoundError(f"dump not found: {dump}")
keep_ids: set[int] = set()
total_rows = 0
insert_lines = 0
with dump.open("r", encoding="utf-8", errors="replace") as f:
for line in f:
if "INSERT INTO `wa_merchandise`" not in line or "VALUES" not in line:
continue
insert_lines += 1
blob = line[line.index("VALUES") + len("VALUES") :].strip()
if blob.endswith(";"):
blob = blob[:-1].strip()
for tup in split_top_level_tuples(blob):
total_rows += 1
fields = split_mysql_fields(tup.strip()[1:-1])
if len(fields) < 10:
raise ValueError(f"malformed wa_merchandise tuple: {tup[:120]}")
row_id = int(fields[0])
user_id = int(fields[2])
created_at = unquote_sql_string(fields[8])
if created_at >= CUTOFF and user_id in keep_users:
keep_ids.add(row_id)
if insert_lines == 0:
raise ValueError("no INSERT INTO `wa_merchandise` found in dump")
print(f"dump_wa_merchandise_rows={total_rows} keep_by_rule={len(keep_ids)}")
return sorted(keep_ids)
def connect(config: dict[str, str], cursorclass=None):
kwargs = {
"host": config["host"],
"user": config["user"],
"password": config["password"],
"database": config["database"],
"charset": "utf8mb4",
"autocommit": False,
"read_timeout": 120,
"write_timeout": 120,
}
if cursorclass is not None:
kwargs["cursorclass"] = cursorclass
return pymysql.connect(**kwargs)
def placeholders(items: list[int] | set[int]) -> str:
if not items:
return "NULL"
return ",".join(["%s"] * len(items))
def count_where(cur, table: str, where: str = "1=1", params: tuple[Any, ...] = ()) -> int:
cur.execute(f"SELECT COUNT(*) FROM `{table}` WHERE {where}", params)
return int(cur.fetchone()[0])
def inspect_schema(cur) -> None:
cur.execute("SELECT DATABASE()")
database = cur.fetchone()[0]
if database != "sqszx202":
raise RuntimeError(f"refusing to run against database {database!r}")
for table in TABLES:
cur.execute("SHOW TABLES LIKE %s", (table,))
if not cur.fetchone():
raise RuntimeError(f"missing table `{table}`")
required = {
"wa_users": {"id"},
"eb_user": {"uid"},
"wa_order": set(),
"wa_withdraw": set(),
"eb_store_order": set(),
"wa_merchandise": {"id", "user_id", "created_at"},
"wa_selfbonus_log": {"user_id"},
"wa_sharebonus_log": {"user_id"},
"wa_coupon_log": {"user_id"},
"eb_user_integral_record": {"uid"},
}
for table, cols in required.items():
if not cols:
continue
cur.execute(f"SHOW COLUMNS FROM `{table}`")
actual = {row[0] for row in cur.fetchall()}
missing = cols - actual
if missing:
raise RuntimeError(f"table `{table}` missing columns: {sorted(missing)}")
def collect_counts(cur, keep_users: list[int], keep_merchandise_ids: list[int]) -> dict[str, dict[str, int]]:
counts: dict[str, dict[str, int]] = {}
user_clause = placeholders(keep_users)
merch_clause = placeholders(keep_merchandise_ids)
for table in CLEAR_TABLES:
total = count_where(cur, table)
counts[table] = {"before": total, "keep": 0, "delete": total}
for table, col in FILTERS.items():
total = count_where(cur, table)
keep = count_where(cur, table, f"`{col}` IN ({user_clause})", tuple(keep_users))
counts[table] = {"before": total, "keep": keep, "delete": total - keep}
total = count_where(cur, "wa_merchandise")
keep = (
count_where(cur, "wa_merchandise", f"`id` IN ({merch_clause})", tuple(keep_merchandise_ids))
if keep_merchandise_ids
else 0
)
counts["wa_merchandise"] = {"before": total, "keep": keep, "delete": total - keep}
return counts
def print_counts(title: str, counts: dict[str, dict[str, int]]) -> None:
print(title)
for table in TABLES:
c = counts[table]
print(f" {table}: before={c['before']} keep={c['keep']} delete={c['delete']}")
def backup_tables(config: dict[str, str], backup_path: Path) -> None:
backup_path.parent.mkdir(parents=True, exist_ok=True)
backup_conn = connect(config, cursorclass=SSCursor)
try:
with gzip.open(backup_path, "wt", encoding="utf-8") as out:
out.write("-- sqszx202 cleanup backup\n")
out.write(f"-- created_at: {datetime.now().isoformat(timespec='seconds')}\n")
out.write("SET NAMES utf8mb4;\n")
for table in TABLES:
with backup_conn.cursor() as cur:
cur.execute(f"SHOW CREATE TABLE `{table}`")
row = cur.fetchone()
create_sql = row[1]
out.write(f"\n-- Table `{table}`\n")
out.write(f"DROP TABLE IF EXISTS `{table}`;\n")
out.write(create_sql + ";\n")
with backup_conn.cursor() as cur:
cur.execute(f"SELECT * FROM `{table}`")
batch: list[str] = []
row_count = 0
for row in cur:
batch.append("(" + ",".join(backup_conn.literal(v) for v in row) + ")")
row_count += 1
if len(batch) >= 200:
out.write(f"INSERT INTO `{table}` VALUES\n")
out.write(",\n".join(batch) + ";\n")
batch = []
if batch:
out.write(f"INSERT INTO `{table}` VALUES\n")
out.write(",\n".join(batch) + ";\n")
print(f"backup {table}: rows={row_count}")
finally:
backup_conn.close()
def execute_cleanup(cur, keep_users: list[int], keep_merchandise_ids: list[int]) -> dict[str, int]:
user_clause = placeholders(keep_users)
merch_clause = placeholders(keep_merchandise_ids)
deleted: dict[str, int] = {}
cur.execute("SET FOREIGN_KEY_CHECKS = 0")
for table in CLEAR_TABLES:
cur.execute(f"DELETE FROM `{table}`")
deleted[table] = cur.rowcount
cur.execute(f"DELETE FROM `wa_merchandise` WHERE `id` NOT IN ({merch_clause})", tuple(keep_merchandise_ids))
deleted["wa_merchandise"] = cur.rowcount
for table in ["wa_selfbonus_log", "wa_sharebonus_log", "wa_coupon_log"]:
cur.execute(f"DELETE FROM `{table}` WHERE `user_id` NOT IN ({user_clause})", tuple(keep_users))
deleted[table] = cur.rowcount
cur.execute(f"DELETE FROM `eb_user_integral_record` WHERE `uid` NOT IN ({user_clause})", tuple(keep_users))
deleted["eb_user_integral_record"] = cur.rowcount
cur.execute(f"DELETE FROM `eb_user` WHERE `uid` NOT IN ({user_clause})", tuple(keep_users))
deleted["eb_user"] = cur.rowcount
cur.execute(f"DELETE FROM `wa_users` WHERE `id` NOT IN ({user_clause})", tuple(keep_users))
deleted["wa_users"] = cur.rowcount
cur.execute("SET FOREIGN_KEY_CHECKS = 1")
return deleted
def main() -> int:
parser = argparse.ArgumentParser()
parser.add_argument("--execute", action="store_true", help="perform DELETEs and COMMIT")
parser.add_argument("--dump", type=Path, default=DEFAULT_DUMP)
parser.add_argument("--backup-dir", type=Path, default=ROOT / "docs" / "sql" / "backups")
args = parser.parse_args()
config, keep_users = parse_doc()
keep_merchandise_ids = extract_wa_merchandise_keep_ids(args.dump, set(keep_users))
print(f"keep_user_ids={len(keep_users)} keep_wa_merchandise_ids={len(keep_merchandise_ids)}")
conn = connect(config)
try:
with conn.cursor() as cur:
inspect_schema(cur)
before = collect_counts(cur, keep_users, keep_merchandise_ids)
print_counts("before_counts", before)
if not args.execute:
print("dry_run_only=true")
conn.rollback()
return 0
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = args.backup_dir / f"sqszx202_cleanup_before_{stamp}.sql.gz"
print(f"backup_path={backup_path}")
backup_tables(config, backup_path)
with conn.cursor() as cur:
deleted = execute_cleanup(cur, keep_users, keep_merchandise_ids)
after = collect_counts(cur, keep_users, keep_merchandise_ids)
print("deleted_rows")
print(json.dumps(deleted, ensure_ascii=False, indent=2))
print_counts("after_counts_before_commit", after)
conn.commit()
print("COMMIT ok")
with conn.cursor() as cur:
final_counts = collect_counts(cur, keep_users, keep_merchandise_ids)
print_counts("final_counts", final_counts)
return 0
except Exception as e:
try:
with conn.cursor() as cur:
cur.execute("SET FOREIGN_KEY_CHECKS = 1")
except Exception:
pass
conn.rollback()
print(f"ROLLBACK: {e}", file=sys.stderr)
raise
finally:
conn.close()
if __name__ == "__main__":
raise SystemExit(main())