# kdrug-inventory-system/dev_scripts/analyze_excel_formats.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Excel 파일 형식 분석 도구
한의사랑과 한의정보 형식 비교
"""

import pandas as pd
import sys
import os

# Append the directory containing this script to the module search path.
# NOTE(review): no project-local import is visible in this file — confirm
# whether this is still needed.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

def analyze_excel_format(file_path, format_name):
    """Load an Excel file and print a structural summary of its contents.

    The file is read with ``pd.read_excel`` using default options. The report
    printed to stdout covers: row/column counts, the column names, each
    column's dtype, the first three rows, per-column missing-value counts and
    per-column distinct-value counts.

    Args:
        file_path: Path of the Excel file to read.
        format_name: Display name of the format, used in the report header.

    Returns:
        The loaded DataFrame, or None if anything in the load/report step
        raised (the error message is printed instead of propagating).
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"📊 {format_name} 형식 분석")
    print(f"파일: {file_path}")
    print(banner)

    try:
        frame = pd.read_excel(file_path)
        columns = frame.columns

        # Section 1: overall size.
        print("\n1️⃣ 기본 정보:")
        print(f"   - 행 개수: {len(frame)}")
        print(f"   - 열 개수: {len(columns)}")

        # Section 2: column names, numbered from 1.
        print("\n2️⃣ 컬럼 목록:")
        for position, column in enumerate(columns, start=1):
            print(f"   {position}. {column}")

        # Section 3: dtype of each column.
        print("\n3️⃣ 데이터 타입:")
        for column in columns:
            column_dtype = frame[column].dtype
            print(f"   - {column}: {column_dtype}")

        # Section 4: preview of the first three rows.
        print("\n4️⃣ 샘플 데이터 (처음 3행):")
        preview = frame.head(3)
        print(preview.to_string(index=False))

        # Section 5: columns that contain missing values.
        print("\n5️⃣ 누락 데이터:")
        missing_per_column = frame.isnull().sum()
        for column in columns:
            missing = missing_per_column[column]
            if missing > 0:
                print(f"   - {column}: {missing}개 누락")
        if missing_per_column.sum() == 0:
            print("   - 누락 데이터 없음")

        # Section 6: number of distinct values per column (informational).
        print("\n6️⃣ 고유값 개수:")
        for column in columns:
            distinct = frame[column].nunique()
            print(f"   - {column}: {distinct}개")
    except Exception as exc:
        # Best-effort tool: report the failure and let the caller carry on.
        print(f"❌ 오류 발생: {exc}")
        return None
    else:
        return frame

# (label, keywords) pairs used to guess which column holds which field.
# A column is a candidate for a label when its name contains any keyword.
# Some keywords are substrings of others (e.g. "량" covers "수량"); the longer
# ones are kept so the intent of each category stays readable.
_MAPPING_KEYWORDS = (
    ("날짜", ("일", "날짜", "date")),
    ("약재명", ("약재", "품목", "제품")),
    ("수량", ("수량", "량", "구입량")),
    ("금액", ("금액", "액", "가격")),
    ("공급업체", ("업체", "도매", "공급")),
    ("원산지", ("원산지", "산지")),
)


def _columns_matching(columns, keywords):
    """Return the columns, in the given order, whose name contains a keyword."""
    matches = []
    for col in columns:
        # Headers read from Excel can be numbers or dates, so match on their
        # text form. Lower-casing lets the ASCII keyword "date" match any case.
        text = str(col).lower()
        if any(keyword in text for keyword in keywords):
            matches.append(col)
    return matches


def compare_formats(df1, df2, name1, name2):
    """Compare the column layouts of two loaded formats and print a report.

    The report lists the columns common to both frames, the columns that
    exist in only one of them, and a keyword-based suggestion of which column
    of ``df1`` corresponds to which column of ``df2`` for each field category
    in ``_MAPPING_KEYWORDS``. For every category the first matching column in
    each frame's own column order is used, so the suggestion is the same on
    every run.

    Args:
        df1: DataFrame of the first format, or None if it failed to load.
        df2: DataFrame of the second format, or None if it failed to load.
        name1: Display name of the first format.
        name2: Display name of the second format.

    Returns:
        None. The report is written to stdout. If either frame is None a
        short notice is printed and nothing is compared.
    """
    print(f"\n{'='*60}")
    print(f"🔄 {name1} vs {name2} 형식 비교")
    print('='*60)

    if df1 is None or df2 is None:
        print("비교할 수 없습니다 (데이터 로드 실패)")
        return

    # Keep the frames' own column order for the mapping step; sets are only
    # used for the membership arithmetic below.
    ordered1 = list(df1.columns)
    ordered2 = list(df2.columns)
    cols1 = set(ordered1)
    cols2 = set(ordered2)

    # Columns present in both formats. Sorting by text form keeps the output
    # stable and avoids a TypeError when labels are of mixed types.
    common = cols1 & cols2
    print(f"\n✅ 공통 컬럼 ({len(common)}개):")
    for col in sorted(common, key=str):
        print(f"   - {col}")

    # Columns present only in the first format.
    only_in_1 = cols1 - cols2
    if only_in_1:
        print(f"\n📌 {name1}에만 있는 컬럼 ({len(only_in_1)}개):")
        for col in sorted(only_in_1, key=str):
            print(f"   - {col}")

    # Columns present only in the second format.
    only_in_2 = cols2 - cols1
    if only_in_2:
        print(f"\n📌 {name2}에만 있는 컬럼 ({len(only_in_2)}개):")
        for col in sorted(only_in_2, key=str):
            print(f"   - {col}")

    # Suggested column-to-column mapping, one line per category for which
    # both formats have at least one candidate column.
    print(f"\n🔗 컬럼 매핑 추천:")

    mappings = []
    for mapping_type, keywords in _MAPPING_KEYWORDS:
        candidates1 = _columns_matching(ordered1, keywords)
        candidates2 = _columns_matching(ordered2, keywords)
        if candidates1 and candidates2:
            mappings.append((candidates1[0], candidates2[0], mapping_type))

    for col1, col2, mapping_type in mappings:
        print(f"   - {mapping_type}: [{name1}]{col1} ↔ [{name2}]{col2}")

def main():
    """Analyze the sample Excel files and print the unified mapping proposal.

    Each known sample file is analyzed if it exists on disk. A missing
    한의사랑 or 한의정보 file is reported; a missing "current" file is skipped
    silently. When both the 한의사랑 and 한의정보 files load successfully their
    column layouts are compared. The standard-column proposal is always
    printed at the end.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("🏥 한약 입고장 Excel 형식 분석기")
    print(rule)

    # (display name, file path, whether to report the file when it is missing)
    sources = (
        ("한의사랑", '/root/kdrug/sample/한의사랑.xlsx', True),
        ("한의정보", '/root/kdrug/sample/한의정보.xlsx', True),
        ("현재 사용 중", '/root/kdrug/sample/order_view_20260215154829.xlsx', False),
    )

    # Analyze every source in order; a source that is missing stays None.
    loaded = {}
    for label, path, report_missing in sources:
        loaded[label] = None
        if os.path.exists(path):
            loaded[label] = analyze_excel_format(path, label)
        elif report_missing:
            print(f"❌ {label} 파일을 찾을 수 없음: {path}")

    # Compare the two vendor formats only when both were loaded.
    first = loaded["한의사랑"]
    second = loaded["한의정보"]
    if not (first is None or second is None):
        compare_formats(first, second, "한의사랑", "한의정보")

    # Unified mapping proposal.
    print(f"\n{rule}")
    print("💡 통합 컬럼 매핑 제안")
    print(rule)

    print("""
시스템에서 사용할 표준 컬럼:
1. insurance_code (보험코드/제품코드)
2. supplier_name (업체명/도매상)
3. herb_name (약재명/품목명)
4. receipt_date (구입일자/입고일)
5. quantity (구입량/수량) - 그램 단위
6. total_amount (구입액/금액)
7. origin_country (원산지)
8. unit_price (단가) - 계산 가능한 경우

각 형식별 매핑 규칙을 자동으로 적용하여
어떤 형식의 Excel 파일도 처리 가능하도록 구현 가능
""")

# Run the analysis only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()