kdrug-inventory-system/dev_scripts/analyze_product_code.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
한약재 제품 코드 엑셀 파일 분석
"""

import pandas as pd
import openpyxl

def _print_column_overview(df):
    """Print each column name with its non-null count and up to three sample values."""
    print("\n컬럼 목록:")
    for i, col in enumerate(df.columns, 1):
        non_null_count = df[col].notna().sum()
        sample_values = df[col].dropna().head(3).tolist()
        print(f"  {i}. {col} (유효값: {non_null_count}개)")
        if sample_values:
            print(f"     예시: {sample_values}")


def _print_key_column_stats(df):
    """Print distinct counts and samples for the two key columns, when present."""
    if '주성분코드' in df.columns:
        codes = df['주성분코드']
        print("\n=== 주성분코드 분석 ===")
        print(f"유일한 주성분코드 수: {codes.nunique()}")
        print(f"주성분코드 샘플: {codes.unique()[:10].tolist()}")

    if '제품명' in df.columns:
        names = df['제품명']
        print("\n=== 제품명 분석 ===")
        print(f"유일한 제품 수: {names.nunique()}")
        print(f"제품명 샘플: {names.head(10).tolist()}")


def analyze_excel_file(file_path='sample/(게시)한약재제품코드_2510.xlsx'):
    """Print a summary of the workbook's sheets and of its fourth sheet.

    The sheet names are listed first. If the workbook has at least four
    sheets, the fourth one is loaded with pandas and the following are
    printed: its shape, a per-column overview, the first ten rows, statistics
    for the '주성분코드' and '제품명' columns (when present), and the
    ``DataFrame.info()`` report.

    Args:
        file_path: Path of the Excel workbook to analyze. Defaults to the
            sample file this script was originally written for.

    Returns:
        None. All results are written to standard output.
    """
    # The openpyxl workbook is needed only for the sheet names; close it in a
    # ``finally`` so the read-only file handle is released even on error.
    wb = openpyxl.load_workbook(file_path, read_only=True)
    try:
        sheet_names = list(wb.sheetnames)
    finally:
        wb.close()

    print("=== 엑셀 파일 시트 목록 ===")
    for i, name in enumerate(sheet_names, 1):
        print(f"{i}. {name}")

    # Nothing more to report when the workbook has fewer than four sheets.
    if len(sheet_names) < 4:
        return

    sheet_name = sheet_names[3]  # 0-based index of the fourth sheet
    print(f"\n=== 4번째 시트 '{sheet_name}' 분석 ===")

    df = pd.read_excel(file_path, sheet_name=sheet_name)

    print(f"\n데이터 크기: {df.shape[0]}행 x {df.shape[1]}열")
    _print_column_overview(df)

    print("\n=== 데이터 샘플 (처음 10행) ===")
    # Scope the display options to this print so the process-wide pandas
    # settings are left untouched afterwards.
    with pd.option_context(
        'display.max_columns', None,
        'display.width', None,
        'display.max_colwidth', 50,
    ):
        print(df.head(10))

    _print_key_column_stats(df)

    print("\n=== 데이터 타입 및 null 값 정보 ===")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info()

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    analyze_excel_file()