Python(25)- LightGBM

두설날 2024. 6. 19. 08:49

*이 글을 읽기전에 작성자 개인의견이 있으니, 다른 블로그와 교차로 읽는것을 권장합니다.*

1. credit 데이터셋

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the credit CSV into a DataFrame.
# NOTE(review): the path looks like a Colab-mounted Google Drive location — adjust when running elsewhere.
credit_df = pd.read_csv('/content/drive/MyDrive/KDT/6. 머신러닝과 딥러닝/Data/credit.csv')
credit_df

# Print column names, dtypes and non-null counts.
credit_df.info()

ID: 고유 식별자
Customer_ID: 고객 ID
Name: 이름
Age: 나이
SSN: 사회보장번호(Social Security Number, 한국의 주민등록번호에 해당)
Occupation: 직업
Annual_Income: 연간 소득
Num_Bank_Accounts: 은행 계좌 수
Num_Credit_Card: 신용 카드 수
Interest_Rate: 이자율
Num_of_Loan: 대출 수
Type_of_Loan: 대출 유형
Delay_from_due_date: 마감일로부터 연체 기간
Num_of_Delayed_Payment: 연체된 결제 수
Num_Credit_Inquiries: 신용조회 수
Outstanding_Debt: 미상환 잔금
Credit_Utilization_Ratio: 신용카드 사용률
Credit_History_Age: 신용 거래 이력 기간
Payment_of_Min_Amount: 최소 결제 금액만 납부했는지 여부(리볼빙과 유사)
Total_EMI_per_month: 월별 총 할부 상환액(EMI, Equated Monthly Installment)
Amount_invested_monthly: 매월 투자 금액
Payment_Behaviour: 지불 행동
Monthly_Balance: 월별 잔고
Credit_Score: 신용 점수

# Remove the identifier-like columns; they are not used as features.
credit_df = credit_df.drop(columns=['ID', 'Customer_ID', 'SSN', 'Name'])
credit_df.info()

# Class distribution of the target before encoding.
credit_df['Credit_Score'].value_counts()

# Encode the target labels as integers: Poor -> 0, Standard -> 1, Good -> 2.
score_codes = {'Poor': 0, 'Standard': 1, 'Good': 2}
credit_df['Credit_Score'] = credit_df['Credit_Score'].replace(score_codes)
credit_df.head()

credit_df.describe()

# Bar plot of the encoded credit score per minimum-payment category.
sns.barplot(data=credit_df, x='Payment_of_Min_Amount', y='Credit_Score')

# Same plot per occupation on a wide figure.
plt.figure(figsize=(20, 5))
sns.barplot(data=credit_df, x='Occupation', y='Credit_Score')

# Annotated correlation heatmap of the numeric columns, colour scale fixed to [-1, 1].
plt.figure(figsize=(12, 12))
sns.heatmap(
    credit_df.corr(numeric_only=True),
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)

credit_df.info()

# Print the name of every column that is still stored with object dtype.
object_columns = [name for name, dtype in credit_df.dtypes.items() if dtype == 'O']
for name in object_columns:
    print(name)

credit_df

# Text columns that hold numbers: strip every underscore, then parse as numeric.
underscore_columns = [
    'Age',
    'Annual_Income',
    'Num_of_Loan',
    'Num_of_Delayed_Payment',
    'Outstanding_Debt',
    'Amount_invested_monthly',
]
for column_name in underscore_columns:
    without_underscores = credit_df[column_name].str.replace('_', '')
    credit_df[column_name] = pd.to_numeric(without_underscores)

credit_df.info()

credit_df['Credit_History_Age']

# Convert Credit_History_Age from text to a number of months.
# Example: "22 Years and 1 Months" -> 22 * 12 + 1 = 265
history_parts = (
    credit_df['Credit_History_Age']
    .str.replace(' Months', '')          # "22 Years and 1 Months" -> "22 Years and 1"
    .str.split(' Years and ', expand=True)  # column 0 = years, column 1 = months
)
history_years = pd.to_numeric(history_parts[0])
history_months = pd.to_numeric(history_parts[1])
credit_df['Credit_History_Age'] = history_years * 12 + history_months
credit_df.head()

credit_df.describe()

# Inspect the rows whose age is negative.
credit_df.loc[credit_df['Age'] < 0]

# Drop them: keep only non-negative ages.
credit_df = credit_df.loc[credit_df['Age'] >= 0]

# Lowest and highest remaining ages.
credit_df.sort_values(by='Age').head(5)

credit_df.sort_values(by='Age').tail(20)

sns.boxplot(y=credit_df['Age'])

# Rows with an age of 100 or more, ordered by age.
credit_df.loc[credit_df['Age'] >= 100].sort_values(by='Age')

# Keep only ages below 110.
credit_df = credit_df.loc[credit_df['Age'] < 110]

credit_df.describe()

# Fraction of rows with more than 10 bank accounts.
over_ten_accounts = credit_df['Num_Bank_Accounts'] > 10
len(credit_df[over_ten_accounts]) / len(credit_df)
# Author's note: almost no rows fall between 10 and 50 accounts,
# so 10 accounts is used as the cutoff.

credit_df = credit_df.loc[credit_df['Num_Bank_Accounts'] <= 10]
credit_df.describe()

# Fraction of rows with more than 20 credit cards.
over_twenty_cards = credit_df['Num_Credit_Card'] > 20
len(credit_df[over_twenty_cards]) / len(credit_df)

credit_df = credit_df.loc[credit_df['Num_Credit_Card'] <= 20]
credit_df.describe()

# Interest rate: keep rows with a rate of at most 40.
credit_df = credit_df.loc[credit_df['Interest_Rate'] <= 40]
credit_df.describe()

# Number of rows with more than 20 loans.
len(credit_df.loc[credit_df['Num_of_Loan'] > 20])

# Keep loan counts in the closed range 0..20 (rows with a missing count are dropped too).
credit_df = credit_df.loc[credit_df['Num_of_Loan'].between(0, 20)]

credit_df.describe()

# Keep rows whose delay from the due date is not negative.
credit_df = credit_df.loc[credit_df['Delay_from_due_date'] >= 0]
len(credit_df.loc[credit_df['Num_of_Delayed_Payment'] > 30])

# Keep delayed-payment counts in the closed range 0..30 (missing counts are dropped too).
credit_df = credit_df.loc[credit_df['Num_of_Delayed_Payment'].between(0, 30)]
credit_df.describe()

credit_df.info()

# Missing credit-inquiry counts are filled with 0.
# A frame-level fillna with a per-column dict returns a new DataFrame, so no
# column is assigned into the frame produced by the preceding boolean row
# filters (that assignment can raise SettingWithCopyWarning on pandas without
# copy-on-write).
credit_df = credit_df.fillna({'Num_Credit_Inquiries': 0})
credit_df.isna().mean()

# Distribution of the credit history length.
sns.displot(credit_df['Credit_History_Age'])

sns.displot(credit_df['Amount_invested_monthly'])

sns.displot(credit_df['Monthly_Balance'])

# Fill the remaining numeric gaps with each column's median.
credit_df = credit_df.fillna(credit_df.median(numeric_only=True))
credit_df.isna().mean()
# Nulls are now handled everywhere except Type_of_Loan (non-numeric, so it has no median).

credit_df.head()

# Exercise:
# - Collect every loan product that appears in Type_of_Loan.
# - Replace NaN with 'No Loan'.
# - Create one column per loan product holding 1 if the row has that product
#   and 0 otherwise (similar to one-hot encoding).

# Example raw value: "Auto Loan, Auto Loan, and Not Specified".
# Removing the literal "and " leaves a plain ", "-separated list.
credit_df['Type_of_Loan'] = credit_df['Type_of_Loan'].str.replace('and ', '', regex=False)
credit_df.isna().mean()

credit_df['Type_of_Loan'] = credit_df['Type_of_Loan'].fillna('No Loan')
credit_df.isna().mean()

# Split each row once and reuse the result for both steps below.
loan_lists = credit_df['Type_of_Loan'].str.split(', ')

# explode() flattens the per-row lists in one pass (summing lists with .sum()
# re-copies the growing list for every row); set() removes duplicates.
type_list = set(loan_lists.explode())
type_list

# Iterate in sorted order so the indicator columns are created in the same
# order on every run; iteration order of a set of strings is not stable
# between interpreter runs.
for loan_type in sorted(type_list):
    # Membership is tested on the split list, so only whole product names
    # match (a substring test on the raw text could match part of a name).
    credit_df[loan_type] = loan_lists.apply(
        lambda items, wanted=loan_type: 1 if wanted in items else 0
    )

credit_df.head()

credit_df.drop('Type_of_Loan', axis=1, inplace=True)
credit_df.info()

# Occupation: the placeholder '_______' becomes 'Unknown'.
credit_df['Occupation'].value_counts()

credit_df['Occupation'] = credit_df['Occupation'].replace({'_______': 'Unknown'})
credit_df['Occupation'].value_counts()

# Payment_of_Min_Amount: inspected only; no value is replaced in this column.
credit_df['Payment_of_Min_Amount'].value_counts()

# Payment_Behaviour: the placeholder '!@9#%8' becomes 'Unknown'.
credit_df['Payment_Behaviour'].value_counts()

credit_df['Payment_Behaviour'] = credit_df['Payment_Behaviour'].replace({'!@9#%8': 'Unknown'})
credit_df['Payment_Behaviour'].value_counts()

credit_df.info()

# One-hot encode the three categorical columns cleaned above.
categorical_columns = ['Occupation', 'Payment_of_Min_Amount', 'Payment_Behaviour']
credit_df = pd.get_dummies(credit_df, columns=categorical_columns)
credit_df.head()

credit_df.info()

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; Credit_Score is the target and every
# other column is a feature. The seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    credit_df.drop(columns='Credit_Score'),
    credit_df['Credit_Score'],
    test_size=0.2,
    random_state=2024,
)

X_train.shape, y_train.shape

X_test.shape, y_test.shape

2. lightGBM(LGBM)

Microsoft에서 개발한 Gradient Boosting Framework
리프 중심 히스토그램 기반 알고리즘
충분한 크기의 데이터셋에서 높은 성능을 보이며, 특히 대용량 데이터셋에서 다른 알고리즘보다 빠르게 학습(작은 데이터셋에서는 아래의 과적합 위험에 유의)
메모리 사용량이 상대적으로 적은편
작은 데이터셋을 사용할 경우 과적합 가능성이 매우 큼(일반적으로 데이터가 10,000개 이상은 사용해야 함)
조기 중단(early stopping)을 지원

2-1. 리프 중심 히스토그램 기반 알고리즘

리프 중심(leaf-wise) 분할: 트리의 가지 끝에 해당하는 리프(leaf) 단위로 트리를 성장시키는 방식
모든 리프를 같은 깊이로 나누는 대신, 손실(오차)을 가장 크게 줄일 수 있는 리프를 골라 그 리프를 계속 분할
따라서 트리를 균형 있게(level-wise) 분할하는 것이 아니라, 깊고 비대칭적인(불균형한) 트리가 만들어짐
특성들의 분포를 히스토그램으로 나타내고, 해당 히스토그램을 이용하여 빠르게 후보 분할 기준을 선택
후보 분할 기준 중에서 최적의 분할 기준으로 선택하기 위해, 데이터 포인트들을 히스토그램에 올바르게 배치하고 이를 이용하여 최적의 분할 기준을 선택

2-2. GBM(Gradient Boosting Machine)

순차적으로 모델을 학습시킴
첫번째 모델을 학습시키고, 두번째 모델은 첫번째 모델의 오류를 학습하는 식으로 진행
AdaBoost 계열의 부스팅에서는 각 데이터 포인트에 가중치를 부여하여, 초기에는 모든 데이터 포인트에 동일한 가중치를 주고 이후 잘못 예측된 데이터 포인트의 가중치를 증가시켜 다음 모델이 해당 데이터 포인트에 더 주의를 기울이도록 함. GBM은 가중치를 조정하는 대신, 이전 모델까지의 예측에 대한 손실 함수의 음의 기울기(잔차)를 다음 트리가 학습함
트리가 모두 학습된 후에는 각 트리의 예측값에 학습률을 곱해 모두 더하여 최종 예측을 만듦(분류에서 다수결 투표, 회귀에서 예측값 평균으로 결합하는 것은 랜덤 포레스트 같은 배깅 방식)
- Gradient: 다변수 함수의 각 변수에 대한 변화율을 나타내는 벡터

2-3. 부스팅 모델의 주요 개념

약한 학습기(Weak Learner): 단독으로는 성능이 좋지 않은 간단한 모델(주로 깊이가 얕은 결정 트리, 예: 깊이가 1인 매우 간단한 학습기)을 사용
약한 학습기를 순차적으로 학습시키고 그 다음에는 첫번째 학습기의 오류를 보완하는 두번째 학습기를 학습시킴

from lightgbm import LGBMClassifier

# Baseline LightGBM classifier with default hyper-parameters and a fixed seed.
# The keyword must be spelled `random_state`; the previous spelling
# `randome_state` is not a LightGBM parameter, so the seed was never applied.
base_model = LGBMClassifier(random_state=2024)
base_model.fit(X_train, y_train)

# Predicted class labels for the held-out test features.
pred = base_model.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_test, pred)

# Table of true labels against predicted labels.
confusion_matrix(y_test, pred)

# Per-class precision, recall and F1 report.
print(classification_report(y_test, pred))

# Predicted class probabilities for each test row.
proba = base_model.predict_proba(X_test)
proba

# NOTE(review): the next line looks like one pasted row of the predict_proba
# output; as code it is a bare tuple expression with no effect.
5.31741083e-02, 5.47962223e-01, 3.98863668e-01

# One-vs-rest ROC AUC computed from the class probabilities.
roc_auc_score(y_test, proba, multi_class='ovr')
# For binary classification multi_class does not matter, but with more than two classes it must be specified.