# Telco Customer Churn Analysis
import pandas as pd
import sklearn as sk
# Load the raw Telco churn export and take a first look at it.
csv_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
print(df.head())
# Transposed view: a bare expression, so it only displays in a notebook/REPL.
df.T
# Column dtypes, to spot columns that were parsed with an unexpected type.
print(df.dtypes)
# Goal: predict whether a customer will leave the service (churn).
# Companies estimate how likely a customer is to leave so that they can respond, for example by offering a discount.
# All columns except Churn -- features (attributes)
# Churn -- target
# SeniorCitizen -- int64 -- really a boolean flag stored as 0/1 rather than True/False,
# but that is not a problem since it is a single column.
# TotalCharges -- object -- it should have been numeric, but it was read as text because some entries could not be parsed as numbers (reportedly because of a comma).
# Convert TotalCharges to a numeric dtype; entries that cannot be parsed
# become NaN (errors="coerce") instead of raising.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
print(df.dtypes)
# Missing values per column after coercion.
print(df.isna().sum())
# Replace the missing TotalCharges values with 0.
df["TotalCharges"] = df["TotalCharges"].fillna(0)
# Same per-column count again, after the fill.
print(df.isna().sum())
# Normalise column names: lower-case, spaces replaced by underscores.
lowered_names = df.columns.str.lower()
df.columns = lowered_names.str.replace(" ","_")

# Apply the same normalisation to the values of every object-dtype column.
string_columns = [
    name for name, dtype in df.dtypes.items() if dtype == "object"
]
for column in string_columns:
    lowered_values = df[column].str.lower()
    df[column] = lowered_values.str.replace(" ","_")
print(df.head())

# Encode the target as an integer: 1 where churn is "yes", 0 otherwise.
df["churn"] = df["churn"].eq("yes").astype(int)
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, then split the remaining 80%
# into 75% train / 25% validation (roughly 60/20/20 overall).
df_train_full, df_test = train_test_split(
    df, train_size=0.8, random_state=42
)
df_train, df_val = train_test_split(
    df_train_full, train_size=0.75, random_state=42
)

# Pull the target out of the feature frames so the label is not used as a
# feature; pop() returns the column and removes it in one step.
y_train = df_train.pop("churn").to_numpy()
y_val = df_val.pop("churn").to_numpy()
# Feature columns, grouped by how they are treated by the vectorizer.
categorical = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]
numerical = ["tenure", "monthlycharges", "totalcharges"]

# One dict per training row ({column: value}), restricted to the features.
feature_columns = categorical + numerical
train_dict = df_train[feature_columns].to_dict(orient="records")
# Show the first record as a sanity check.
print(train_dict[:1])
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Learn the feature mapping from the training records and encode them as a
# dense matrix in a single step.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

# Fit the classifier on the encoded training data (fit() returns the model).
model = LogisticRegression(solver="liblinear", random_state=42).fit(
    X_train, y_train
)
# Encode the validation rows with the vectorizer fitted on the training data.
# Fix: the records must come from df_val; the original built them from
# df_train, so X_val / y_pred described training rows and did not line up
# with y_val.
val_dict = df_val[categorical+numerical].to_dict(orient="records")
X_val = dv.transform(val_dict)
# Class probabilities for each validation row (one column per class).
y_pred = model.predict_proba(X_val)
# Note: this is the accuracy on the training set, not on the validation set.
print(model.score(X_train,y_train))
# Intercept (bias term) of the fitted model.
print(model.intercept_[0])
#print(dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3))))