%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Load the Kickstarter projects dataset.
df = pd.read_csv("resources/ks-projects-201801.csv", encoding='latin1', low_memory=False)

# Hold out 50,000 rows for validation and keep the remainder as the working set.
# BUG FIX: the original did `df_test = df.drop(df.index[[1, 50000]])`, which drops
# only the TWO rows at positions 1 and 50000 (not a range), and then sampled the
# validation set from the full frame — so validation rows leaked into the working
# set. Sampling first (seeded, for reproducibility) and dropping exactly those
# rows makes the two sets disjoint.
df_validate = df.sample(50000, random_state=0)
df_test = df.drop(df_validate.index)
df_validate.head()
df_test.head()

# Inspect and remove rows with missing values from the working set.
df_test.isnull().sum()
df_test.dropna(inplace=True)
df_test.isnull().sum()

# Keep the modelling columns; one-hot encode the categoricals ('category' and
# 'country') while numeric columns pass through unchanged.
df2 = df_test[['category', 'usd_goal_real', 'country', 'usd_pledged_real']]
df2_dummies = pd.get_dummies(df2, prefix_sep="_", drop_first=True)
df2.head()
df2_dummies.head()
# Discretize the continuous pledge totals (USD) into ordinal buckets so the
# prediction task can be treated as classification.
from sklearn import preprocessing
from sklearn import utils
# Bin edges are right-inclusive (pd.cut default); the -1 lower edge ensures
# zero-pledge projects fall into the first bucket.
# BUG FIX: the original top edge was 24,999,999, which silently mapped any
# pledge above it to NaN despite the "1,000,000+" label — use +inf so the
# last bucket really is open-ended.
y_bins = [-1, 49999, 99999, 499999, 999999, np.inf]
y_bins_labels = ["0-50,000", "50,000-100,000", "100,000-500,000", "500,000-1,000,000", "1,000,000+"]
df2_dummies["USD_Pledged_Bins"] = pd.cut(df2_dummies["usd_pledged_real"], y_bins, labels=y_bins_labels)
df2_dummies.head()
# lab_enc = preprocessing.LabelEncoder()
# encoded = lab_enc.fit_transform(y)
# print(utils.multiclass.type_of_target(encoded))
# from sklearn.preprocessing import LabelEncoder
# X = df2["category"]
# # Step 1: Label-encode data set
# label_encoder = LabelEncoder()
# label_encoder.fit(X)
# encoded_X = label_encoder.transform(X)
# encoded_X
# for label, original_class in zip(encoded_X, X):
# print('Original Class: ' + str(original_class))
# print('Encoded Label: ' + str(label))
# print('-' * 12)
# from keras.utils import to_categorical
# # Step 2: One-hot encoding
# one_hot_X = to_categorical(encoded_X)
# one_hot_X
# df2.head()
# #I need to fix this so to that one hot encoding don't know how to yet
# df2['categories']=np.where(df2.category=='Product Design',1,
# np.where(df2.category=='Tabletop Games',2,
# np.where(df2.category=='Video Games',3,
# np.where(df2.category=='Hardware',4,
# np.where(df2.category=='Documentary',5,
# np.where(df2.category=='Technology',6,
# np.where(df2.category=='Food',7,
# np.where(df2.category=='Gadgets',8,
# np.where(df2.category=='Music',9,
# np.where(df2.category=='Narrative Film',10,0))))))))))
# df2 = df2[df2['categories'] != 0]
# df2.head()
# from sklearn.preprocessing import LabelEncoder
# # Step 1: Label-encode data set
# label_encoder = LabelEncoder()
# label_encoder.fit(X)
# encoded_X = label_encoder.transform(X)
# for label, original_class in zip(encoded_X, X):
# print('Original Class: ' + str(original_class))
# print('Encoded Label: ' + str(label))
# print('-' * 12)
# from keras.utils import to_categorical
# # Step 2: One-hot encoding
# one_hot_X = to_categorical(encoded_X)
# one_hot_X
# Assemble the design matrix and the binned target. The raw pledge amount and
# its binned version are both excluded from the features, since the target is
# derived directly from the pledge column.
target_col = "USD_Pledged_Bins"
y = df2_dummies[target_col]
X = df2_dummies.drop(columns=["usd_pledged_real", target_col])
X.head()
y.head()
# Stratified hold-out split so every pledge bin keeps its class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)
from sklearn.model_selection import KFold

# 3-fold cross-validation indices over the full feature matrix.
# NOTE(review): this loop overwrites the stratified X_train/X_test split made
# by train_test_split above — after the loop only the LAST fold remains. KFold
# is unshuffled here, so folds are contiguous row ranges; pass shuffle=True
# (with a random_state) if row order carries structure — confirm intent.
kf = KFold(n_splits=3)
kf.get_n_splits(X)  # number of splitting iterations in the cross-validator
print(kf)
# BUG FIX: the loop body below was unindented in the original (a paste error
# that made it a syntax error); it is restored under the loop. A bare no-op
# `KFold(...)` repr line pasted from notebook output was also removed.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
X_test.head()
# Random-forest baseline: 500 trees, training parallelized across all cores.
rfc = ensemble.RandomForestClassifier(n_estimators=500, n_jobs=-1)
rfc.fit(X_train, y_train)

# Accuracy on both splits of the hold-out partition.
train_accuracy = rfc.score(X_train, y_train)
test_accuracy = rfc.score(X_test, y_test)
print('train score:', train_accuracy)
print('test score:', test_accuracy)

# 10-fold cross-validation over the full data set.
scores = cross_val_score(rfc, X, y, cv=10)
print(scores)
print()
print('Average:', np.mean(scores))
# Logistic-regression model with L2 regularization; the liblinear solver
# handles the multi-class target one-vs-rest.
lr = LogisticRegression(penalty='l2', solver='liblinear')
fit = lr.fit(X_train, y_train)
# Coefficient inspection, kept for reference:
# print('Coefficients')
# print(fit.coef_)
# print(fit.intercept_)

# Predict on both splits, then report a confusion table and the overall
# accuracy for each.
train_predictions = lr.predict(X_train)
test_predictions = lr.predict(X_test)

print('\n Accuracy by success (train)')
print(pd.crosstab(train_predictions, y_train))
print('\n Percentage accuracy (train)')
print(lr.score(X_train, y_train))

print('\n Accuracy by success (test)')
print(pd.crosstab(test_predictions, y_test))
print('\n Percentage accuracy (test)')
print(lr.score(X_test, y_test))

# # CV
# scores = cross_val_score(lr, X, y, cv=10)
# print(scores)