%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the raw Kickstarter projects export.
df = pd.read_csv("resources/ks-projects-201801.csv", encoding='latin1', low_memory=False)

# Hold out a random 50,000-row validation sample, then remove exactly those
# rows from the working set. Previously only the two rows at positions 1 and
# 50000 were dropped and the validation sample was drawn from the full frame,
# so validation rows were also present in df_test. A fixed seed keeps the
# hold-out reproducible between runs.
df_validate = df.sample(50000, random_state=0)

df_test = df.drop(df_validate.index)

df_validate.head()

df_test.head()

# Missing values per column before cleaning.
df_test.isnull().sum()

# Discard every row that has at least one missing value.
df_test.dropna(inplace=True)

# Missing values per column after cleaning.
df_test.isnull().sum()

ID                  0
name                0
category            0
main_category       0
currency            0
deadline            0
goal                0
launched            0
pledged             0
state               0
backers             0
country             0
usd pledged         0
usd_pledged_real    0
usd_goal_real       0
dtype: int64

# Narrow the cleaned frame to the modelling columns: the project category,
# the goal in USD, the country, and the pledged amount in USD.
df2 = df_test[['category', 'usd_goal_real', 'country', 'usd_pledged_real']]

# Expand the non-numeric columns into indicator columns named
# "<column>_<value>"; the first level of each is dropped.
df2_dummies = pd.get_dummies(
    df2,
    drop_first=True,
    prefix_sep="_",
)

df2.head()

df2_dummies.head()

# Bucket the pledged amount (USD) into five ordered ranges; the resulting
# "USD_Pledged_Bins" column is the label column added to df2_dummies.
from sklearn import preprocessing
from sklearn import utils

# Half-open intervals [low, high) placed on the real boundaries. The previous
# integer edges (49999, 99999, ...) pushed fractional amounts such as
# 49,999.50 into the next bucket up, and the finite top edge turned anything
# above 24,999,999 into NaN. An open-ended top bucket avoids both problems.
y_bins = [0, 50000, 100000, 500000, 1000000, np.inf]

y_bins_labels = ["0-50,000", "50,000-100,000", "100,000-500,000", "500,000-1,000,000", "1,000,000+"]

df2_dummies["USD_Pledged_Bins"] = pd.cut(
    df2_dummies["usd_pledged_real"],
    y_bins,
    labels=y_bins_labels,
    right=False,  # a bucket owns its lower edge, e.g. 50,000 -> "50,000-100,000"
)

df2_dummies.head()

# lab_enc = preprocessing.LabelEncoder()
# encoded = lab_enc.fit_transform(y)

# print(utils.multiclass.type_of_target(encoded))

# from sklearn.preprocessing import LabelEncoder

# X = df2["category"]

# # Step 1: Label-encode data set
# label_encoder = LabelEncoder()
# label_encoder.fit(X)
# encoded_X = label_encoder.transform(X)

# encoded_X

# for label, original_class in zip(encoded_X, X):
#     print('Original Class: ' + str(original_class))
#     print('Encoded Label: ' + str(label))
#     print('-' * 12)

# from keras.utils import to_categorical

# # Step 2: One-hot encoding
# one_hot_X = to_categorical(encoded_X)
# one_hot_X

# df2.head()

# # TODO: replace this hand-written category mapping with proper one-hot encoding (not yet worked out how).
# df2['categories']=np.where(df2.category=='Product Design',1,
#     np.where(df2.category=='Tabletop Games',2,
#              np.where(df2.category=='Video Games',3,
#                       np.where(df2.category=='Hardware',4,
#                                np.where(df2.category=='Documentary',5,
#                                         np.where(df2.category=='Technology',6,
#                                                  np.where(df2.category=='Food',7,
#                                                           np.where(df2.category=='Gadgets',8,
#                                                                    np.where(df2.category=='Music',9,
#                                                                             np.where(df2.category=='Narrative Film',10,0))))))))))

# df2 = df2[df2['categories'] != 0]
# df2.head()

# from sklearn.preprocessing import LabelEncoder

# # Step 1: Label-encode data set
# label_encoder = LabelEncoder()
# label_encoder.fit(X)
# encoded_X = label_encoder.transform(X)

# for label, original_class in zip(encoded_X, X):
#     print('Original Class: ' + str(original_class))
#     print('Encoded Label: ' + str(label))
#     print('-' * 12)

# from keras.utils import to_categorical

# # Step 2: One-hot encoding
# one_hot_X = to_categorical(encoded_X)
# one_hot_X

# Feature matrix: every encoded column except the raw pledged amount and the
# bucket label derived from it.
X = df2_dummies.drop(["usd_pledged_real", "USD_Pledged_Bins"], axis=1)

# Target: the pledged-amount bucket.
y = df2_dummies.loc[:, "USD_Pledged_Bins"]

X.head()

y.head()

0          0-50,000
2          0-50,000
3          0-50,000
4          0-50,000
5    50,000-100,000
Name: USD_Pledged_Bins, dtype: category
Categories (5, object): [0-50,000 < 50,000-100,000 < 100,000-500,000 < 500,000-1,000,000 < 1,000,000+]

# Hold-out split stratified on the target so both parts keep the same bucket
# proportions; random_state pins the shuffle. The test fraction is left at
# train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

from sklearn.model_selection import KFold

# Three-fold splitter with shuffling left at its default.
kf = KFold(n_splits=3)
kf.get_n_splits(X)  # number of splitting iterations; the value is not kept
print(kf)
KFold(n_splits=3, random_state=None, shuffle=False)

KFold(n_splits=3, random_state=None, shuffle=False)

KFold(n_splits=3, random_state=None, shuffle=False)

# Walk the three folds and show which row positions land in each part.
# Fold-specific names are used on purpose: the loop previously rebound
# X_train / X_test / y_train / y_test, silently discarding the stratified
# hold-out split and leaving the last contiguous fold in its place for every
# model fitted afterwards.
for fold_train_index, fold_test_index in kf.split(X):
    print("TRAIN:", fold_train_index, "TEST:", fold_test_index)
    X_fold_train, X_fold_test = X.iloc[fold_train_index], X.iloc[fold_test_index]
    y_fold_train, y_fold_test = y.iloc[fold_train_index], y.iloc[fold_test_index]

TRAIN: [124953 124954 124955 ... 374855 374856 374857] TEST: [     0      1      2 ... 124950 124951 124952]
TRAIN: [     0      1      2 ... 374855 374856 374857] TEST: [124953 124954 124955 ... 249903 249904 249905]
TRAIN: [     0      1      2 ... 249903 249904 249905] TEST: [249906 249907 249908 ... 374855 374856 374857]

X_test.head()

# Random forest of 500 trees with n_jobs=-1; fit() hands back the fitted
# estimator, so construction and training are chained.
rfc = ensemble.RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the forest on the training rows and on the held-out rows.
print('train score:', rfc.score(X_train, y_train))
print('test score:', rfc.score(X_test, y_test))

train score: 0.9787794823274988
test score: 0.9668142773302033

# 10-fold cross-validation of the forest over the full feature matrix, using
# the estimator's default scorer.
scores = cross_val_score(estimator=rfc, X=X, y=y, cv=10)

# Per-fold scores, a blank line, then their mean.
print(scores, end='\n\n')
print('Average:', scores.mean())

[0.96655019 0.96724198 0.96734868 0.9664417  0.96628074 0.96646659
 0.9671602  0.96705261 0.96643901 0.96705261]

Average: 0.9668034307003317

# Logistic regression with an L2 penalty, solved with liblinear; the
# regularization strength C is left at its default.
lr = LogisticRegression(solver='liblinear', penalty='l2')

# fit() returns the estimator, so `fit` and `lr` name the same fitted model.
fit = lr.fit(X_train, y_train)

# Optional diagnostics, left disabled:
# print('Coefficients')
# print(fit.coef_)
# print(fit.intercept_)

# --- Training rows: predicted bucket vs. actual bucket, then the score ---
pred_y_sklearn_train = lr.predict(X_train)

print('\n Accuracy by success (train)')
print(pd.crosstab(pred_y_sklearn_train, y_train))

print('\n Percentage accuracy (train)')
print(lr.score(X_train, y_train))

# --- Held-out rows: same two reports ---
pred_y_sklearn_test = lr.predict(X_test)

print('\n Accuracy by success (test)')
print(pd.crosstab(pred_y_sklearn_test, y_test))

print('\n Percentage accuracy (test)')
print(lr.score(X_test, y_test))

# Optional 10-fold cross-validation of the same model, left disabled:
# scores = cross_val_score(lr, X, y, cv=10)

# print(scores)

 Accuracy by success (train)
USD_Pledged_Bins  0-50,000  50,000-100,000  100,000-500,000  \
row_0                                                         
0-50,000            273203            4228             3175   

USD_Pledged_Bins  500,000-1,000,000  1,000,000+  
row_0                                            
0-50,000                        331         206  

 Percentage accuracy (train)
0.9717581444318372

 Accuracy by success (test)
USD_Pledged_Bins  0-50,000  50,000-100,000  100,000-500,000  \
row_0                                                         
0-50,000             91068            1409             1059   

USD_Pledged_Bins  500,000-1,000,000  1,000,000+  
row_0                                            
0-50,000                        110          69  

 Percentage accuracy (test)
0.9717547884543563

	ID	name	category	main_category	currency	deadline	goal	launched	pledged	state	backers	country	usd pledged	usd_pledged_real	usd_goal_real
0	1000002330	The Songs of Adelaide & Abullah	Poetry	Publishing	GBP	2015-10-09	1000.0	2015-08-11 12:12:28	0.0	failed	0	GB	0.0	0.0	1533.95
2	1000004038	Where is Hank?	Narrative Film	Film & Video	USD	2013-02-26	45000.0	2013-01-12 00:20:50	220.0	failed	3	US	220.0	220.0	45000.00
3	1000007540	ToshiCapital Rekordz Needs Help to Complete Album	Music	Music	USD	2012-04-16	5000.0	2012-03-17 03:24:11	1.0	failed	1	US	1.0	1.0	5000.00
4	1000011046	Community Film Project: The Art of Neighborhoo...	Film & Video	Film & Video	USD	2015-08-29	19500.0	2015-07-04 08:35:03	1283.0	canceled	14	US	1283.0	1283.0	19500.00
5	1000014025	Monarch Espresso Bar	Restaurants	Food	USD	2016-04-01	50000.0	2016-02-26 13:38:27	52375.0	successful	224	US	52375.0	52375.0	50000.00

	usd_goal_real	usd_pledged_real	...	country_US
0	1533.95	0.0	...	0
2	45000.00	220.0	...	1
3	5000.00	1.0	...	1
4	19500.00	1283.0	...	1
5	50000.00	52375.0	...	1

	usd_goal_real	usd_pledged_real	...	country_US	USD_Pledged_Bins
0	1533.95	0.0	...	0	0-50,000
2	45000.00	220.0	...	1	0-50,000
3	5000.00	1.0	...	1	0-50,000
4	19500.00	1283.0	...	1	0-50,000
5	50000.00	52375.0	...	1	50,000-100,000

	usd_goal_real	...	country_US
0	1533.95	...	0
2	45000.00	...	1
3	5000.00	...	1
4	19500.00	...	1
5	50000.00	...	1

	usd_goal_real	...	country_US
252437	30000.00	...	1
252438	300.00	...	1
252439	1000.00	...	1
252440	13761.47	...	0
252441	28000.00	...	1