In [1]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

df = pd.read_csv("resources/ks-projects-201801.csv", encoding='latin1', low_memory=False)

df_test = df.drop(df.index[[1, 50000]])

df_validate = df.sample(50000)

df_validate.head()

df_test.head()
Out[1]:
ID name category main_category currency deadline goal launched pledged state backers country usd pledged usd_pledged_real usd_goal_real
0 1000002330 The Songs of Adelaide & Abullah Poetry Publishing GBP 2015-10-09 1000.0 2015-08-11 12:12:28 0.0 failed 0 GB 0.0 0.0 1533.95
2 1000004038 Where is Hank? Narrative Film Film & Video USD 2013-02-26 45000.0 2013-01-12 00:20:50 220.0 failed 3 US 220.0 220.0 45000.00
3 1000007540 ToshiCapital Rekordz Needs Help to Complete Album Music Music USD 2012-04-16 5000.0 2012-03-17 03:24:11 1.0 failed 1 US 1.0 1.0 5000.00
4 1000011046 Community Film Project: The Art of Neighborhoo... Film & Video Film & Video USD 2015-08-29 19500.0 2015-07-04 08:35:03 1283.0 canceled 14 US 1283.0 1283.0 19500.00
5 1000014025 Monarch Espresso Bar Restaurants Food USD 2016-04-01 50000.0 2016-02-26 13:38:27 52375.0 successful 224 US 52375.0 52375.0 50000.00
In [2]:
df_test.isnull().sum()

df_test.dropna(inplace=True)

df_test.isnull().sum()
Out[2]:
ID                  0
name                0
category            0
main_category       0
currency            0
deadline            0
goal                0
launched            0
pledged             0
state               0
backers             0
country             0
usd pledged         0
usd_pledged_real    0
usd_goal_real       0
dtype: int64
In [3]:
df2=df_test[['category', 'usd_goal_real', 'country', 'usd_pledged_real',]]

df2_dummies = pd.get_dummies(df2, prefix_sep="_", drop_first=True)

df2.head()

df2_dummies.head()
Out[3]:
usd_goal_real usd_pledged_real category_Academic category_Accessories category_Action category_Animals category_Animation category_Anthologies category_Apparel category_Apps ... country_IT country_JP country_LU country_MX country_NL country_NO country_NZ country_SE country_SG country_US
0 1533.95 0.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 45000.00 220.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
3 5000.00 1.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
4 19500.00 1283.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
5 50000.00 52375.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 181 columns

In [4]:
#i dont know what these values mean
from sklearn import preprocessing
from sklearn import utils

y_bins = [-1, 49999, 99999, 499999, 999999, 24999999]

y_bins_labels = ["0-50,000", "50,000-100,000", "100,000-500,000", "500,000-1,000,000", "1,000,000+"]

df2_dummies["USD_Pledged_Bins"] = pd.cut(df2_dummies["usd_pledged_real"], y_bins, labels=y_bins_labels)

df2_dummies.head()

# lab_enc = preprocessing.LabelEncoder()
# encoded = lab_enc.fit_transform(y)

# print(utils.multiclass.type_of_target(encoded))
Out[4]:
usd_goal_real usd_pledged_real category_Academic category_Accessories category_Action category_Animals category_Animation category_Anthologies category_Apparel category_Apps ... country_JP country_LU country_MX country_NL country_NO country_NZ country_SE country_SG country_US USD_Pledged_Bins
0 1533.95 0.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0-50,000
2 45000.00 220.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0-50,000
3 5000.00 1.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0-50,000
4 19500.00 1283.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0-50,000
5 50000.00 52375.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 50,000-100,000

5 rows × 182 columns

In [5]:
# from sklearn.preprocessing import LabelEncoder

# X = df2["category"]

# # Step 1: Label-encode data set
# label_encoder = LabelEncoder()
# label_encoder.fit(X)
# encoded_X = label_encoder.transform(X)
In [6]:
# encoded_X
In [7]:
# for label, original_class in zip(encoded_X, X):
#     print('Original Class: ' + str(original_class))
#     print('Encoded Label: ' + str(label))
#     print('-' * 12)
In [ ]:
# from keras.utils import to_categorical

# # Step 2: One-hot encoding
# one_hot_X = to_categorical(encoded_X)
# one_hot_X
In [ ]:
# df2.head()
In [ ]:
# #I need to fix this so to that one hot encoding don't know how to yet
# df2['categories']=np.where(df2.category=='Product Design',1,
#     np.where(df2.category=='Tabletop Games',2,
#              np.where(df2.category=='Video Games',3,
#                       np.where(df2.category=='Hardware',4,
#                                np.where(df2.category=='Documentary',5,
#                                         np.where(df2.category=='Technology',6,
#                                                  np.where(df2.category=='Food',7,
#                                                           np.where(df2.category=='Gadgets',8,
#                                                                    np.where(df2.category=='Music',9,
#                                                                             np.where(df2.category=='Narrative Film',10,0))))))))))
In [1]:
# df2 = df2[df2['categories'] != 0]
# df2.head()
In [11]:
# from sklearn.preprocessing import LabelEncoder

# # Step 1: Label-encode data set
# label_encoder = LabelEncoder()
# label_encoder.fit(X)
# encoded_X = label_encoder.transform(X)
In [2]:
# for label, original_class in zip(encoded_X, X):
#     print('Original Class: ' + str(original_class))
#     print('Encoded Label: ' + str(label))
#     print('-' * 12)
In [3]:
# from keras.utils import to_categorical

# # Step 2: One-hot encoding
# one_hot_X = to_categorical(encoded_X)
# one_hot_X
In [5]:
X = df2_dummies.drop(columns=["usd_pledged_real","USD_Pledged_Bins"])
y = df2_dummies["USD_Pledged_Bins"]
In [6]:
X.head()
Out[6]:
usd_goal_real category_Academic category_Accessories category_Action category_Animals category_Animation category_Anthologies category_Apparel category_Apps category_Architecture ... country_IT country_JP country_LU country_MX country_NL country_NO country_NZ country_SE country_SG country_US
0 1533.95 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 45000.00 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
3 5000.00 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
4 19500.00 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
5 50000.00 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 180 columns

In [7]:
y.head()
Out[7]:
0          0-50,000
2          0-50,000
3          0-50,000
4          0-50,000
5    50,000-100,000
Name: USD_Pledged_Bins, dtype: category
Categories (5, object): [0-50,000 < 50,000-100,000 < 100,000-500,000 < 500,000-1,000,000 < 1,000,000+]
In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)
In [21]:
from sklearn.model_selection import KFold # import KFold

kf = KFold(n_splits=3) # Define the split - into 3 folds 
kf.get_n_splits(X) # returns the number of splitting iterations in the cross-validator
print(kf) 
KFold(n_splits=3, random_state=None, shuffle=False)
KFold(n_splits=3, random_state=None, shuffle=False)
Out[21]:
KFold(n_splits=3, random_state=None, shuffle=False)
In [22]:
for train_index, test_index in kf.split(X):
     print("TRAIN:", train_index, "TEST:", test_index)
     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
TRAIN: [124953 124954 124955 ... 374855 374856 374857] TEST: [     0      1      2 ... 124950 124951 124952]
TRAIN: [     0      1      2 ... 374855 374856 374857] TEST: [124953 124954 124955 ... 249903 249904 249905]
TRAIN: [     0      1      2 ... 249903 249904 249905] TEST: [249906 249907 249908 ... 374855 374856 374857]
In [23]:
X_test.head()
Out[23]:
usd_goal_real category_Academic category_Accessories category_Action category_Animals category_Animation category_Anthologies category_Apparel category_Apps category_Architecture ... country_IT country_JP country_LU country_MX country_NL country_NO country_NZ country_SE country_SG country_US
252437 30000.00 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
252438 300.00 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
252439 1000.00 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
252440 13761.47 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
252441 28000.00 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 180 columns

In [9]:
rfc = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=500)

rfc.fit(X_train, y_train)

print('train score:', rfc.score(X_train, y_train))
print('test score:', rfc.score(X_test, y_test))
train score: 0.9787794823274988
test score: 0.9668142773302033
In [10]:
scores = cross_val_score(rfc, X, y, cv=10)

print(scores)
print()
print('Average:', np.mean(scores))
[0.96655019 0.96724198 0.96734868 0.9664417  0.96628074 0.96646659
 0.9671602  0.96705261 0.96643901 0.96705261]

Average: 0.9668034307003317
In [11]:
# Declare a logistic regression classifier.
# Parameter regularization coefficient C described above.
lr = LogisticRegression(penalty='l2', solver='liblinear')

# Fit the model.
fit = lr.fit(X_train, y_train)

# Display.
# print('Coefficients')
# print(fit.coef_)
# print(fit.intercept_)

pred_y_sklearn_train = lr.predict(X_train)
pred_y_sklearn_test = lr.predict(X_test)

print('\n Accuracy by success (train)')
print(pd.crosstab(pred_y_sklearn_train, y_train))

print('\n Percentage accuracy (train)')
print(lr.score(X_train, y_train))

print('\n Accuracy by success (test)')
print(pd.crosstab(pred_y_sklearn_test, y_test))

print('\n Percentage accuracy (test)')
print(lr.score(X_test, y_test))

# # CV
# scores = cross_val_score(lr, X, y, cv=10)

# print(scores)
 Accuracy by success (train)
USD_Pledged_Bins  0-50,000  50,000-100,000  100,000-500,000  \
row_0                                                         
0-50,000            273203            4228             3175   

USD_Pledged_Bins  500,000-1,000,000  1,000,000+  
row_0                                            
0-50,000                        331         206  

 Percentage accuracy (train)
0.9717581444318372

 Accuracy by success (test)
USD_Pledged_Bins  0-50,000  50,000-100,000  100,000-500,000  \
row_0                                                         
0-50,000             91068            1409             1059   

USD_Pledged_Bins  500,000-1,000,000  1,000,000+  
row_0                                            
0-50,000                        110          69  

 Percentage accuracy (test)
0.9717547884543563