In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sb


from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics 
from sklearn.metrics import classification_report
/Users/Steve/anaconda3/envs/PythonData/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
In [2]:
%matplotlib inline
rcParams['figure.figsize'] = 10, 8
sb.set_style('whitegrid')
In [3]:
# import file and store into variable name
data = "resources/kickstart201801_cleaned.csv"
In [4]:
# Read our Data file with the pandas library
# Not every CSV requires an encoding, but be aware this can come up
data_df = pd.read_csv(data, encoding="ISO-8859-1")
In [5]:
data_df.head()
Out[5]:
ID name category main_category currency deadline goal launched pledged state backers country usd pledged usd_pledged_real usd_goal_real
0 1000002330 The Songs of Adelaide & Abullah Poetry Publishing GBP 2015-10-09 1000.0 2015-08-11 12:12:28 0.0 failed 0 GB 0.0 0.0 1533.95
1 1000003930 Greeting From Earth: ZGAC Arts Capsule For ET Narrative Film Film & Video USD 2017-11-01 30000.0 2017-09-02 04:43:57 2421.0 failed 15 US 100.0 2421.0 30000.00
2 1000004038 Where is Hank? Narrative Film Film & Video USD 2013-02-26 45000.0 2013-01-12 00:20:50 220.0 failed 3 US 220.0 220.0 45000.00
3 1000007540 ToshiCapital Rekordz Needs Help to Complete Album Music Music USD 2012-04-16 5000.0 2012-03-17 03:24:11 1.0 failed 1 US 1.0 1.0 5000.00
4 1000011046 Community Film Project: The Art of Neighborhoo... Film & Video Film & Video USD 2015-08-29 19500.0 2015-07-04 08:35:03 1283.0 canceled 14 US 1283.0 1283.0 19500.00
In [7]:
from datetime import datetime

#converting launch string to datetime type
data_df['launched_timestamp'] = pd.to_datetime(data_df['launched'], infer_datetime_format=True)
data_df.head()
Out[7]:
ID name category main_category currency deadline goal launched pledged state backers country usd pledged usd_pledged_real usd_goal_real launched_timestamp
0 1000002330 The Songs of Adelaide & Abullah Poetry Publishing GBP 2015-10-09 1000.0 2015-08-11 12:12:28 0.0 failed 0 GB 0.0 0.0 1533.95 2015-08-11 12:12:28
1 1000003930 Greeting From Earth: ZGAC Arts Capsule For ET Narrative Film Film & Video USD 2017-11-01 30000.0 2017-09-02 04:43:57 2421.0 failed 15 US 100.0 2421.0 30000.00 2017-09-02 04:43:57
2 1000004038 Where is Hank? Narrative Film Film & Video USD 2013-02-26 45000.0 2013-01-12 00:20:50 220.0 failed 3 US 220.0 220.0 45000.00 2013-01-12 00:20:50
3 1000007540 ToshiCapital Rekordz Needs Help to Complete Album Music Music USD 2012-04-16 5000.0 2012-03-17 03:24:11 1.0 failed 1 US 1.0 1.0 5000.00 2012-03-17 03:24:11
4 1000011046 Community Film Project: The Art of Neighborhoo... Film & Video Film & Video USD 2015-08-29 19500.0 2015-07-04 08:35:03 1283.0 canceled 14 US 1283.0 1283.0 19500.00 2015-07-04 08:35:03
In [8]:
#converting deadline string to datatime type
data_df['deadline_timestamp'] = pd.to_datetime(data_df['deadline'], infer_datetime_format=True)
data_df.head()
Out[8]:
ID name category main_category currency deadline goal launched pledged state backers country usd pledged usd_pledged_real usd_goal_real launched_timestamp deadline_timestamp
0 1000002330 The Songs of Adelaide & Abullah Poetry Publishing GBP 2015-10-09 1000.0 2015-08-11 12:12:28 0.0 failed 0 GB 0.0 0.0 1533.95 2015-08-11 12:12:28 2015-10-09
1 1000003930 Greeting From Earth: ZGAC Arts Capsule For ET Narrative Film Film & Video USD 2017-11-01 30000.0 2017-09-02 04:43:57 2421.0 failed 15 US 100.0 2421.0 30000.00 2017-09-02 04:43:57 2017-11-01
2 1000004038 Where is Hank? Narrative Film Film & Video USD 2013-02-26 45000.0 2013-01-12 00:20:50 220.0 failed 3 US 220.0 220.0 45000.00 2013-01-12 00:20:50 2013-02-26
3 1000007540 ToshiCapital Rekordz Needs Help to Complete Album Music Music USD 2012-04-16 5000.0 2012-03-17 03:24:11 1.0 failed 1 US 1.0 1.0 5000.00 2012-03-17 03:24:11 2012-04-16
4 1000011046 Community Film Project: The Art of Neighborhoo... Film & Video Film & Video USD 2015-08-29 19500.0 2015-07-04 08:35:03 1283.0 canceled 14 US 1283.0 1283.0 19500.00 2015-07-04 08:35:03 2015-08-29
In [9]:
#calculating duration of each project
data_df['duration'] = pd.Series(delta.days for delta in (data_df['deadline_timestamp'] - data_df['launched_timestamp']))
data_df.head()
Out[9]:
ID name category main_category currency deadline goal launched pledged state backers country usd pledged usd_pledged_real usd_goal_real launched_timestamp deadline_timestamp duration
0 1000002330 The Songs of Adelaide & Abullah Poetry Publishing GBP 2015-10-09 1000.0 2015-08-11 12:12:28 0.0 failed 0 GB 0.0 0.0 1533.95 2015-08-11 12:12:28 2015-10-09 58
1 1000003930 Greeting From Earth: ZGAC Arts Capsule For ET Narrative Film Film & Video USD 2017-11-01 30000.0 2017-09-02 04:43:57 2421.0 failed 15 US 100.0 2421.0 30000.00 2017-09-02 04:43:57 2017-11-01 59
2 1000004038 Where is Hank? Narrative Film Film & Video USD 2013-02-26 45000.0 2013-01-12 00:20:50 220.0 failed 3 US 220.0 220.0 45000.00 2013-01-12 00:20:50 2013-02-26 44
3 1000007540 ToshiCapital Rekordz Needs Help to Complete Album Music Music USD 2012-04-16 5000.0 2012-03-17 03:24:11 1.0 failed 1 US 1.0 1.0 5000.00 2012-03-17 03:24:11 2012-04-16 29
4 1000011046 Community Film Project: The Art of Neighborhoo... Film & Video Film & Video USD 2015-08-29 19500.0 2015-07-04 08:35:03 1283.0 canceled 14 US 1283.0 1283.0 19500.00 2015-07-04 08:35:03 2015-08-29 55
In [10]:
#view which columns have null values and how many
print(data_df.isnull().sum())
ID                    0
name                  0
category              0
main_category         0
currency              0
deadline              0
goal                  0
launched              0
pledged               0
state                 0
backers               0
country               0
usd pledged           0
usd_pledged_real      0
usd_goal_real         0
launched_timestamp    0
deadline_timestamp    0
duration              0
dtype: int64
In [11]:
#value counts on main category
data_df['main_category'].value_counts()
Out[11]:
Film & Video    62696
Music           49530
Publishing      39379
Games           35225
Technology      32562
Design          30066
Art             28152
Food            24599
Fashion         22812
Theater         10912
Comics          10819
Photography     10778
Crafts           8809
Journalism       4754
Dance            3767
Name: main_category, dtype: int64
In [12]:
#dropping columns we are not going to use for model
data_df = data_df.drop(['ID','name','category','currency', 'deadline', 'launched', 'country','goal','pledged','usd pledged', 'usd_pledged_real','launched_timestamp','deadline_timestamp' ], 1)
data_df.head()
Out[12]:
main_category state backers usd_goal_real duration
0 Publishing failed 0 1533.95 58
1 Film & Video failed 15 30000.00 59
2 Film & Video failed 3 45000.00 44
3 Music failed 1 5000.00 29
4 Film & Video canceled 14 19500.00 55
In [13]:
#turning our Y variable of success into binary 1 and 0
data_df['success'] = np.where(data_df.state == 'successful', 1, 0)
data_df.head()
Out[13]:
main_category state backers usd_goal_real duration success
0 Publishing failed 0 1533.95 58 0
1 Film & Video failed 15 30000.00 59 0
2 Film & Video failed 3 45000.00 44 0
3 Music failed 1 5000.00 29 0
4 Film & Video canceled 14 19500.00 55 0
In [14]:
#dropping more columns we wont use for our model
data_df = data_df.drop(['state', 'backers'], 1)
data_df.head()
Out[14]:
main_category usd_goal_real duration success
0 Publishing 1533.95 58 0
1 Film & Video 30000.00 59 0
2 Film & Video 45000.00 44 0
3 Music 5000.00 29 0
4 Film & Video 19500.00 55 0
In [15]:
#get dummies for main category column and store into own df
main_cat_dmy = pd.get_dummies(data_df['main_category'], prefix_sep="_", drop_first=True)
main_cat_dmy.head()
Out[15]:
Comics Crafts Dance Design Fashion Film & Video Food Games Journalism Music Photography Publishing Technology Theater
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
2 0 0 0 0 0 1 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 1 0 0 0 0
4 0 0 0 0 0 1 0 0 0 0 0 0 0 0
In [16]:
#drop main category
data_df.drop(['main_category'],axis=1,inplace=True)
data_df.head()
Out[16]:
usd_goal_real duration success
0 1533.95 58 0
1 30000.00 59 0
2 45000.00 44 0
3 5000.00 29 0
4 19500.00 55 0
In [17]:
#concatenate dummies with data df
data_dmy = pd.concat([data_df,main_cat_dmy],axis=1)
data_dmy.head()
Out[17]:
usd_goal_real duration success Comics Crafts Dance Design Fashion Film & Video Food Games Journalism Music Photography Publishing Technology Theater
0 1533.95 58 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
1 30000.00 59 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
2 45000.00 44 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
3 5000.00 29 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
4 19500.00 55 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
In [18]:
X = data_dmy.iloc[:,[0,1,3,4,5,6,7,8,9,10,11,12,13,14,15]]
y = data_dmy.iloc[:,2]
In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state=25)
In [20]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier
Out[20]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
In [21]:
classifier.fit(X_train, y_train)
Out[21]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
In [22]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
Training Data Score: 0.6430362573455995
Testing Data Score: 0.642684379946291
In [23]:
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})
Out[23]:
Prediction Actual
154259 0 0
50036 0 1
176911 0 0
284196 0 0
104965 0 0
294841 0 0
205935 0 1
123475 0 0
225886 0 1
132483 0 1
40495 0 0
242371 0 1
289283 0 0
227102 0 0
284551 0 1
99713 0 0
12708 0 0
357879 0 0
103027 0 1
291681 0 1
365795 0 0
216444 0 0
24976 0 0
303340 0 1
88880 0 0
251675 0 1
148805 0 0
177130 0 1
163468 0 0
116514 0 0
... ... ...
12022 0 0
248672 0 0
13148 0 0
138629 0 1
102604 0 0
115378 0 0
296323 0 0
304036 0 1
244718 0 0
243199 0 0
121195 0 0
280298 0 0
243109 0 0
270883 0 0
154403 0 0
94411 0 0
127833 0 0
99903 0 0
20774 0 0
322446 0 1
323852 0 1
56258 0 1
218689 0 1
64377 0 0
321446 0 1
104524 0 1
321069 0 1
314587 0 1
127473 0 0
140978 0 0

112458 rows × 2 columns

In [24]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, predictions)
confusion_matrix
Out[24]:
array([[72275,     0],
       [40183,     0]])

The results from the confusion matrix are telling us that 72275 and 0 are the number of correct predictions. 40183 and 0 are the number of incorrect predictions.

In [25]:
print(classification_report(y_test, predictions))
             precision    recall  f1-score   support

          0       0.64      1.00      0.78     72275
          1       0.00      0.00      0.00     40183

avg / total       0.41      0.64      0.50    112458

/Users/Steve/anaconda3/envs/PythonData/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier to not label a sample as positive if it is negative.

The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.

The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0.

The F-beta score weights the recall more than the precision by a factor of beta. beta = 1.0 means recall and precision are equally important.

The support is the number of occurrences of each class in y_test.

Taking the independent variables of main category and goal amount, we can predict success with 64% accuracy. To better tune our model, maybe we can use more features such as sub categories, country location, or length of project.