Added example of using Titanic survival data from Kaggle
https://www.kaggle.com/c/titanic-gettingStarted
This commit is contained in:
parent
c650f86c3a
commit
a18cbbae19
1 changed files with 195 additions and 0 deletions
195
README.md
195
README.md
|
@ -491,6 +491,201 @@ user> clustered-ds
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Example: Predicting survival on the Titanic
|
||||||
|
|
||||||
|
https://www.kaggle.com/c/titanic-gettingStarted
|
||||||
|
|
||||||
|
First globally replace all double quoted strings `""foo""` with
|
||||||
|
backslash quoted strings: `\"foo\"`. Weka does not handle the former.
|
||||||
|
|
||||||
|
```clojure
|
||||||
|
user> (def titanicds (load-instances :csv "file:///home/josh/git/clj-ml/titanic-train.csv"))
|
||||||
|
user> titanicds
|
||||||
|
#<Instances @relation stream
|
||||||
|
|
||||||
|
@attribute PassengerId numeric
|
||||||
|
@attribute Survived numeric
|
||||||
|
@attribute Pclass numeric
|
||||||
|
@attribute Name {'Braund, Mr. Owen Harris','Cumings, Mrs. John Bradley (Florence Briggs Thayer)', ...}
|
||||||
|
@attribute Sex {male,female}
|
||||||
|
@attribute Age numeric
|
||||||
|
@attribute SibSp numeric
|
||||||
|
@attribute Parch numeric
|
||||||
|
@attribute Ticket {'A/5 21171','PC 17599','STON/O2. 3101282',113803.0, ...}
|
||||||
|
@attribute Fare numeric
|
||||||
|
@attribute Cabin {C85,C123,E46,G6,C103,D56,A6,'C23 C25 C27', ...}
|
||||||
|
@attribute Embarked {S,C,Q}
|
||||||
|
|
||||||
|
@data
|
||||||
|
1,0,3,'Braund, Mr. Owen Harris',male,22,1,0,'A/5 21171',7.25,?,S
|
||||||
|
2,1,1,'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',female,38,1,0,'PC 17599',71.2833,C85,C
|
||||||
|
3,1,3,'Heikkinen, Miss. Laina',female,26,0,0,'STON/O2. 3101282',7.925,?,S
|
||||||
|
4,1,1,'Futrelle, Mrs. Jacques Heath (Lily May Peel)',female,35,1,0,113803.0,53.1,C123,S
|
||||||
|
5,0,3,'Allen, Mr. William Henry',male,35,0,0,373450.0,8.05,?,S
|
||||||
|
6,0,3,'Moran, Mr. James',male,?,0,0,330877.0,8.4583,?,Q
|
||||||
|
7,0,1,'McCarthy, Mr. Timothy J',male,54,0,0,17463.0,51.8625,E46,S
|
||||||
|
8,0,3,'Palsson, Master. Gosta Leonard',male,2,3,1,349909.0,21.075,?,S
|
||||||
|
9,1,3,'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',female,27,0,2,347742.0,11.1333,?,S
|
||||||
|
10,1,2,'Nasser, Mrs. Nicholas (Adele Achem)',female,14,1,0,237736.0,30.0708,?,C
|
||||||
|
11,1,3,'Sandstrom, Miss. Marguerite Rut',female,4,1,1,'PP 9549',16.7,G6,S
|
||||||
|
...
|
||||||
|
>
|
||||||
|
|
||||||
|
#'user/titanicds
|
||||||
|
user> (def titanicds (dataset-set-class titanicds :Survived))
|
||||||
|
#'user/titanicds
|
||||||
|
user> (dataset-class-index titanicds)
|
||||||
|
1
|
||||||
|
|
||||||
|
user> (def titanicds (make-apply-filter :numeric-to-nominal
|
||||||
|
{:attributes [:Survived]}
|
||||||
|
titanicds))
|
||||||
|
#'user/titanicds
|
||||||
|
user> titanicds
|
||||||
|
#<Instances @relation stream-weka.filters.unsupervised.attribute.NumericToNominal-R2
|
||||||
|
|
||||||
|
@attribute PassengerId numeric
|
||||||
|
@attribute Survived {0,1}
|
||||||
|
@attribute Pclass numeric
|
||||||
|
...
|
||||||
|
>
|
||||||
|
|
||||||
|
user> (def titanicds (make-apply-filter :remove-attributes
|
||||||
|
{:attributes [:PassengerId :Name :Ticket :Cabin]}
|
||||||
|
titanicds))
|
||||||
|
#'user/titanicds
|
||||||
|
user> titanicds
|
||||||
|
#<Instances @relation 'stream-weka.filters.unsupervised.attribute.NumericToNominal...'
|
||||||
|
|
||||||
|
@attribute Survived {0,1}
|
||||||
|
@attribute Pclass numeric
|
||||||
|
@attribute Sex {male,female}
|
||||||
|
@attribute Age numeric
|
||||||
|
@attribute SibSp numeric
|
||||||
|
@attribute Parch numeric
|
||||||
|
@attribute Fare numeric
|
||||||
|
@attribute Embarked {S,C,Q}
|
||||||
|
|
||||||
|
@data
|
||||||
|
0,3,male,22,1,0,7.25,S
|
||||||
|
1,1,female,38,1,0,71.2833,C
|
||||||
|
1,3,female,26,0,0,7.925,S
|
||||||
|
1,1,female,35,1,0,53.1,S
|
||||||
|
0,3,male,35,0,0,8.05,S
|
||||||
|
0,3,male,?,0,0,8.4583,Q
|
||||||
|
...
|
||||||
|
>
|
||||||
|
|
||||||
|
user> (dataset-class-index titanicds)
|
||||||
|
0
|
||||||
|
user> (dataset-class-name titanicds)
|
||||||
|
:Survived
|
||||||
|
|
||||||
|
user> (def evaluation (classifier-evaluate (make-classifier :decision-tree :random-forest)
|
||||||
|
:cross-validation titanicds 10))
|
||||||
|
#'user/evaluation
|
||||||
|
user> (println (:summary evaluation))
|
||||||
|
|
||||||
|
Correctly Classified Instances 727 81.5937 %
|
||||||
|
Incorrectly Classified Instances 164 18.4063 %
|
||||||
|
Kappa statistic 0.6039
|
||||||
|
Mean absolute error 0.2409
|
||||||
|
Root mean squared error 0.3819
|
||||||
|
Relative absolute error 50.9302 %
|
||||||
|
Root relative squared error 78.532 %
|
||||||
|
Total Number of Instances 891
|
||||||
|
|
||||||
|
nil
|
||||||
|
user> (println (:confusion-matrix evaluation))
|
||||||
|
=== Confusion Matrix ===
|
||||||
|
|
||||||
|
a b <-- classified as
|
||||||
|
483 66 | a = 0
|
||||||
|
98 244 | b = 1
|
||||||
|
|
||||||
|
nil
|
||||||
|
```
|
||||||
|
|
||||||
|
Ok, looks good, let's try training on the full training data and
|
||||||
|
testing on the testing data.
|
||||||
|
|
||||||
|
```clojure
|
||||||
|
user> (def titanic-testds (load-instances :csv "file:///home/josh/git/clj-ml/titanic-test.csv"))
|
||||||
|
|
||||||
|
user> (def titanic-test-passids (map (comp int :PassengerId)
|
||||||
|
(dataset-as-maps titanic-testds)))
|
||||||
|
#'user/titanic-test-passids
|
||||||
|
user> titanic-test-passids
|
||||||
|
(892 893 894 895 896 897 898 899 900 ...)
|
||||||
|
|
||||||
|
user> (def titanic-testds (->> titanic-testds
|
||||||
|
(make-apply-filter :remove-attributes
|
||||||
|
{:attributes [:PassengerId :Name :Ticket :Cabin]})
|
||||||
|
(make-apply-filter :add-attribute
|
||||||
|
{:type :nominal :name :Survived
|
||||||
|
:column 0 :labels ["0" "1"]})))
|
||||||
|
#'user/titanic-testds
|
||||||
|
|
||||||
|
user> (def titanic-testds (dataset-set-class titanic-testds :Survived))
|
||||||
|
#'user/titanic-testds
|
||||||
|
|
||||||
|
user> titanic-testds
|
||||||
|
#<Instances @relation 'stream-weka.filters.unsupervised.attribute.Remove...'
|
||||||
|
|
||||||
|
@attribute Survived {0,1}
|
||||||
|
@attribute Pclass numeric
|
||||||
|
@attribute Sex {male,female}
|
||||||
|
@attribute Age numeric
|
||||||
|
@attribute SibSp numeric
|
||||||
|
@attribute Parch numeric
|
||||||
|
@attribute Fare numeric
|
||||||
|
@attribute Embarked {Q,S,C}
|
||||||
|
|
||||||
|
@data
|
||||||
|
?,3,male,34.5,0,0,7.8292,Q
|
||||||
|
?,3,female,47,1,0,7,S
|
||||||
|
?,2,male,62,0,0,9.6875,Q
|
||||||
|
?,3,matitanle,27,0,0,8.6625,S
|
||||||
|
?,3,female,22,1,1,12.2875,S
|
||||||
|
?,3,male,14,0,0,9.225,S
|
||||||
|
?,3,female,30,0,0,7.6292,Q
|
||||||
|
...
|
||||||
|
>
|
||||||
|
|
||||||
|
user> (def classifier (classifier-train (make-classifier :decision-tree :random-forest) titanicds))
|
||||||
|
#'user/classifier
|
||||||
|
|
||||||
|
user> (def preds (for [instance (dataset-seq titanic-testds)]
|
||||||
|
(name (classifier-classify classifier instance))))
|
||||||
|
#'user/preds
|
||||||
|
|
||||||
|
user> preds
|
||||||
|
("0" "1" "0" "0" "0" "0" "1" "0" "0" "0" ...)
|
||||||
|
|
||||||
|
#'user/preds
|
||||||
|
|
||||||
|
user> (spit "titanic-predictions.csv"
|
||||||
|
(clojure.string/join "\n" (cons "Survived,PassengerId"
|
||||||
|
(map (fn [c1 c2] (format "%s,%d" c1 c2))
|
||||||
|
preds titanic-test-passids))))
|
||||||
|
nil
|
||||||
|
|
||||||
|
user> (println (slurp "titanic-predictions.csv"))
|
||||||
|
Survived,PassengerId
|
||||||
|
0,892
|
||||||
|
1,893
|
||||||
|
0,894
|
||||||
|
0,895
|
||||||
|
0,896
|
||||||
|
0,897
|
||||||
|
1,898
|
||||||
|
0,899
|
||||||
|
0,900
|
||||||
|
0,901
|
||||||
|
0,902
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
## Thanks YourKit!
|
## Thanks YourKit!
|
||||||
|
|
||||||
YourKit is kindly supporting open source projects with its
|
YourKit is kindly supporting open source projects with its
|
||||||
|
|
Loading…
Reference in a new issue