clj-ml/test/clj_ml/data_test.clj

246 lines
9.5 KiB
Clojure
Raw Normal View History

2010-02-28 12:14:17 +00:00
(ns clj-ml.data-test
(:use [clj-ml.data] :reload-all)
(:use [clojure.test]))
;; Building an instance from a vector of numbers against an all-numeric
;; dataset must produce a weka.core.Instance holding the values as doubles,
;; in attribute order.
(deftest make-instance-num
  (let [dataset (make-dataset :test
                              [:a :b]
                              1)
        inst    (make-instance dataset [1 2])]
    (is (instance? weka.core.Instance inst))
    ;; one stored value per declared attribute
    (is (= 2 (.numValues inst)))
    (is (= 1.0 (.value inst 0)))
    (is (= 2.0 (.value inst 1)))))
2010-02-28 12:14:17 +00:00
;; Building an instance against a dataset that mixes a numeric attribute (:a)
;; with a nominal one (:b, labels :b1/:b2): the keyword label passed in must
;; be readable back as its string name.
(deftest make-instance-ord
  (let [dataset (make-dataset :test
                              [:a {:b [:b1 :b2]}]
                              1)
        inst    (make-instance dataset [1 :b1])]
    (is (instance? weka.core.Instance inst))
    (is (= 2 (.numValues inst)))
    (is (= 1.0 (.value inst 0)))
    ;; nominal values are checked through their label, not their numeric code
    (is (= "b1" (.stringValue inst 1)))))
;; A nil entry in the value vector must still yield a full-width instance;
;; the nil slot is expected to read back as NaN.
(deftest make-instance-nils
  (let [dataset (make-dataset :test
                              [:a :b]
                              1)
        inst    (make-instance dataset [1 nil])]
    (is (instance? weka.core.Instance inst))
    (is (= 2 (.numValues inst)))
    (is (= 1.0 (.value inst 0)))
    ;; NaN never equals itself, so it has to be checked with Double/isNaN
    (is (Double/isNaN (.value inst 1)))))
2010-03-03 13:46:08 +00:00
;; The options map given to make-dataset may name the class attribute either
;; by keyword or by zero-based index; both forms are checked here.
(deftest dataset-make-dataset-with-default-class
  (let [attrs    [:a :b {:c [:d :e]}]
        by-name  (make-dataset :test attrs [] {:class :c})
        by-index (make-dataset :test attrs [] {:class 2})]
    ;; expected value first, matching the other assertions in this file
    (is (= :c (dataset-class-name by-name)))
    (is (= 2 (dataset-class-index by-index)))))
2010-02-28 12:14:17 +00:00
;; dataset-set-class must update the class index of the dataset it is given
;; and return a dataset whose class index can be read back, whether the
;; attribute is designated by index or by name.
(deftest dataset-change-class
  (let [dataset (make-dataset :test
                              [:a :b]
                              2)]
    ;; called for its side effect only: the next assertion reads the index
    ;; from the original dataset object
    (dataset-set-class dataset 1)
    (is (= 1 (.classIndex dataset)))
    (is (= 0 (.classIndex (dataset-set-class dataset 0))))
    ;; NOTE(review): only string names are exercised below, although the
    ;; description also mentions symbols
    (testing "when a string or symbol is passed in"
      (is (= 1 (.classIndex (dataset-set-class dataset "b"))))
      (is (= 0 (.classIndex (dataset-set-class dataset "a")))))))
2010-12-07 00:36:36 +00:00
;; dataset-class-values should return the class column of every row:
;; doubles when the class attribute is numeric, label strings when it is
;; nominal.
(deftest dataset-class-values-test
  (let [dataset (make-dataset :test
                              [:age :iq {:favorite-color [:red :blue :green]}]
                              [[12 100 :red]
                               [14 110 :blue]
                               [25 120 :green]])]
    ;; each section re-targets the class attribute on the same dataset
    ;; before reading the values back
    (testing "when the class is numeric"
      (dataset-set-class dataset :iq)
      (is (= [100.0 110.0 120.0] (dataset-class-values dataset))))
    (testing "when the class is nominal"
      (dataset-set-class dataset :favorite-color)
      (is (= ["red" "blue" "green"] (dataset-class-values dataset))))))
;; Reading, replacing and extending the name of a dataset.
(deftest dataset-name-utils
  (let [rows [[12 100 :red]
              [14 110 :blue]
              [25 120 :green]]
        ds   (make-dataset :test
                           [:age :iq {:favorite-color [:red :blue :green]}]
                           rows)]
    (is (= "test" (dataset-name ds)))
    (is (= "new-name"
           (-> ds (dataset-set-name "new-name") dataset-name)))
    ;; the expected suffix result builds on the rename performed just above,
    ;; so these two assertions must stay in this order
    (is (= "new-name-extra"
           (-> ds (dataset-append-name "-extra") dataset-name)))))
2010-02-28 12:14:17 +00:00
;; After a single dataset-add, dataset-count must report exactly one row.
(deftest dataset-count-1
  (let [ds (doto (make-dataset :test [:a :b] 2)
             (dataset-add [1 2]))]
    (is (= 1 (dataset-count ds)))))
2010-02-28 12:14:17 +00:00
;; dataset-add with a plain vector of values: the row must end up as the
;; last instance of the dataset, with its values stored as doubles.
(deftest dataset-add-1
  (let [ds     (doto (make-dataset :test [:a :b] 2)
                 (dataset-add [1 2]))
        newest (.lastInstance ds)]
    (is (= 1.0 (.value newest 0)))
    (is (= 2.0 (.value newest 1)))))
2010-02-28 12:14:17 +00:00
;; dataset-add with an instance already built by make-instance (rather than
;; a raw vector): the row must end up as the last instance of the dataset.
(deftest dataset-add-2
  (let [ds  (make-dataset :test [:a :b] 2)
        row (make-instance ds [1 2])]
    (dataset-add ds row)
    (let [newest (.lastInstance ds)]
      (is (= 1.0 (.value newest 0)))
      (is (= 2.0 (.value newest 1))))))
2010-02-28 12:14:17 +00:00
;; dataset-extract-at must hand back the row at the given position and
;; leave the dataset without it.
(deftest dataset-extract-at-1
  (let [ds   (doto (make-dataset :test [:a :b] 2)
               (dataset-add [1 2]))
        tail (.lastInstance ds)]
    ;; sanity-check the stored row before it is taken out
    (is (= 1.0 (.value tail 0)))
    (is (= 2.0 (.value tail 1)))
    (let [removed (dataset-extract-at ds 0)]
      (is (= 0 (.numInstances ds)))
      (is (= 1.0 (.value removed 0)))
      (is (= 2.0 (.value removed 1))))))
2010-02-28 12:14:17 +00:00
;; dataset-pop must hand back a row and leave the (single-row) dataset
;; empty afterwards.
(deftest dataset-pop-1
  (let [ds   (doto (make-dataset :test [:a :b] 2)
               (dataset-add [1 2]))
        tail (.lastInstance ds)]
    ;; sanity-check the stored row before it is popped
    (is (= 1.0 (.value tail 0)))
    (is (= 2.0 (.value tail 1)))
    (let [popped (dataset-pop ds)]
      (is (= 0 (.numInstances ds)))
      (is (= 1.0 (.value popped 0)))
      (is (= 2.0 (.value popped 1))))))
2010-02-28 12:14:17 +00:00
;; dataset-seq must expose the rows of a dataset as a sequential collection.
(deftest dataset-seq-1
  (let [ds        (make-dataset :test
                                [:a :b {:c [:e :f]}]
                                [[1 2 :e] [3 4 :f]])
        ;; local named so that it does not hide clojure.core/seq
        instances (dataset-seq ds)]
    (is (sequential? instances))))
;; Round-trips a dataset built from one map row and one vector row (each
;; with a nil value) through the map/vector conversion helpers.
(deftest working-sequences-and-helpers
  (let [ds (make-dataset "test" [:a :b {:c [:d :e]}] [{:a 1 :b 2 :c nil} [4 nil :e]])]
    (is (= 2 (dataset-count ds)))
    ;; numbers come back as doubles, nominal labels as keywords, nils as nil
    (is (= [{:a 1.0 :b 2.0 :c nil} {:a 4.0 :b nil :c :e}] (dataset-as-maps ds)))
    ;; each row map carries a :weight entry in its metadata
    (is (= [{:weight 1.0} {:weight 1.0}] (map meta (dataset-as-maps ds))))
    (is (= [[1.0 2.0 nil] [4.0 nil :e]] (dataset-as-vecs ds)))
    ;; converting instance by instance must agree with dataset-as-maps
    (is (= [{:weight 1.0} {:weight 1.0}] (map #(meta (instance-to-map %1)) (dataset-seq ds))))
    (is (= [{:a 1.0 :b 2.0 :c nil} {:a 4.0 :b nil :c :e}] (map #(instance-to-map %1) (dataset-seq ds))))))
2010-03-03 13:46:08 +00:00
;; is-dataset? and is-instance? must tell datasets and instances apart, and
;; is-dataset? must reject an unrelated value such as a string.
(deftest dataset-instance-predicates
  (let [ds   (make-dataset "test" [:a :b {:c [:d :e]}] [{:a 1 :b 2 :c :d} [4 5 :e]])
        inst (dataset-at ds 0)]
    (is (is-dataset? ds))
    (is (not (is-dataset? inst)))
    (is (not (is-dataset? "something else")))
    (is (is-instance? inst))
    (is (not (is-instance? ds)))))
;; Attribute accessors: listing, lookup by index or keyword, filtering by
;; kind, and name extraction, on both a dataset and one of its instances.
(deftest attributes-tests
  (let [ds       (make-dataset "test"
                               [:a :b {:c [:d :e]}]
                               [{:a 1 :b 2 :c :d} [4 5 :e]])
        attrs    (attributes ds)
        names-of (fn [attr-coll] (map #(.name %) attr-coll))]
    (is (every? (partial instance? weka.core.Attribute) attrs))
    ;; index lookup and keyword lookup must resolve to the same attribute
    (is (= (first attrs) (attribute-at ds 0) (attribute-at ds :a)))
    (is (= '("a" "b" "c") (names-of attrs)))
    ;; an instance exposes the same attributes as its dataset
    (is (= '("a" "b" "c") (names-of (attributes (dataset-at ds 0)))))
    (is (= [(.attribute ds 2)] (nominal-attributes ds)))
    (is (= [(.attribute ds 0) (.attribute ds 1)] (numeric-attributes ds)))
    (is (= '(:a :b :c) (attribute-names ds)))))
2010-11-08 17:37:02 +00:00
;; dataset-replace-attribute! swaps the nominal attribute :b for one with a
;; different label set; dataset-format must reflect the new labels.
(deftest replacing-attributes
  (let [ds          (make-dataset "test"
                                  [:a {:b [:foo :bar]}]
                                  [[1 :foo] [2 :bar]])
        replacement (nominal-attribute :b [:baz :shaz])]
    (dataset-replace-attribute! ds :b replacement)
    (is (= [:a {:b [:baz :shaz]}] (dataset-format ds)))))
;; Label lookups: the class labels (label -> index map) must match the
;; labels read directly from the class attribute, and attribute-labels must
;; give the label set of a nominal attribute.
(deftest dataset-label-helpers
  (let [rows [{:a 1 :b 2 :c :d} [4 5 :e]]
        ds   (make-dataset "test" [:a :b {:c [:d :e]}] rows)]
    (dataset-set-class ds :c)
    (is (= {:d 0 :e 1}
           (dataset-class-labels ds)
           (dataset-labels-at ds :c)))
    (is (= #{:d :e}
           (-> ds nominal-attributes first attribute-labels)))))
;; dataset-format must reproduce the attribute spec the dataset was built
;; from; headers-only must keep name and format while dropping every row.
(deftest dataset-format-and-headers-test
  (let [spec   [:a {:b [:foo :bar]}]
        ds     (make-dataset "test" spec [[1 :foo] [2 :bar]])]
    (is (= [:a {:b [:foo :bar]}] (dataset-format ds)))
    (let [empty-copy (headers-only ds)]
      (is (= 0 (dataset-count empty-copy)))
      (is (= "test" (dataset-name empty-copy)))
      (is (= [:a {:b [:foo :bar]}] (dataset-format empty-copy))))))
2010-12-30 17:14:57 +00:00
;; dataset-class-name is nil until a class attribute has been chosen, and
;; the attribute's keyword afterwards.
(deftest dataset-class-helpers
  (let [ds (make-dataset "test"
                         [:a {:b [:foo :bar]}]
                         [[1 :foo] [2 :bar]])]
    (is (nil? (dataset-class-name ds)))
    (dataset-set-class ds :b)
    (is (= :b (dataset-class-name ds)))))
;; Splitting four rows at 25 percent must give a 1-row part and a 3-row
;; part. Both parts are dereferenced before being counted.
(deftest split-dataset-percentage-test
  (let [rows   [[1 :foo]
                [2 :bar]
                [3 :bar]
                [4 :foo]]
        ds     (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        halves (split-dataset ds :percentage 25)
        [front back] halves]
    (is (= (dataset-count @front) 1))
    (is (= (dataset-count @back) 3))))
;; Splitting four rows with :num 1 must give a 1-row part and a 3-row part.
;; Both parts are dereferenced before being counted.
(deftest split-dataset-num-test
  (let [rows   [[1 :foo]
                [2 :bar]
                [3 :bar]
                [4 :foo]]
        ds     (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        halves (split-dataset ds :num 1)
        [front back] halves]
    (is (= (dataset-count @front) 1))
    (is (= (dataset-count @back) 3))))
;; do-split-dataset at 25 percent on four rows must give a 1-row part and a
;; 3-row part; here the parts are counted directly, without dereferencing.
(deftest do-split-dataset-test
  (let [rows   [[1 :foo]
                [2 :bar]
                [3 :bar]
                [4 :foo]]
        ds     (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        halves (do-split-dataset ds :percentage 25)
        [front back] halves]
    (is (= (dataset-count front) 1))
    (is (= (dataset-count back) 3))))
2011-12-08 22:27:47 +00:00
;; take-dataset with 2 must keep the first two rows of a four-row dataset.
(deftest take-dataset-test
  (let [rows      [[1 :foo]
                   [2 :bar]
                   [3 :bar]
                   [4 :foo]]
        ds        (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        first-two (take-dataset ds 2)]
    (is (= (dataset-as-vecs first-two) [[1.0 :foo] [2.0 :bar]]))))
;; docs-to-dataset turns a collection of document maps into a dataset plus
;; the matching document ids; the resulting row of each document is looked
;; up by id and compared against fixed expected values.
(deftest docs-to-dataset-test
  (let [doc-a  {:id 10
                :title "Document title 1"
                :fulltext "This is the fulltext..."
                :has-class? false}
        doc-b  {:id 11
                :title "Another document title"
                :fulltext "Some more \"fulltext\"; rabbit artificial machine bananas"
                :has-class? true}
        result (docs-to-dataset [doc-a doc-b] "bananas-model" "."
                                :stemmer true :lowercase false)
        ;; pair every document id with its row vector
        row-by-id (zipmap (:docids result)
                          (dataset-as-vecs (:dataset result)))]
    ;; NOTE(review): the expected weights are compared for exact double
    ;; equality - confirm this is stable across library versions
    (is (= [:no 0.4804530139182014 0.0 0.4804530139182014 0.0]
           (get row-by-id 10)))
    (is (= [:yes 0.0 0.4804530139182014 0.0 0.4804530139182014 0.0 0.4804530139182014 0.0 0.4804530139182014]
           (get row-by-id 11)))))