adds split-dataset via new remove-percentage filter wrapper
This commit is contained in:
parent
b1130cf80d
commit
0c32c318af
5 changed files with 66 additions and 2 deletions
|
@ -12,6 +12,7 @@
|
|||
that can be transformed using usual Clojure functions like map, reduce, etc."
|
||||
(:use [clj-ml utils]
|
||||
[clojure.contrib.seq :only [find-first]])
|
||||
(:require [clj-ml.filters :as filters])
|
||||
(:import (weka.core Instance Instances FastVector Attribute)
|
||||
(cljml ClojureInstances)))
|
||||
|
||||
|
@ -431,3 +432,18 @@ The intention is for this to be used on data-formats and not on datasets with da
|
|||
(doto dataset
|
||||
(.deleteAttributeAt (int attr-pos))
|
||||
(.insertAttributeAt new-attr (int attr-pos)))))
|
||||
|
||||
(defn split-dataset
  "Divides ds into two parts according to percentage and returns them as a
   two-element vector of Delay objects; nothing is computed until a Delay is
   dereferenced.

   The first Delay applies the remove-percentage filter with :invert true, so
   it yields the 'percentage' share of the original dataset. The second Delay
   applies the same filter without inversion and yields the remaining portion.

   Use do-split-dataset when the actual datasets are wanted instead of Delays."
  [ds percentage]
  (let [selected-part  (delay (filters/remove-percentage ds {:percentage percentage :invert true}))
        remaining-part (delay (filters/remove-percentage ds {:percentage percentage}))]
    [selected-part remaining-part]))
|
||||
|
||||
(defn do-split-dataset
  "Splits the dataset into two parts based on the percentage given. The same as
   split-dataset, but the actual datasets are returned instead of Delay objects
   that need dereffing.

   Returns a two-element sequence: the 'percentage' share of ds first, the
   remaining portion second. Both parts are computed before this function
   returns."
  [ds percentage]
  ;; map is lazy: without doall the Delays would only be dereferenced when the
  ;; caller first walks the result, which defeats the purpose of this function.
  (doall (map deref (split-dataset ds percentage))))
|
||||
|
|
|
@ -36,7 +36,7 @@
|
|||
The previous sample of code could be rewritten with the make-apply-filter function:
|
||||
|
||||
(def filtered-ds (make-apply-filter :remove-attributes {:attributes [:a :c]} ds))"
|
||||
(:use [clj-ml data utils options-utils]
|
||||
(:use [clj-ml utils options-utils]
|
||||
[clojure.contrib [def :only [defvar defvar-]]])
|
||||
(:require [clojure.contrib [string :as str]])
|
||||
(:import (weka.filters Filter)
|
||||
|
@ -135,6 +135,13 @@
|
|||
|
||||
(deffilter remove-attributes)
|
||||
|
||||
;; Builds the option list for the :remove-percentage filter from the map m:
;; the :percentage value is passed with the "-P" flag, and the :invert switch
;; is mapped to the "-V" flag.
(defmethod make-filter-options :remove-percentage
  [kind m]
  (let [value-opts (check-option-values m {:percentage "-P"})]
    ;; The value-carrying options are handed to check-options as its last
    ;; argument so the switch flags are accumulated on top of them.
    (check-options m {:invert "-V"} value-opts)))
|
||||
|
||||
;; NOTE(review): deffilter is defined outside this view. Presumably it generates
;; the `remove-percentage` function for the :remove-percentage filter, called
;; elsewhere as (remove-percentage dataset options-map) -- confirm.
(deffilter remove-percentage)
|
||||
|
||||
;; Builds the option list for the :remove-useless-attributes filter: the
;; :max-variance value from m is passed with the "-M" flag.
(defmethod make-filter-options :remove-useless-attributes
  [kind m]
  (let [flag-by-key {:max-variance "-M"}]
    (check-option-values m flag-by-key)))
|
||||
|
@ -170,6 +177,7 @@
|
|||
:numeric-to-nominal weka.filters.unsupervised.attribute.NumericToNominal
|
||||
:add-attribute weka.filters.unsupervised.attribute.Add
|
||||
:remove-attributes weka.filters.unsupervised.attribute.Remove
|
||||
:remove-percentage weka.filters.unsupervised.instance.RemovePercentage
|
||||
:remove-useless-attributes weka.filters.unsupervised.attribute.RemoveUseless
|
||||
:select-append-attributes weka.filters.unsupervised.attribute.Copy
|
||||
:project-attributes weka.filters.unsupervised.attribute.Remove}
|
||||
|
@ -188,6 +196,7 @@
|
|||
- :numeric-to-nominal
|
||||
- :add-attribute
|
||||
- :remove-attributes
|
||||
- :remove-percentage
|
||||
- :remove-useless-attributes
|
||||
- :select-append-attributes
|
||||
- :project-attributes
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
(ns #^{:author "Ben Mabey <ben@benmabey.com>"
|
||||
:skip-wiki true}
|
||||
clj-ml.options-utils
|
||||
(:use [clj-ml data])
|
||||
(:use [clojure.contrib.seq :only [find-first]])
|
||||
(:require [clojure.contrib [string :as str]]))
|
||||
|
||||
;; Manipulation of array of options
|
||||
|
@ -26,6 +26,18 @@
|
|||
(conj (conj opts flag) (str val-in-map)))))
|
||||
|
||||
|
||||
;; attr-name and dataset-index-attr are copied and pasted from clj-ml.data due to Clojure's inability
|
||||
;; to handle circular dependencies. :(
|
||||
(defn- attr-name
  "Returns the name of the given weka Attribute."
  [^weka.core.Attribute attr]
  (. attr name))
|
||||
|
||||
(defn- dataset-index-attr
  "Returns the index of an attribute in the attributes definition of a dataset.

   A numeric attr is taken to be the index already and is returned untouched.
   Otherwise the attributes of dataset are scanned in order and the index of
   the first one whose name equals (name attr) is returned, or nil when no
   attribute matches."
  [^weka.core.Instances dataset attr]
  (cond
    (number? attr) attr
    :else (let [attr-total (.numAttributes dataset)
                matches?   (fn [idx]
                             (= (name attr)
                                (attr-name (.attribute dataset (int idx)))))]
            ;; Indices are never nil/false, so returning the index itself from
            ;; the `when` makes `some` yield the first matching index.
            (some (fn [idx] (when (matches? idx) idx))
                  (range attr-total)))))
|
||||
|
||||
(defn extract-attributes
|
||||
"Transforms the :attributes value from m into the appropriate weka flag"
|
||||
([m] (extract-attributes "-R" m))
|
||||
|
|
|
@ -180,3 +180,23 @@
|
|||
(is (= nil (dataset-class-name ds)))
|
||||
(dataset-set-class ds :b)
|
||||
(is (= :b (dataset-class-name ds)))))
|
||||
|
||||
;; A 25% split of a four-row dataset must give one row in the first Delay and
;; the three remaining rows in the second.
(deftest split-dataset-test
  (let [rows  [[1 :foo] [2 :bar] [3 :bar] [4 :foo]]
        ds    (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        [a b] (split-dataset ds 25)]
    (is (= (dataset-count (deref a)) 1))
    (is (= (dataset-count (deref b)) 3))))
|
||||
|
||||
;; Same scenario as split-dataset-test, but do-split-dataset hands back the
;; datasets themselves, so no dereferencing is needed before counting.
(deftest do-split-dataset-test
  (let [rows  [[1 :foo] [2 :bar] [3 :bar] [4 :foo]]
        ds    (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        [a b] (do-split-dataset ds 25)]
    (is (= (dataset-count a) 1))
    (is (= (dataset-count b) 3))))
|
||||
|
|
|
@ -107,6 +107,13 @@
|
|||
(is (= (dataset-format res)
|
||||
[:b {:c '(:g :m)}]))))
|
||||
|
||||
;; Removing 75% of a four-row dataset must leave exactly one row.
(deftest remove-precentage-test
  (let [rows [[1 2 :g]
              [2 3 :m]
              [4 2 :m]
              [4 5 :g]]
        ds   (make-dataset :test [:a :b {:c [:g :m]}] rows)]
    (is (= (dataset-count (remove-percentage ds {:percentage 75}))
           1))))
|
||||
|
||||
(deftest make-apply-filter-numeric-to-nominal
|
||||
(let [ds (make-dataset :test [:a :b {:c [:g :m]}]
|
||||
|
|
Loading…
Reference in a new issue