adds split-dataset via new remove-percentage filter wrapper
This commit is contained in:
parent
b1130cf80d
commit
0c32c318af
5 changed files with 66 additions and 2 deletions
|
@ -12,6 +12,7 @@
|
||||||
that can be transformed using usual Clojure functions like map, reduce, etc."
|
that can be transformed using usual Clojure functions like map, reduce, etc."
|
||||||
(:use [clj-ml utils]
|
(:use [clj-ml utils]
|
||||||
[clojure.contrib.seq :only [find-first]])
|
[clojure.contrib.seq :only [find-first]])
|
||||||
|
(:require [clj-ml.filters :as filters])
|
||||||
(:import (weka.core Instance Instances FastVector Attribute)
|
(:import (weka.core Instance Instances FastVector Attribute)
|
||||||
(cljml ClojureInstances)))
|
(cljml ClojureInstances)))
|
||||||
|
|
||||||
|
@ -431,3 +432,18 @@ The intention is for this to be used on data-formats and not on datasets with da
|
||||||
(doto dataset
|
(doto dataset
|
||||||
(.deleteAttributeAt (int attr-pos))
|
(.deleteAttributeAt (int attr-pos))
|
||||||
(.insertAttributeAt new-attr (int attr-pos)))))
|
(.insertAttributeAt new-attr (int attr-pos)))))
|
||||||
|
|
||||||
|
(defn split-dataset
  "Lazily splits ds into two datasets according to percentage.

   Returns a vector of two Delay objects. Dereferencing the first runs
   filters/remove-percentage with :invert true and yields the part holding
   `percentage` of the original dataset; dereferencing the second runs the
   same filter without :invert and yields the remaining portion. No
   filtering happens until a delay is dereferenced.

   Use do-split-dataset when you want the datasets themselves instead of
   Delay objects."
  [ds percentage]
  (let [remainder-opts {:percentage percentage}
        ;; :invert flips the selection so this part keeps what the other drops
        selected-opts (assoc remainder-opts :invert true)
        selected (delay (filters/remove-percentage ds selected-opts))
        remainder (delay (filters/remove-percentage ds remainder-opts))]
    [selected remainder]))
|
||||||
|
|
||||||
|
(defn do-split-dataset
  "Splits the dataset into two parts based on the percentage given.

   Behaves like split-dataset, but returns a sequence of the two actual
   datasets rather than Delay objects that need dereffing. Both parts are
   computed before this function returns."
  [ds percentage]
  ;; map is lazy: without doall the delays would only be dereferenced when
  ;; the caller first walks the result, which defeats the purpose of the
  ;; eager variant. doall realizes the seq here and returns that same seq.
  (doall (map deref (split-dataset ds percentage))))
|
||||||
|
|
|
@ -36,7 +36,7 @@
|
||||||
The previous sample of code could be rewritten with the make-apply-filter function:
|
The previous sample of code could be rewritten with the make-apply-filter function:
|
||||||
|
|
||||||
(def filtered-ds (make-apply-filter :remove-attributes {:attributes [:a :c]} ds))"
|
(def filtered-ds (make-apply-filter :remove-attributes {:attributes [:a :c]} ds))"
|
||||||
(:use [clj-ml data utils options-utils]
|
(:use [clj-ml utils options-utils]
|
||||||
[clojure.contrib [def :only [defvar defvar-]]])
|
[clojure.contrib [def :only [defvar defvar-]]])
|
||||||
(:require [clojure.contrib [string :as str]])
|
(:require [clojure.contrib [string :as str]])
|
||||||
(:import (weka.filters Filter)
|
(:import (weka.filters Filter)
|
||||||
|
@ -135,6 +135,13 @@
|
||||||
|
|
||||||
(deffilter remove-attributes)
|
(deffilter remove-attributes)
|
||||||
|
|
||||||
|
(defmethod make-filter-options :remove-percentage
  ;; Builds the option list for the :remove-percentage filter from the map m:
  ;; the :percentage entry is handled by check-option-values under the "-P"
  ;; flag, and the :invert entry by check-options under the "-V" flag. The
  ;; result of the first call is passed on as the accumulated options of the
  ;; second.
  ([kind m]
     (let [percentage-opts (check-option-values m {:percentage "-P"})]
       (check-options m {:invert "-V"} percentage-opts))))
|
||||||
|
|
||||||
|
;; Defines the `remove-percentage` filter function via the deffilter macro.
;; Callers invoke it as (remove-percentage dataset {:percentage n :invert bool});
;; NOTE(review): presumably it builds and applies the :remove-percentage filter
;; using the make-filter-options method above -- confirm against deffilter.
(deffilter remove-percentage)
|
||||||
|
|
||||||
(defmethod make-filter-options :remove-useless-attributes
|
(defmethod make-filter-options :remove-useless-attributes
|
||||||
([kind m]
|
([kind m]
|
||||||
(check-option-values m {:max-variance "-M"})))
|
(check-option-values m {:max-variance "-M"})))
|
||||||
|
@ -170,6 +177,7 @@
|
||||||
:numeric-to-nominal weka.filters.unsupervised.attribute.NumericToNominal
|
:numeric-to-nominal weka.filters.unsupervised.attribute.NumericToNominal
|
||||||
:add-attribute weka.filters.unsupervised.attribute.Add
|
:add-attribute weka.filters.unsupervised.attribute.Add
|
||||||
:remove-attributes weka.filters.unsupervised.attribute.Remove
|
:remove-attributes weka.filters.unsupervised.attribute.Remove
|
||||||
|
:remove-percentage weka.filters.unsupervised.instance.RemovePercentage
|
||||||
:remove-useless-attributes weka.filters.unsupervised.attribute.RemoveUseless
|
:remove-useless-attributes weka.filters.unsupervised.attribute.RemoveUseless
|
||||||
:select-append-attributes weka.filters.unsupervised.attribute.Copy
|
:select-append-attributes weka.filters.unsupervised.attribute.Copy
|
||||||
:project-attributes weka.filters.unsupervised.attribute.Remove}
|
:project-attributes weka.filters.unsupervised.attribute.Remove}
|
||||||
|
@ -188,6 +196,7 @@
|
||||||
- :numeric-to-nominal
|
- :numeric-to-nominal
|
||||||
- :add-attribute
|
- :add-attribute
|
||||||
- :remove-attributes
|
- :remove-attributes
|
||||||
|
- :remove-percentage
|
||||||
- :remove-useless-attributes
|
- :remove-useless-attributes
|
||||||
- :select-append-attributes
|
- :select-append-attributes
|
||||||
- :project-attributes
|
- :project-attributes
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
(ns #^{:author "Ben Mabey <ben@benmabey.com>"
|
(ns #^{:author "Ben Mabey <ben@benmabey.com>"
|
||||||
:skip-wiki true}
|
:skip-wiki true}
|
||||||
clj-ml.options-utils
|
clj-ml.options-utils
|
||||||
(:use [clj-ml data])
|
(:use [clojure.contrib.seq :only [find-first]])
|
||||||
(:require [clojure.contrib [string :as str]]))
|
(:require [clojure.contrib [string :as str]]))
|
||||||
|
|
||||||
;; Manipulation of array of options
|
;; Manipulation of array of options
|
||||||
|
@ -26,6 +26,18 @@
|
||||||
(conj (conj opts flag) (str val-in-map)))))
|
(conj (conj opts flag) (str val-in-map)))))
|
||||||
|
|
||||||
|
|
||||||
|
;; attr-name and dataset-index-attr were copied and pasted from clj-ml.data due to Clojure's inability
|
||||||
|
;; to handle circular dependencies. :(
|
||||||
|
(defn- attr-name
  "Returns the name of the given weka Attribute, as reported by its
   name method."
  [^weka.core.Attribute attr]
  (. attr (name)))
|
||||||
|
|
||||||
|
(defn- dataset-index-attr
  "Returns the index of an attribute in the attributes definition of a dataset.

   When attr is a number it is returned untouched. Otherwise (name attr) is
   compared against the name of every attribute of the dataset, in index
   order, and the first matching index is returned (nil when none matches)."
  [^weka.core.Instances dataset attr]
  (if-not (number? attr)
    (let [attr-count (.numAttributes dataset)
          ;; true when the attribute stored at position idx carries attr's name
          named-like-attr? (fn [idx]
                             (= (name attr)
                                (attr-name (.attribute dataset (int idx)))))]
      (find-first named-like-attr? (range attr-count)))
    attr))
|
||||||
|
|
||||||
(defn extract-attributes
|
(defn extract-attributes
|
||||||
"Transforms the :attributes value from m into the appropriate weka flag"
|
"Transforms the :attributes value from m into the appropriate weka flag"
|
||||||
([m] (extract-attributes "-R" m))
|
([m] (extract-attributes "-R" m))
|
||||||
|
|
|
@ -180,3 +180,23 @@
|
||||||
(is (= nil (dataset-class-name ds)))
|
(is (= nil (dataset-class-name ds)))
|
||||||
(dataset-set-class ds :b)
|
(dataset-set-class ds :b)
|
||||||
(is (= :b (dataset-class-name ds)))))
|
(is (= :b (dataset-class-name ds)))))
|
||||||
|
|
||||||
|
(deftest split-dataset-test
  ;; Splitting four instances at 25 should give one instance in the first
  ;; part and three in the second. Both parts are delays and are dereferenced
  ;; inside the assertions.
  (let [rows [[1 :foo]
              [2 :bar]
              [3 :bar]
              [4 :foo]]
        ds (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        [selected remainder] (split-dataset ds 25)]
    (is (= (dataset-count (deref selected)) 1))
    (is (= (dataset-count (deref remainder)) 3))))
|
||||||
|
|
||||||
|
(deftest do-split-dataset-test
  ;; Same scenario as the delayed split, except that do-split-dataset hands
  ;; back the datasets directly, so no dereferencing is needed.
  (let [rows [[1 :foo]
              [2 :bar]
              [3 :bar]
              [4 :foo]]
        ds (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        [selected remainder] (do-split-dataset ds 25)]
    (is (= (dataset-count selected) 1))
    (is (= (dataset-count remainder) 3))))
|
||||||
|
|
|
@ -107,6 +107,13 @@
|
||||||
(is (= (dataset-format res)
|
(is (= (dataset-format res)
|
||||||
[:b {:c '(:g :m)}]))))
|
[:b {:c '(:g :m)}]))))
|
||||||
|
|
||||||
|
(deftest remove-precentage-test
  ;; Removing 75 percent of a four-instance dataset should leave exactly one
  ;; instance behind.
  (let [rows [[1 2 :g]
              [2 3 :m]
              [4 2 :m]
              [4 5 :g]]
        ds (make-dataset :test [:a :b {:c [:g :m]}] rows)]
    (is (= (-> ds
               (remove-percentage {:percentage 75})
               dataset-count)
           1))))
|
||||||
|
|
||||||
(deftest make-apply-filter-numeric-to-nominal
|
(deftest make-apply-filter-numeric-to-nominal
|
||||||
(let [ds (make-dataset :test [:a :b {:c [:g :m]}]
|
(let [ds (make-dataset :test [:a :b {:c [:g :m]}]
|
||||||
|
|
Loading…
Reference in a new issue