adds split-dataset via new remove-percentage filter wrapper

2011-12-07 10:40:41 -07:00 · 2011-12-07 10:40:41 -07:00 · 0c32c318af
commit 0c32c318af
parent b1130cf80d
5 changed files with 66 additions and 2 deletions
--- a/src/clj_ml/data.clj
+++ b/src/clj_ml/data.clj
@ -12,6 +12,7 @@
   that can be transformed using usual Clojure functions like map, reduce, etc."
  (:use [clj-ml utils]
        [clojure.contrib.seq :only [find-first]])
+  (:require [clj-ml.filters :as filters])
  (:import (weka.core Instance Instances FastVector Attribute)
           (cljml ClojureInstances)))

@ -431,3 +432,18 @@ The intention is for this to be used on data-formats and not on datasets with da
    (doto dataset
      (.deleteAttributeAt (int attr-pos))
      (.insertAttributeAt new-attr (int attr-pos)))))
+
+(defn split-dataset
+  "Splits ds in two according to percentage and returns a vector of two Delay objects.
+Dereffing the first Delay yields the part holding percentage percent of the original dataset,
+dereffing the second yields the remainder. Use do-split-dataset to get the datasets directly."
+  [ds percentage]
+  (let [opts {:percentage percentage}]
+    [(delay (filters/remove-percentage ds (assoc opts :invert true)))
+     (delay (filters/remove-percentage ds opts))]))
+
+(defn do-split-dataset
+  "Splits the dataset into two parts based on the percentage given. The same as split-dataset but
+both datasets are computed right away (doall forces the lazy map) instead of being Delay objects."
+  [ds percentage]
+  (doall (map deref (split-dataset ds percentage))))
--- a/src/clj_ml/filters.clj
+++ b/src/clj_ml/filters.clj
@ -36,7 +36,7 @@
   The previous sample of code could be rewritten with the make-apply-filter function:

     (def filtered-ds (make-apply-filter :remove-attributes {:attributes [:a :c]} ds))"
-  (:use [clj-ml data utils options-utils]
+  (:use [clj-ml utils options-utils]
        [clojure.contrib [def :only [defvar defvar-]]])
  (:require [clojure.contrib [string :as str]])
  (:import (weka.filters Filter)
@ -135,6 +135,13 @@

 (deffilter remove-attributes)

+(defmethod make-filter-options :remove-percentage
+  ([kind m] ; -P carries the :percentage value; :invert is mapped to the -V flag
+     (let [percentage-opts (check-option-values m {:percentage "-P"})]
+       (check-options m {:invert "-V"} percentage-opts))))
+
+(deffilter remove-percentage)
+
 (defmethod make-filter-options :remove-useless-attributes
  ([kind m]
     (check-option-values m {:max-variance "-M"})))
@ -170,6 +177,7 @@
   :numeric-to-nominal weka.filters.unsupervised.attribute.NumericToNominal
   :add-attribute weka.filters.unsupervised.attribute.Add
   :remove-attributes weka.filters.unsupervised.attribute.Remove
+   :remove-percentage weka.filters.unsupervised.instance.RemovePercentage
   :remove-useless-attributes weka.filters.unsupervised.attribute.RemoveUseless
   :select-append-attributes weka.filters.unsupervised.attribute.Copy
   :project-attributes weka.filters.unsupervised.attribute.Remove}
@ -188,6 +196,7 @@
     - :numeric-to-nominal
     - :add-attribute
     - :remove-attributes
+     - :remove-percentage
     - :remove-useless-attributes
     - :select-append-attributes
     - :project-attributes
--- a/src/clj_ml/options_utils.clj
+++ b/src/clj_ml/options_utils.clj
@ -6,7 +6,7 @@
 (ns #^{:author "Ben Mabey <ben@benmabey.com>"
       :skip-wiki true}
  clj-ml.options-utils
-  (:use [clj-ml data])
+  (:use     [clojure.contrib.seq :only [find-first]])
  (:require [clojure.contrib [string :as str]]))

 ;; Manipulation of array of options
@ -26,6 +26,18 @@
      (conj  (conj opts flag) (str val-in-map)))))


+;; attr-name and dataset-index-attr are copied and pasted from clj-ml.data because Clojure
+;; cannot handle circular namespace dependencies. :(
+(defn- attr-name [^weka.core.Attribute attr] ; name of the given weka attribute
+  (. attr (name)))
+
+(defn- dataset-index-attr
+  "Resolves attr to its index in the attributes definition of dataset. A number is returned
+unchanged; anything else is matched by comparing (name attr) against each attribute's name."
+  [^weka.core.Instances dataset attr]
+  (letfn [(matches? [idx] (= (name attr) (attr-name (.attribute dataset (int idx)))))]
+    (if (number? attr) attr (find-first matches? (range (.numAttributes dataset))))))
+
 (defn extract-attributes
  "Transforms the :attributes value from m into the appropriate weka flag"
  ([m] (extract-attributes "-R" m))
--- a/test/clj_ml/data_test.clj
+++ b/test/clj_ml/data_test.clj
@ -180,3 +180,23 @@
    (is (= nil (dataset-class-name ds)))
    (dataset-set-class ds :b)
    (is (= :b (dataset-class-name ds)))))
+
+(deftest split-dataset-test
+  ;; 25 percent of the four instances go to the first part, the other three to the second.
+  (let [rows [[1 :foo] [2 :bar]
+              [3 :bar] [4 :foo]]
+        ds (make-dataset "test" [:a {:b [:foo :bar]}] rows)
+        [first-part rest-part] (split-dataset ds 25)]
+    ;; split-dataset hands back Delays, so each part is dereffed before counting.
+    (is (= 1 (dataset-count @first-part)))
+    (is (= 3 (dataset-count @rest-part)))))
+
+(deftest do-split-dataset-test
+  ;; Same split as split-dataset-test, but the parts are used without dereffing.
+  (let [rows [[1 :foo] [2 :bar]
+              [3 :bar] [4 :foo]]
+        ds (make-dataset "test" [:a {:b [:foo :bar]}] rows)
+        [first-part rest-part] (do-split-dataset ds 25)]
+    ;; 25 percent of four instances is one instance; the remaining three form the second part.
+    (is (= 1 (dataset-count first-part)))
+    (is (= 3 (dataset-count rest-part)))))
--- a/test/clj_ml/filters_test.clj
+++ b/test/clj_ml/filters_test.clj
@ -107,6 +107,13 @@
    (is (= (dataset-format res)
           [:b {:c '(:g :m)}]))))

+(deftest remove-precentage-test
+  ;; Removing 75 percent of four instances should leave exactly one.
+  (let [rows [[1 2 :g] [2 3 :m]
+              [4 2 :m] [4 5 :g]]
+        ds (make-dataset :test [:a :b {:c [:g :m]}] rows)
+        remaining (remove-percentage ds {:percentage 75})]
+    (is (= 1 (dataset-count remaining)))))

 (deftest make-apply-filter-numeric-to-nominal
  (let [ds (make-dataset :test [:a :b {:c [:g :m]}]