adds split-dataset via new remove-percentage filter wrapper

2011-12-07 10:40:41 -07:00 · 2011-12-07 10:40:41 -07:00 · 0c32c318af
commit 0c32c318af
parent b1130cf80d
5 changed files with 66 additions and 2 deletions
--- a/src/clj_ml/data.clj
+++ b/src/clj_ml/data.clj
@ -12,6 +12,7 @@
   that can be transformed using usual Clojure functions like map, reduce, etc."
  (:use [clj-ml utils]
        [clojure.contrib.seq :only [find-first]])
  (:require [clj-ml.filters :as filters])
  (:import (weka.core Instance Instances FastVector Attribute)
           (cljml ClojureInstances)))
@ -431,3 +432,18 @@ The intention is for this to be used on data-formats and not on datasets with da
    (doto dataset
      (.deleteAttributeAt (int attr-pos))
      (.insertAttributeAt new-attr (int attr-pos)))))
 (defn split-dataset
  "Partitions ds into two parts according to percentage.
 Returns a two-element vector of Delay objects. Dereffing the first one yields the
 'percentage' share of ds (remove-percentage applied with :invert true); dereffing the
 second one yields the remaining portion. No filtering happens until a Delay is
 dereffed. Use do-split-dataset when you want the datasets rather than Delays."
  [ds percentage]
  (let [opts {:percentage percentage}]
    [(delay (filters/remove-percentage ds (assoc opts :invert true)))
     (delay (filters/remove-percentage ds opts))]))
 (defn do-split-dataset
  "Eagerly splits ds into two parts based on the percentage given.
 Works like split-dataset, but both Delay objects are dereffed before this function
 returns, so the result is a fully realized sequence of two datasets: the 'percentage'
 share first, the remaining portion second."
  [ds percentage]
  ;; map on its own is lazy: without doall the derefs (and therefore the filtering)
  ;; would be postponed until the caller first walks the result, by which time the
  ;; mutable ds may already have been changed.
  (doall (map deref (split-dataset ds percentage))))
--- a/src/clj_ml/filters.clj
+++ b/src/clj_ml/filters.clj
@ -36,7 +36,7 @@
   The previous sample of code could be rewritten with the make-apply-filter function:
     (def filtered-ds (make-apply-filter :remove-attributes {:attributes [:a :c]} ds))"
-  (:use [clj-ml data utils options-utils]
+  (:use [clj-ml utils options-utils]
        [clojure.contrib [def :only [defvar defvar-]]])
  (:require [clojure.contrib [string :as str]])
  (:import (weka.filters Filter)
@ -135,6 +135,13 @@
 (deffilter remove-attributes)
 (defmethod make-filter-options :remove-percentage
  [kind m]
  ;; Build the options for :percentage (flag "-P") first, then hand that result to
  ;; check-options as its trailing argument so the :invert flag ("-V") is handled on
  ;; top of it.
  (let [percentage-opts (check-option-values m {:percentage "-P"})]
    (check-options m {:invert "-V"} percentage-opts)))
 ;; NOTE(review): presumably generates the `remove-percentage` function that is called
 ;; as (remove-percentage ds opts) elsewhere -- confirm against the deffilter macro.
 (deffilter remove-percentage)
 (defmethod make-filter-options :remove-useless-attributes
  [kind m]
  ;; The only option looked up in m is :max-variance, which is paired with the "-M" flag.
  (check-option-values m {:max-variance "-M"}))
@ -170,6 +177,7 @@
   :numeric-to-nominal weka.filters.unsupervised.attribute.NumericToNominal
   :add-attribute weka.filters.unsupervised.attribute.Add
   :remove-attributes weka.filters.unsupervised.attribute.Remove
   :remove-percentage weka.filters.unsupervised.instance.RemovePercentage
   :remove-useless-attributes weka.filters.unsupervised.attribute.RemoveUseless
   :select-append-attributes weka.filters.unsupervised.attribute.Copy
   :project-attributes weka.filters.unsupervised.attribute.Remove}
@ -188,6 +196,7 @@
     - :numeric-to-nominal
     - :add-attribute
     - :remove-attributes
     - :remove-percentage
     - :remove-useless-attributes
     - :select-append-attributes
     - :project-attributes
--- a/src/clj_ml/options_utils.clj
+++ b/src/clj_ml/options_utils.clj
@ -6,7 +6,7 @@
 (ns #^{:author "Ben Mabey <ben@benmabey.com>"
       :skip-wiki true}
  clj-ml.options-utils
-  (:use [clj-ml data])
+  (:use     [clojure.contrib.seq :only [find-first]])
  (:require [clojure.contrib [string :as str]]))
 ;; Manipulation of array of options
@ -26,6 +26,18 @@
      (conj  (conj opts flag) (str val-in-map)))))
 ;; attr-name and dataset-index-attr are copied and pasted from clj-ml.data because
 ;; Clojure cannot handle circular dependencies between namespaces. :(
 (defn- attr-name
  "Returns the result of calling name on the given weka Attribute."
  [^weka.core.Attribute attr]
  (. attr name))
 (defn- dataset-index-attr
  "Resolves attr to the index of an attribute in the attributes definition of dataset.
 A number is returned unchanged. Anything else is compared, via name, against the
 name of every attribute of the dataset, and the result of find-first over the
 attribute indices is returned."
  [^weka.core.Instances dataset attr]
  (if-not (number? attr)
    (let [same-name? (fn [idx]
                       (= (name attr)
                          (attr-name (.attribute dataset (int idx)))))
          indices    (range (.numAttributes dataset))]
      (find-first same-name? indices))
    attr))
 (defn extract-attributes
  "Transforms the :attributes value from m into the appropriate weka flag"
  ([m] (extract-attributes "-R" m))
--- a/test/clj_ml/data_test.clj
+++ b/test/clj_ml/data_test.clj
@ -180,3 +180,23 @@
    (is (= nil (dataset-class-name ds)))
    (dataset-set-class ds :b)
    (is (= :b (dataset-class-name ds)))))
 (deftest split-dataset-test
  ;; A 25% split of a four-row dataset should give delays holding 1 and 3 rows.
  (let [rows [[1 :foo] [2 :bar] [3 :bar] [4 :foo]]
        ds   (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        [selected remainder] (split-dataset ds 25)]
    (is (= (dataset-count @selected) 1))
    (is (= (dataset-count @remainder) 3))))
 (deftest do-split-dataset-test
  ;; Same expectation as split-dataset-test, but the parts come back without needing a deref.
  (let [rows [[1 :foo] [2 :bar] [3 :bar] [4 :foo]]
        ds   (make-dataset "test" [:a {:b [:foo :bar]}] rows)
        [selected remainder] (do-split-dataset ds 25)]
    (is (= (dataset-count selected) 1))
    (is (= (dataset-count remainder) 3))))
--- a/test/clj_ml/filters_test.clj
+++ b/test/clj_ml/filters_test.clj
@ -107,6 +107,13 @@
    (is (= (dataset-format res)
           [:b {:c '(:g :m)}]))))
 (deftest remove-precentage-test
  ;; Removing 75% of a four-row dataset is expected to leave exactly one row.
  ;; NOTE(review): the test name misspells "percentage"; kept as-is so the var name is unchanged.
  (let [rows     [[1 2 :g] [2 3 :m] [4 2 :m] [4 5 :g]]
        ds       (make-dataset :test [:a :b {:c [:g :m]}] rows)
        filtered (remove-percentage ds {:percentage 75})]
    (is (= (dataset-count filtered) 1))))
 (deftest make-apply-filter-numeric-to-nominal
  (let [ds (make-dataset :test [:a :b {:c [:g :m]}]