Fixed some bugs in new docs-to-dataset, added test.

2013-12-24 08:21:17 -05:00 · 2013-12-24 08:21:17 -05:00 · 19ce704093
commit 19ce704093
parent 05b59b0cfe
3 changed files with 25 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -307,12 +307,14 @@ user> (classifier-classify classifier2 instance)
 Text document handling:
 ```clojure
-user> (def docs [{:title "Document title 1"
+user> (def docs [{:id 10
                  :title "Document title 1"
                  :fulltext "This is the fulltext..."
-                  :has-term? false}
+                  :has-class? false}
-                 {:title "Another document title"
+                 {:id 11
                  :title "Another document title"
                  :fulltext "Some more \"fulltext\"; rabbit artificial machine bananas"
-                  :has-term? true}])
+                  :has-class? true}])
 #'user/docs
 user> (docs-to-dataset docs "bananas-model" "my-models" :stemmer true :lowercase false)
--- a/src/clj_ml/data.clj
+++ b/src/clj_ml/data.clj
@@ -535,7 +535,7 @@ split immediately you can use do-split-dataset."
  [docs model-prefix model-dir & opts]
  (let [parsed-opts (apply hash-map opts)
        original-ordering (map :id docs)
-        docs-with-class (filter :has-class? doc)
+        docs-with-class (filter :has-class? docs)
        docs-without-class (let [dwoc (filter #(not (:has-class? %)) docs)]
                             (if (:resample parsed-opts)
                               (take (count docs-with-class) dwoc)
@@ -544,7 +544,7 @@ split immediately you can use do-split-dataset."
                      (concat (take (/ (:keep-n parsed-opts) 2) docs-with-class)
                              (take (/ (:keep-n parsed-opts) 2) docs-without-class))
                      (concat docs-with-class docs-without-class))
-        docs-shuffled (my-shuffle (sort-by :id docs-keep-n))
+        docs-shuffled (shuffle (sort-by :id docs-keep-n))
        ds (make-dataset
            :docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
            (for [doc docs-shuffled]
--- a/test/clj_ml/data_test.clj
+++ b/test/clj_ml/data_test.clj
@@ -229,3 +229,20 @@
                          [3 :bar]
                          [4 :foo]])]
    (is (= (dataset-as-vecs (take-dataset ds 2)) [[1.0 :foo] [2.0 :bar]]))))
 ;; Exercises docs-to-dataset on a two-document corpus: one document flagged
 ;; with :has-class? false (id 10) and one flagged true (id 11). The call
 ;; passes "bananas-model" as the model prefix and "." as the model directory,
 ;; with the :stemmer and :lowercase options set.
 ;; NOTE(review): passing "." as the model directory presumably makes the
 ;; function write model files into the current working directory -- confirm
 ;; and consider a temporary directory.
 (deftest docs-to-dataset-test
  (let [docs [{:id 10
               :title "Document title 1"
               :fulltext "This is the fulltext..."
               :has-class? false}
              {:id 11
               :title "Another document title"
               :fulltext "Some more \"fulltext\"; rabbit artificial machine bananas"
               :has-class? true}]
        ;; The result is destructured as a map with :dataset and :docids keys.
        {:keys [dataset docids]} (docs-to-dataset docs "bananas-model" "."
                                                  :stemmer true :lowercase false)
        ;; Pair each returned doc id with the dataset row at the same position,
        ;; so the assertions below do not depend on the row order (the
        ;; documents are shuffled inside docs-to-dataset).
        docid-ds-vecs (apply hash-map (interleave docids (dataset-as-vecs dataset)))]
    ;; The first element of each row is the class label: :no for the document
    ;; with :has-class? false, :yes for the one with :has-class? true.
    ;; NOTE(review): the remaining numbers are presumably per-term weights
    ;; produced by the text filter -- confirm. They are compared with exact
    ;; floating-point equality, so any change in the weighting breaks the test.
    (is (= [:no 0.4804530139182014 0.0 0.4804530139182014 0.0]
           (get docid-ds-vecs 10)))
    (is (= [:yes 0.0 0.4804530139182014 0.0 0.4804530139182014 0.0 0.4804530139182014 0.0 0.4804530139182014]
           (get docid-ds-vecs 11)))))