Simplified docs-to-dataset function.

2013-12-24 08:00:15 -05:00 · 2013-12-24 08:00:15 -05:00 · be08a41f33
commit be08a41f33
parent 1e751dfae3
2 changed files with 40 additions and 30 deletions
--- a/README.md
+++ b/README.md
@ -126,7 +126,7 @@ user> ds
 user> (use 'clj-ml.data)
 nil

-user> (def ds (make-dataset"my-name" [:length :width {:style nil} {:kind [:good :bad]}]
+user> (def ds (make-dataset "my-name" [:length :width {:style nil} {:kind [:good :bad]}]
                            [[12 24 "longish" :good]
                             [8 5 "shortish" :bad]]))
 #'user/ds
@ -309,13 +309,13 @@ Text document handling:
 ```clojure
 user> (def docs [{:title "Document title 1"
                  :fulltext "This is the fulltext..."
-                  :terms {"Topic" ["Sports"]}}
+                  :has-term? false}
                 {:title "Another document title"
                  :fulltext "Some more \"fulltext\"; rabbit artificial machine bananas"
-                  :terms {"Topic" ["Politics" "Food"]}}])
+                  :has-term? true}])
 #'user/docs

-user> (docs-to-dataset docs "Topic" "Sports" 1 "/tmp" :stemmer true :lowercase false)
+user> (docs-to-dataset docs "bananas-model" "my-models" :stemmer true :lowercase false)
 #<Instances @relation 'docs-weka.filters.unsupervised.attribute.StringToWordVector...'

@attribute class {no,yes}
--- a/src/clj_ml/data.clj
+++ b/src/clj_ml/data.clj
@ -512,37 +512,47 @@ split immediately you can use do-split-dataset."

 ;; text-document datasets

-(defn str-to-fname
-  [s]
-  (str/replace s #"\W" "_"))
-
 (defn dataset-filename
-  [datadir vocab term tag]
-  (format "%s/instances/%s-%s-%s.arff" datadir (str-to-fname vocab) (str-to-fname term) (name tag)))
+  [model-prefix model-dir tag]
+  (format "%s/instances/%s-%s.arff" model-dir model-prefix (name tag)))

 (defn docs-to-dataset
-  [docs vocab term keep-n datadir & opts]
+  "Docs are expected to be maps with this structure: {:id
+  [any], :has-class? [true/false], :title [string], :fulltext
+  [string]}. Of course, title or fulltext could be nil. model-prefix
+  is a filename prefix to saving/loading the model (necessary to
+  initialize the string-to-wordvec filters), and model-dir is a folder
+  to save/load the model.
+
+  opts are optional parameters: :keep-n [int], :lowercase
+  [true/false], :words-to-keep [int], :normalize [int], :transform-tf
+  [true/false], :transform-idf [true/false], :stemmer
+  [true/false], :resample [true/false], :training
+  [true/false], :testing [true/false].
+
+  A map is returned with structure {:dataset [the dataset], :docids
+  [seq of docids as ordered in dataset]}."
+  [docs model-prefix model-dir & opts]
  (let [parsed-opts (apply hash-map opts)
        original-ordering (map :id docs)
-        docid-map (into {} (for [doc docs] [(:id doc) doc]))
-        docs-with-term (filter (fn [doc] (some #{term} (get-in doc [:terms vocab]))) docs)
-        docs-without-term (let [dwt (filter (fn [doc] (not-any? #{term} (get-in doc [:terms vocab]))) docs)]
-                            (if (:resample parsed-opts)
-                              (take (count docs-with-term) dwt)
-                              dwt))
-        docs-keep-n (if keep-n (concat (take (/ keep-n 2) docs-with-term)
-                                       (take (/ keep-n 2) docs-without-term))
-                        (concat docs-with-term docs-without-term))
-        docs-ordered (for [docid original-ordering] (docid-map docid))
+        docs-with-class (filter :has-class? doc)
+        docs-without-class (let [dwoc (filter #(not (:has-class? %)) docs)]
+                             (if (:resample parsed-opts)
+                               (take (count docs-with-class) dwoc)
+                               dwoc))
+        docs-keep-n (if (:keep-n parsed-opts)
+                      (concat (take (/ (:keep-n parsed-opts) 2) docs-with-class)
+                              (take (/ (:keep-n parsed-opts) 2) docs-without-class))
+                      (concat docs-with-class docs-without-class))
+        docs-shuffled (my-shuffle (sort-by :id docs-keep-n))
        ds (make-dataset
            :docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
-            (for [doc docs-ordered]
+            (for [doc docs-shuffled]
              (let [orig-fulltext (:fulltext doc "")
                    fulltext (str/replace orig-fulltext #"\s+" " ")
                    fulltext-fixed (str/replace fulltext #"[^ \w\d]" "")
-                    title (str/replace (:title doc "") #"[^ \w\d]" "")
-                    has-tid? (some #{term} (get-in doc [:terms vocab]))]
-                [(if has-tid? :yes :no) title fulltext-fixed])))
+                    title (str/replace (:title doc "") #"[^ \w\d]" "")]
+                [(if (:has-class? doc) :yes :no) title fulltext-fixed])))
        ds-title (let [f (filters/make-filter
                          :string-to-word-vector
                          {:dataset-format ds
@ -557,7 +567,7 @@ split immediately you can use do-split-dataset."
                                      "weka.core.stemmers.SnowballStemmer -S English")})]
                   ;; if testing, initialize the filter with the training instances
                   (when (:testing parsed-opts)
-                     (let [ds-file (file (dataset-filename datadir vocab term :orig))]
+                     (let [ds-file (file (dataset-filename model-prefix model-dir :orig))]
                       (filters/filter-apply f (load-instances :arff ds-file))))
                   (filters/filter-apply f ds))
        ds-title-fulltext (let [f (filters/make-filter
@ -574,13 +584,13 @@ split immediately you can use do-split-dataset."
                                               "weka.core.stemmers.SnowballStemmer -S English")})]
                            ;; if testing, initialize the filter with the training instances
                            (when (:testing parsed-opts)
-                              (let [ds-file (file (dataset-filename datadir vocab term :title))]
+                              (let [ds-file (file (dataset-filename model-prefix model-dir :title))]
                                (filters/filter-apply f (load-instances :arff ds-file))))
                            (filters/filter-apply f ds-title))
        ds-class (dataset-set-class ds-title-fulltext 0)]
    ;; if training, save unfiltered instances to re-initialize filter later
    (when (:training parsed-opts)
-      (save-instances :arff (file (dataset-filename datadir vocab term :orig)) ds)
-      (save-instances :arff (file (dataset-filename datadir vocab term :title)) ds-title))
-    ds-class))
+      (save-instances :arff (file (dataset-filename model-prefix model-dir :orig)) ds)
+      (save-instances :arff (file (dataset-filename model-prefix model-dir :title)) ds-title))
+    {:dataset ds-class :docids (map :id docs-shuffled)}))