From 10310d74e8622b484b47686bbc119298c4851440 Mon Sep 17 00:00:00 2001
From: Joshua Eckroth <eckroth@cse.ohio-state.edu>
Date: Tue, 6 Aug 2013 03:42:20 -0400
Subject: [PATCH] Simpler usage for docs-to-dataset.

---
 src/clj_ml/data.clj | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/clj_ml/data.clj b/src/clj_ml/data.clj
index 6014b7a..94926df 100644
--- a/src/clj_ml/data.clj
+++ b/src/clj_ml/data.clj
@@ -513,16 +513,18 @@ split immediately you can use do-split-dataset."
 
 ;; text-document datasets
 
+(defn str-to-fname
+  [s] ; any value: coerced with `str` first, so keywords, numbers and nil do not throw
+  (str/replace (str s) #"\W" "_")) ; every \W (non-word) character becomes an underscore
+
 (defn dataset-filename
-  [datadir vocab-id term-id tag]
-  (format "%s/instances/%s-%d-%s.arff" datadir vocab-id term-id (name tag)))
+  [datadir vocab term tag] ; builds <datadir>/instances/<vocab>-<term>-<tag>.arff, with vocab and term passed through str-to-fname
+  (format "%s/instances/%s-%s-%s.arff" datadir (str-to-fname vocab) (str-to-fname term) (name tag)))
 
 (defn docs-to-dataset
-  [docs vocab-id vocab-name term-id term-name keep-n datadir & opts]
+  [docs vocab term keep-n datadir & opts]
   (let [parsed-opts (apply hash-map opts)
-        docs-with-term (filter (fn [doc] (some #(= term-name %)
-                                               (get-in doc [:terms vocab-name])))
-                               docs)
+        docs-with-term (filter (fn [doc] (some #{term} (get-in doc [:terms vocab]))) docs)
         docs-without-term (let [dwt (set/difference (set docs) (set docs-with-term))]
                             (if (:resample parsed-opts)
                               (take (count docs-with-term) dwt)
@@ -533,11 +535,11 @@ split immediately you can use do-split-dataset."
         ds (make-dataset
             :docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
             (for [doc docs-keep-n]
-              (let [fulltext (str/replace (.text (Jsoup/parse (or (:fulltext doc) (:extracted doc) "")))
-                                          #"\s+" " ")
+              (let [orig-fulltext (.text (Jsoup/parse (or (:fulltext doc) (:extracted doc) "")))
+                    fulltext (str/replace orig-fulltext #"\s+" " ")
                     fulltext-fixed (str/replace fulltext #"[^ \w\d]" "")
                     title (str/replace (:title doc "") #"[^ \w\d]" "")
-                    has-tid? (some #(= term-name %) (get-in doc [:terms vocab-name]))]
+                    has-tid? (some #{term} (get-in doc [:terms vocab]))]
                 [(if has-tid? :yes :no) title fulltext-fixed])))
         ds-title (let [f (filters/make-filter
                           :string-to-word-vector
@@ -553,7 +555,8 @@ split immediately you can use do-split-dataset."
                                       "weka.core.stemmers.SnowballStemmer -S English")})]
                    ;; if testing, initialize the filter with the training instances
                    (when (:testing parsed-opts)
-                     (filters/filter-apply f (load-instances :arff (file (dataset-filename datadir vocab-id term-id :orig)))))
+                     (let [ds-file (file (dataset-filename datadir vocab term :orig))]
+                       (filters/filter-apply f (load-instances :arff ds-file))))
                    (filters/filter-apply f ds))
         ds-title-fulltext (let [f (filters/make-filter
                                    :string-to-word-vector
@@ -569,12 +572,13 @@ split immediately you can use do-split-dataset."
                                                "weka.core.stemmers.SnowballStemmer -S English")})]
                             ;; if testing, initialize the filter with the training instances
                             (when (:testing parsed-opts)
-                              (filters/filter-apply f (load-instances :arff (file (dataset-filename datadir vocab-id term-id :title)))))
+                              (let [ds-file (file (dataset-filename datadir vocab term :title))]
+                                (filters/filter-apply f (load-instances :arff ds-file))))
                             (filters/filter-apply f ds-title))
         ds-class (dataset-set-class ds-title-fulltext 0)]
     ;; if training, save unfiltered instances to re-initialize filter later
     (when (:training parsed-opts)
-      (save-instances :arff (file (dataset-filename datadir vocab-id term-id :orig)) ds)
-      (save-instances :arff (file (dataset-filename datadir vocab-id term-id :title)) ds-title))
+      (save-instances :arff (file (dataset-filename datadir vocab term :orig)) ds)
+      (save-instances :arff (file (dataset-filename datadir vocab term :title)) ds-title))
     ds-class))