Simpler usage for docs-to-dataset.

This commit is contained in:
Joshua Eckroth 2013-08-06 03:42:20 -04:00
parent 59f4cf3697
commit 10310d74e8

View file

@ -513,16 +513,18 @@ split immediately you can use do-split-dataset."
;; text-document datasets ;; text-document datasets
(defn str-to-fname
  "Returns the string `s` with every character matched by the regex \\W
  replaced by an underscore, so the result can be embedded in a file name."
  [s]
  (let [non-word-char #"\W"
        placeholder   "_"]
    (str/replace s non-word-char placeholder)))
;; dataset-filename: builds an ARFF path of the shape
;;   <datadir>/instances/<vocab>-<term>-<tag>.arff
;; NOTE(review): each row below carries two renderings of the definition; this
;; looks like a side-by-side diff (pre-change left, post-change right) -- confirm.
;;   Left form:  parameters vocab-id / term-id; term-id is formatted with %d.
;;   Right form: parameters vocab / term; both are passed through str-to-fname
;;               and formatted with %s.
;; Both forms apply (name tag) to the tag before formatting it.
(defn dataset-filename (defn dataset-filename
[datadir vocab-id term-id tag] [datadir vocab term tag]
(format "%s/instances/%s-%d-%s.arff" datadir vocab-id term-id (name tag))) (format "%s/instances/%s-%s-%s.arff" datadir (str-to-fname vocab) (str-to-fname term) (name tag)))
(defn docs-to-dataset (defn docs-to-dataset
[docs vocab-id vocab-name term-id term-name keep-n datadir & opts] [docs vocab term keep-n datadir & opts]
(let [parsed-opts (apply hash-map opts) (let [parsed-opts (apply hash-map opts)
docs-with-term (filter (fn [doc] (some #(= term-name %) docs-with-term (filter (fn [doc] (some #{term} (get-in doc [:terms vocab]))) docs)
(get-in doc [:terms vocab-name])))
docs)
docs-without-term (let [dwt (set/difference (set docs) (set docs-with-term))] docs-without-term (let [dwt (set/difference (set docs) (set docs-with-term))]
(if (:resample parsed-opts) (if (:resample parsed-opts)
(take (count docs-with-term) dwt) (take (count docs-with-term) dwt)
@ -533,11 +535,11 @@ split immediately you can use do-split-dataset."
ds (make-dataset ds (make-dataset
:docs [{:class [:no :yes]} {:title nil} {:fulltext nil}] :docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
(for [doc docs-keep-n] (for [doc docs-keep-n]
(let [fulltext (str/replace (.text (Jsoup/parse (or (:fulltext doc) (:extracted doc) ""))) (let [orig-fulltext (.text (Jsoup/parse (or (:fulltext doc) (:extracted doc) "")))
#"\s+" " ") fulltext (str/replace orig-fulltext #"\s+" " ")
fulltext-fixed (str/replace fulltext #"[^ \w\d]" "") fulltext-fixed (str/replace fulltext #"[^ \w\d]" "")
title (str/replace (:title doc "") #"[^ \w\d]" "") title (str/replace (:title doc "") #"[^ \w\d]" "")
has-tid? (some #(= term-name %) (get-in doc [:terms vocab-name]))] has-tid? (some #{term} (get-in doc [:terms vocab]))]
[(if has-tid? :yes :no) title fulltext-fixed]))) [(if has-tid? :yes :no) title fulltext-fixed])))
ds-title (let [f (filters/make-filter ds-title (let [f (filters/make-filter
:string-to-word-vector :string-to-word-vector
@ -553,7 +555,8 @@ split immediately you can use do-split-dataset."
"weka.core.stemmers.SnowballStemmer -S English")})] "weka.core.stemmers.SnowballStemmer -S English")})]
;; if testing, initialize the filter with the training instances ;; if testing, initialize the filter with the training instances
(when (:testing parsed-opts) (when (:testing parsed-opts)
(filters/filter-apply f (load-instances :arff (file (dataset-filename datadir vocab-id term-id :orig))))) (let [ds-file (file (dataset-filename datadir vocab term :orig))]
(filters/filter-apply f (load-instances :arff ds-file))))
(filters/filter-apply f ds)) (filters/filter-apply f ds))
ds-title-fulltext (let [f (filters/make-filter ds-title-fulltext (let [f (filters/make-filter
:string-to-word-vector :string-to-word-vector
@ -569,12 +572,13 @@ split immediately you can use do-split-dataset."
"weka.core.stemmers.SnowballStemmer -S English")})] "weka.core.stemmers.SnowballStemmer -S English")})]
;; if testing, initialize the filter with the training instances ;; if testing, initialize the filter with the training instances
(when (:testing parsed-opts) (when (:testing parsed-opts)
(filters/filter-apply f (load-instances :arff (file (dataset-filename datadir vocab-id term-id :title))))) (let [ds-file (file (dataset-filename datadir vocab term :title))]
(filters/filter-apply f (load-instances :arff ds-file))))
(filters/filter-apply f ds-title)) (filters/filter-apply f ds-title))
ds-class (dataset-set-class ds-title-fulltext 0)] ds-class (dataset-set-class ds-title-fulltext 0)]
;; if training, save unfiltered instances to re-initialize filter later ;; if training, save unfiltered instances to re-initialize filter later
(when (:training parsed-opts) (when (:training parsed-opts)
(save-instances :arff (file (dataset-filename datadir vocab-id term-id :orig)) ds) (save-instances :arff (file (dataset-filename datadir vocab term :orig)) ds)
(save-instances :arff (file (dataset-filename datadir vocab-id term-id :title)) ds-title)) (save-instances :arff (file (dataset-filename datadir vocab term :title)) ds-title))
ds-class)) ds-class))