Simplified docs-to-dataset function.
This commit is contained in:
parent
1e751dfae3
commit
be08a41f33
2 changed files with 40 additions and 30 deletions
|
@ -126,7 +126,7 @@ user> ds
|
||||||
user> (use 'clj-ml.data)
|
user> (use 'clj-ml.data)
|
||||||
nil
|
nil
|
||||||
|
|
||||||
user> (def ds (make-dataset"my-name" [:length :width {:style nil} {:kind [:good :bad]}]
|
user> (def ds (make-dataset "my-name" [:length :width {:style nil} {:kind [:good :bad]}]
|
||||||
[[12 24 "longish" :good]
|
[[12 24 "longish" :good]
|
||||||
[8 5 "shortish" :bad]]))
|
[8 5 "shortish" :bad]]))
|
||||||
#'user/ds
|
#'user/ds
|
||||||
|
@ -309,13 +309,13 @@ Text document handling:
|
||||||
```clojure
|
```clojure
|
||||||
user> (def docs [{:title "Document title 1"
|
user> (def docs [{:title "Document title 1"
|
||||||
:fulltext "This is the fulltext..."
|
:fulltext "This is the fulltext..."
|
||||||
:terms {"Topic" ["Sports"]}}
|
:has-class? false}
|
||||||
{:title "Another document title"
|
{:title "Another document title"
|
||||||
:fulltext "Some more \"fulltext\"; rabbit artificial machine bananas"
|
:fulltext "Some more \"fulltext\"; rabbit artificial machine bananas"
|
||||||
:terms {"Topic" ["Politics" "Food"]}}])
|
:has-class? true}])
|
||||||
#'user/docs
|
#'user/docs
|
||||||
|
|
||||||
user> (docs-to-dataset docs "Topic" "Sports" 1 "/tmp" :stemmer true :lowercase false)
|
user> (docs-to-dataset docs "bananas-model" "my-models" :stemmer true :lowercase false)
|
||||||
#<Instances @relation 'docs-weka.filters.unsupervised.attribute.StringToWordVector...'
|
#<Instances @relation 'docs-weka.filters.unsupervised.attribute.StringToWordVector...'
|
||||||
|
|
||||||
@attribute class {no,yes}
|
@attribute class {no,yes}
|
||||||
|
|
|
@ -512,37 +512,47 @@ split immediately you can use do-split-dataset."
|
||||||
|
|
||||||
;; text-document datasets
|
;; text-document datasets
|
||||||
|
|
||||||
(defn str-to-fname
|
|
||||||
[s]
|
|
||||||
(str/replace s #"\W" "_"))
|
|
||||||
|
|
||||||
(defn dataset-filename
|
(defn dataset-filename
|
||||||
[datadir vocab term tag]
|
[model-prefix model-dir tag]
|
||||||
(format "%s/instances/%s-%s-%s.arff" datadir (str-to-fname vocab) (str-to-fname term) (name tag)))
|
(format "%s/instances/%s-%s.arff" model-dir model-prefix (name tag)))
|
||||||
|
|
||||||
(defn docs-to-dataset
|
(defn docs-to-dataset
|
||||||
[docs vocab term keep-n datadir & opts]
|
"Docs are expected to be maps with this structure: {:id
|
||||||
|
[any], :has-class? [true/false], :title [string], :fulltext
|
||||||
|
[string]}. Of course, title or fulltext could be nil. model-prefix
|
||||||
|
is a filename prefix for saving/loading the model (necessary to
|
||||||
|
initialize the string-to-wordvec filters), and model-dir is a folder
|
||||||
|
to save/load the model.
|
||||||
|
|
||||||
|
opts are optional parameters: :keep-n [int], :lowercase
|
||||||
|
[true/false], :words-to-keep [int], :normalize [int], :transform-tf
|
||||||
|
[true/false], :transform-idf [true/false], :stemmer
|
||||||
|
[true/false], :resample [true/false], :training
|
||||||
|
[true/false], :testing [true/false].
|
||||||
|
|
||||||
|
A map is returned with structure {:dataset [the dataset], :docids
|
||||||
|
[seq of docids as ordered in dataset]}."
|
||||||
|
[docs model-prefix model-dir & opts]
|
||||||
(let [parsed-opts (apply hash-map opts)
|
(let [parsed-opts (apply hash-map opts)
|
||||||
original-ordering (map :id docs)
|
original-ordering (map :id docs)
|
||||||
docid-map (into {} (for [doc docs] [(:id doc) doc]))
|
docs-with-class (filter :has-class? doc)
|
||||||
docs-with-term (filter (fn [doc] (some #{term} (get-in doc [:terms vocab]))) docs)
|
docs-without-class (let [dwoc (filter #(not (:has-class? %)) docs)]
|
||||||
docs-without-term (let [dwt (filter (fn [doc] (not-any? #{term} (get-in doc [:terms vocab]))) docs)]
|
|
||||||
(if (:resample parsed-opts)
|
(if (:resample parsed-opts)
|
||||||
(take (count docs-with-term) dwt)
|
(take (count docs-with-class) dwoc)
|
||||||
dwt))
|
dwoc))
|
||||||
docs-keep-n (if keep-n (concat (take (/ keep-n 2) docs-with-term)
|
docs-keep-n (if (:keep-n parsed-opts)
|
||||||
(take (/ keep-n 2) docs-without-term))
|
(concat (take (/ (:keep-n parsed-opts) 2) docs-with-class)
|
||||||
(concat docs-with-term docs-without-term))
|
(take (/ (:keep-n parsed-opts) 2) docs-without-class))
|
||||||
docs-ordered (for [docid original-ordering] (docid-map docid))
|
(concat docs-with-class docs-without-class))
|
||||||
|
docs-shuffled (my-shuffle (sort-by :id docs-keep-n))
|
||||||
ds (make-dataset
|
ds (make-dataset
|
||||||
:docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
|
:docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
|
||||||
(for [doc docs-ordered]
|
(for [doc docs-shuffled]
|
||||||
(let [orig-fulltext (:fulltext doc "")
|
(let [orig-fulltext (:fulltext doc "")
|
||||||
fulltext (str/replace orig-fulltext #"\s+" " ")
|
fulltext (str/replace orig-fulltext #"\s+" " ")
|
||||||
fulltext-fixed (str/replace fulltext #"[^ \w\d]" "")
|
fulltext-fixed (str/replace fulltext #"[^ \w\d]" "")
|
||||||
title (str/replace (:title doc "") #"[^ \w\d]" "")
|
title (str/replace (:title doc "") #"[^ \w\d]" "")]
|
||||||
has-tid? (some #{term} (get-in doc [:terms vocab]))]
|
[(if (:has-class? doc) :yes :no) title fulltext-fixed])))
|
||||||
[(if has-tid? :yes :no) title fulltext-fixed])))
|
|
||||||
ds-title (let [f (filters/make-filter
|
ds-title (let [f (filters/make-filter
|
||||||
:string-to-word-vector
|
:string-to-word-vector
|
||||||
{:dataset-format ds
|
{:dataset-format ds
|
||||||
|
@ -557,7 +567,7 @@ split immediately you can use do-split-dataset."
|
||||||
"weka.core.stemmers.SnowballStemmer -S English")})]
|
"weka.core.stemmers.SnowballStemmer -S English")})]
|
||||||
;; if testing, initialize the filter with the training instances
|
;; if testing, initialize the filter with the training instances
|
||||||
(when (:testing parsed-opts)
|
(when (:testing parsed-opts)
|
||||||
(let [ds-file (file (dataset-filename datadir vocab term :orig))]
|
(let [ds-file (file (dataset-filename model-prefix model-dir :orig))]
|
||||||
(filters/filter-apply f (load-instances :arff ds-file))))
|
(filters/filter-apply f (load-instances :arff ds-file))))
|
||||||
(filters/filter-apply f ds))
|
(filters/filter-apply f ds))
|
||||||
ds-title-fulltext (let [f (filters/make-filter
|
ds-title-fulltext (let [f (filters/make-filter
|
||||||
|
@ -574,13 +584,13 @@ split immediately you can use do-split-dataset."
|
||||||
"weka.core.stemmers.SnowballStemmer -S English")})]
|
"weka.core.stemmers.SnowballStemmer -S English")})]
|
||||||
;; if testing, initialize the filter with the training instances
|
;; if testing, initialize the filter with the training instances
|
||||||
(when (:testing parsed-opts)
|
(when (:testing parsed-opts)
|
||||||
(let [ds-file (file (dataset-filename datadir vocab term :title))]
|
(let [ds-file (file (dataset-filename model-prefix model-dir :title))]
|
||||||
(filters/filter-apply f (load-instances :arff ds-file))))
|
(filters/filter-apply f (load-instances :arff ds-file))))
|
||||||
(filters/filter-apply f ds-title))
|
(filters/filter-apply f ds-title))
|
||||||
ds-class (dataset-set-class ds-title-fulltext 0)]
|
ds-class (dataset-set-class ds-title-fulltext 0)]
|
||||||
;; if training, save unfiltered instances to re-initialize filter later
|
;; if training, save unfiltered instances to re-initialize filter later
|
||||||
(when (:training parsed-opts)
|
(when (:training parsed-opts)
|
||||||
(save-instances :arff (file (dataset-filename datadir vocab term :orig)) ds)
|
(save-instances :arff (file (dataset-filename model-prefix model-dir :orig)) ds)
|
||||||
(save-instances :arff (file (dataset-filename datadir vocab term :title)) ds-title))
|
(save-instances :arff (file (dataset-filename model-prefix model-dir :title)) ds-title))
|
||||||
ds-class))
|
{:dataset ds-class :docids (map :id docs-shuffled)}))
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue