Keep original docs ordering in docs-to-dataset.
This commit is contained in:
parent
c220b141db
commit
1e751dfae3
1 changed file with 4 additions and 1 deletion
|
@ -523,6 +523,8 @@ split immediately you can use do-split-dataset."
|
||||||
(defn docs-to-dataset
|
(defn docs-to-dataset
|
||||||
[docs vocab term keep-n datadir & opts]
|
[docs vocab term keep-n datadir & opts]
|
||||||
(let [parsed-opts (apply hash-map opts)
|
(let [parsed-opts (apply hash-map opts)
|
||||||
|
original-ordering (map :id docs)
|
||||||
|
docid-map (into {} (for [doc docs] [(:id doc) doc]))
|
||||||
docs-with-term (filter (fn [doc] (some #{term} (get-in doc [:terms vocab]))) docs)
|
docs-with-term (filter (fn [doc] (some #{term} (get-in doc [:terms vocab]))) docs)
|
||||||
docs-without-term (let [dwt (filter (fn [doc] (not-any? #{term} (get-in doc [:terms vocab]))) docs)]
|
docs-without-term (let [dwt (filter (fn [doc] (not-any? #{term} (get-in doc [:terms vocab]))) docs)]
|
||||||
(if (:resample parsed-opts)
|
(if (:resample parsed-opts)
|
||||||
|
@ -531,9 +533,10 @@ split immediately you can use do-split-dataset."
|
||||||
docs-keep-n (if keep-n (concat (take (/ keep-n 2) docs-with-term)
|
docs-keep-n (if keep-n (concat (take (/ keep-n 2) docs-with-term)
|
||||||
(take (/ keep-n 2) docs-without-term))
|
(take (/ keep-n 2) docs-without-term))
|
||||||
(concat docs-with-term docs-without-term))
|
(concat docs-with-term docs-without-term))
|
||||||
|
docs-ordered (for [docid original-ordering] (docid-map docid))
|
||||||
ds (make-dataset
|
ds (make-dataset
|
||||||
:docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
|
:docs [{:class [:no :yes]} {:title nil} {:fulltext nil}]
|
||||||
(for [doc docs-keep-n]
|
(for [doc docs-ordered]
|
||||||
(let [orig-fulltext (:fulltext doc "")
|
(let [orig-fulltext (:fulltext doc "")
|
||||||
fulltext (str/replace orig-fulltext #"\s+" " ")
|
fulltext (str/replace orig-fulltext #"\s+" " ")
|
||||||
fulltext-fixed (str/replace fulltext #"[^ \w\d]" "")
|
fulltext-fixed (str/replace fulltext #"[^ \w\d]" "")
|
||||||
|
|
Loading…
Reference in a new issue