Don't limit the size of fulltext in docs-to-dataset
This commit is contained in:
parent
3ead98c527
commit
1105dac7b8
1 changed file with 2 additions and 3 deletions
|
@@ -523,11 +523,10 @@ split immediately you can use do-split-dataset."
|
||||||
(for [doc docs-keep-n]
|
(for [doc docs-keep-n]
|
||||||
(let [fulltext (str/replace (.text (Jsoup/parse (or (:fulltext doc) (:extracted doc) "")))
|
(let [fulltext (str/replace (.text (Jsoup/parse (or (:fulltext doc) (:extracted doc) "")))
|
||||||
#"\s+" " ")
|
#"\s+" " ")
|
||||||
fulltext-sub (str/replace (subs fulltext 0 (min (count fulltext) 10000))
|
fulltext-fixed (str/replace fulltext #"[^ \w\d]" "")
|
||||||
#"[^ \w\d]" "")
|
|
||||||
title (str/replace (:title doc "") #"[^ \w\d]" "")
|
title (str/replace (:title doc "") #"[^ \w\d]" "")
|
||||||
has-tid? (some #(= term-name %) (get-in doc [:terms vocab-name]))]
|
has-tid? (some #(= term-name %) (get-in doc [:terms vocab-name]))]
|
||||||
[(if has-tid? :yes :no) title fulltext-sub])))
|
[(if has-tid? :yes :no) title fulltext-fixed])))
|
||||||
ds-title (let [f (filters/make-filter
|
ds-title (let [f (filters/make-filter
|
||||||
:string-to-word-vector
|
:string-to-word-vector
|
||||||
{:dataset-format ds
|
{:dataset-format ds
|
||||||
|
|
Loading…
Reference in a new issue