Skip to content

Commit 7d1d901

Browse files
committed
Search indexing improvements
This is an attempt to make it easier to find related projects when they are split across old, non-verifiable groups and new, verified (domain-based) groups, e.g. `lein-ring` and `org.lein-ring`.
1 parent a9fbddd commit 7d1d901

File tree

2 files changed

+32
-13
lines changed

2 files changed

+32
-13
lines changed

src/clojars/search.clj

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -124,27 +124,44 @@
124124
(DirectoryReader/open index))
125125

126126
(defn- hyphen-remover
127-
"Replaces hyphens. This is used to expand a token into its components for use in
128-
the full content string. Should be used alongside the unexpanded token so it
129-
is still searchable."
130-
[kw]
127+
"Replaces hyphens with spaces. This is used to expand a token into its
128+
components for use in the full content string. Should be used alongside the
129+
unexpanded token so it is still searchable."
130+
[f]
131131
(fn [m]
132-
(when-some [v (get m kw)]
132+
(when-some [v (f m)]
133133
(str/replace v #"[-]" " "))))
134134

135135
(defn- period-remover
136+
"Replaces periods with spaces. This is used to expand a token into its
137+
components for use in the full content string. Should be used alongside the
138+
unexpanded token so it is still searchable."
139+
[f]
140+
(fn [m]
141+
(when-some [v (f m)]
142+
(str/replace v #"[.]" " "))))
143+
144+
(defn- sentence-period-remover
136145
"Removes periods at the end of sentences since the whitespace tokenizer won't."
137-
[kw]
146+
[f]
138147
(fn [m]
139-
(when-some [v (get m kw)]
148+
(when-some [v (f m)]
140149
(str/replace v #"\.(\s|$)" " "))))
141150

142-
(def ^:private content-fields
151+
(def ^:private content-items
143152
[:artifact-id
144153
(hyphen-remover :artifact-id)
145154
:group-id
146155
(hyphen-remover :group-id)
147-
(period-remover :description)
156+
;; Include 'group name' & 'group name/artifact-name' in content (for a
157+
;; group-id of group.name) to aid in searching for things where new projects
158+
;; had to be deployed under a domain-based group
159+
(period-remover :group-id)
160+
(period-remover #(->> % ((juxt :group-id :artifact-id)) (str/join "/")))
161+
;; Include 'group-name/artifact-name' in content to allow
162+
;; the "group-name/artifact-name" phrase to find it
163+
#(->> % ((juxt :group-id :artifact-id)) (str/join "/"))
164+
(sentence-period-remover :description)
148165
:url
149166
:version
150167
#(->> % :authors (str/join " "))])
@@ -191,7 +208,7 @@
191208
(.add (string-field "version" version)))
192209
;; content field containing all values to use as the default search field
193210
(.add (text-field content-field-name
194-
(str/join " " ((apply juxt content-fields) jar))))
211+
(str/join " " ((apply juxt content-items) jar))))
195212
;; adds a boost field based on the ratio of downloads of the jar to the
196213
;; total number of downloads. This is then applied to the query below.
197214
(.add (DoubleDocValuesField. boost-field-name download-boost))))

test/clojars/unit/search_test.clj

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,16 +83,18 @@
8383
(is (match? [at-at] (search/search lc "at-at" 1))))))
8484

8585
(deftest search-by-group+artifact-id-as-single-term
86-
(let [at-at (assoc at-at :description "similar to lein-ring/at-at.")]
86+
(let [lein-ring (assoc lein-ring :group-id "org.lein-ring")
87+
at-at (assoc at-at :description "similar to lein-ring/at-at.")]
8788
(with-lucene-search-component [lein-ring
8889
at-at
8990
c]
9091
(is (match? [lein-ring] (search/search lc "lein-ring/lein-ring" 1)))
92+
(is (match? [lein-ring] (search/search lc "org.lein-ring/lein-ring" 1)))
93+
(is (match? [lein-ring] (search/search lc "\"lein-ring/lein-ring\"" 1)))
9194
(is (match? [at-at] (search/search lc "at-at/at-at" 1)))
9295
(is (match? [at-at] (search/search lc "lein-ring/at-at" 1)))
9396
(is (empty? (search/search lc "lein-ring/nope" 1)))
94-
(is (empty? (search/search lc "nope/lein-ring" 1)))
95-
(is (empty? (search/search lc "\"lein-ring/lein-ring\"" 1))))))
97+
(is (empty? (search/search lc "nope/lein-ring" 1))))))
9698

9799
(deftest search-by-description
98100
(let [lein-ring (merge lein-ring

0 commit comments

Comments
 (0)