Skip to content

Commit 1c00bff

Browse files
committed
[search] Rework search to always use db jar data
Search was originally pom-driven, then grew to support indexing either pom data or db jar data. We only used the pom data when indexing on upload, and ended up with a different :at timestamp value, depending on the source of the data. This simplifies the indexing to only support db jar data as input, which fixes that discrepancy. Fixes #897.
1 parent b519dc3 commit 1c00bff

File tree

6 files changed

+141
-144
lines changed

6 files changed

+141
-144
lines changed

src/clojars/routes/repo.clj

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
File
2323
IOException)
2424
(java.util
25-
Date
2625
UUID)
2726
org.apache.commons.io.FileUtils))
2827

@@ -388,7 +387,7 @@
388387
(str "invalid gradle module file: " (.getMessage e))
389388
{:file module-file})))
390389

391-
{:keys [group-path name version] :as posted-metadata}
390+
{:keys [group group-path name version] :as posted-metadata}
392391
(read-metadata dir)
393392

394393
md-file (io/file dir group-path name "maven-metadata.xml")]
@@ -422,8 +421,7 @@
422421
(log/audit db {:tag :deployed})
423422
(log/info {:tag :deploy-finalized})
424423
(future
425-
(search/index! search (assoc pom
426-
:at (Date. (.lastModified pom-file))))
424+
(search/index! search (db/find-jar db group name version))
427425
(log/info {:tag :deploy-indexed}))
428426
(spit (io/file dir ".finalized") "")
429427
(emit-deploy-events db event-emitter (assoc posted-metadata :deployer-username account))))

src/clojars/search.clj

Lines changed: 33 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
[clojars.db :as db]
66
[clojars.maven :as maven]
77
[clojars.stats :as stats]
8-
[clojure.set :as set]
98
[clojure.string :as str]
109
[com.stuartsierra.component :as component])
1110
(:import
@@ -55,27 +54,22 @@
5554
(set! *warn-on-reflection* true)
5655

5756
(defprotocol Search
58-
(index! [t pom])
57+
(index! [t version-details])
5958
(search [t query page])
6059
(delete!
6160
[t group-id]
6261
[t group-id artifact-id]))
6362

64-
(def ^:private renames
65-
{:name :artifact-id
66-
:jar_name :artifact-id
67-
:group :group-id
68-
:group_name :group-id
69-
:created :at
70-
:homepage :url})
71-
72-
(defn- doc-id ^String [group-id artifact-id]
63+
(defn- doc-id ^String
64+
[group-id artifact-id]
7365
(str group-id ":" artifact-id))
7466

75-
(defn- jar->id ^String [{:keys [artifact-id group-id]}]
76-
(doc-id group-id artifact-id))
67+
(defn- jar->id ^String
68+
[{:keys [group_name jar_name]}]
69+
(doc-id group_name jar_name))
7770

78-
(defn delete-from-index [^IndexWriter index-writer ^String group-id & [artifact-id]]
71+
(defn delete-from-index
72+
[^IndexWriter index-writer ^String group-id & [artifact-id]]
7973
(let [term (if artifact-id
8074
(Term. "id" (doc-id group-id artifact-id))
8175
(Term. "group-id" group-id))]
@@ -152,22 +146,23 @@
152146
(str/replace v #"\.(\s|$)" " "))))
153147

154148
(def ^:private content-items
155-
[:artifact-id
156-
(hyphen-remover :artifact-id)
157-
:group-id
158-
(hyphen-remover :group-id)
149+
[:jar_name
150+
(hyphen-remover :jar_name)
151+
:group_name
152+
(hyphen-remover :group_name)
159153
;; Include 'group name' & 'group name/artifact-name' in content (for a
160-
;; group-id of group.name) to aid in searching for things where new projects
154+
;; group_name of group.name) to aid in searching for things where new projects
161155
;; had to be deployed under a domain-based group
162-
(period-remover :group-id)
163-
(period-remover #(->> % ((juxt :group-id :artifact-id)) (str/join "/")))
156+
(period-remover :group_name)
157+
(period-remover #(->> % ((juxt :group_name :jar_name)) (str/join "/")))
164158
;; Include 'group-name/artifact-name' in content to allow
165159
;; the "group-name/artifact-name" phrase to find it
166-
#(->> % ((juxt :group-id :artifact-id)) (str/join "/"))
160+
#(->> % ((juxt :group_name :jar_name)) (str/join "/"))
167161
(sentence-period-remover :description)
168-
:url
162+
:homepage
169163
:version
170-
#(->> % :authors (str/join " "))])
164+
#(when-some [authors (:authors %)]
165+
(str/replace authors #"," " "))])
171166

172167
(def ^:private ^String content-field-name "_content")
173168
(def ^:private ^String boost-field-name "_download_boost")
@@ -182,28 +177,20 @@
182177
[^String name ^String value]
183178
(TextField. name value Field$Store/YES))
184179

185-
(defn jar->doc
186-
^Iterable [{:keys [at
187-
artifact-id
188-
group-id
189-
description
190-
licenses
191-
url
192-
version]
193-
:or {at (Date.)}
194-
:as jar}
195-
download-boost]
180+
(defn- jar->doc ^Iterable
181+
[{:keys [created description group_name homepage jar_name licenses version] :as jar}
182+
download-boost]
196183
(doto (Document.)
197184
;; id: We need a unique identifier for each doc so that we can use updateDocument
198185
(.add (string-field "id" (jar->id jar)))
199-
(.add (string-field "artifact-id" artifact-id))
200-
(.add (string-field "group-id" group-id))
186+
(.add (string-field "artifact-id" jar_name))
187+
(.add (string-field "group-id" group_name))
201188
(cond-> description
202189
(.add (text-field "description" description)))
203-
(.add (string-field "at" (str (.getTime ^Date at))))
190+
(.add (string-field "at" (str (.getTime ^Date created))))
204191
(.add (text-field "licenses" (str/join " " (map :name licenses))))
205-
(cond-> url
206-
(.add (string-field "url" url)))
192+
(cond-> homepage
193+
(.add (string-field "url" homepage)))
207194
;; version isn't really useful to search, since we only store the
208195
;; most-recently-seen value, but we have it here because we've had it
209196
;; historically
@@ -226,10 +213,10 @@
226213
(def download-score-weight 50)
227214

228215
(defn- calculate-document-boost
229-
[stats {:as _jar :keys [artifact-id group-id]}]
216+
[stats {:as _jar :keys [group_name jar_name]}]
230217
(let [total (stats/total-downloads stats)]
231218
(* download-score-weight
232-
(/ (or (stats/download-count stats group-id artifact-id) 0)
219+
(/ (or (stats/download-count stats group_name jar_name) 0)
233220
(max 1 total)))))
234221

235222
(defn disk-index
@@ -239,11 +226,10 @@
239226

240227
(defn- index-jar
241228
[^IndexWriter index-writer stats jar]
242-
(let [jar' (set/rename-keys jar renames)
243-
doc (jar->doc jar' (calculate-document-boost stats jar'))]
229+
(let [doc (jar->doc jar (calculate-document-boost stats jar))]
244230
;; always delete and replace the doc, since we are indexing every version
245231
;; and the last one wins
246-
(.updateDocument index-writer (Term. "id" (jar->id jar')) doc)))
232+
(.updateDocument index-writer (Term. "id" (jar->id jar)) doc)))
247233

248234
(defn- track-index-status
249235
[{:keys [indexed last-time] :as status}]
@@ -384,9 +370,9 @@
384370

385371
(defrecord LuceneSearch [stats index-factory ^Directory index]
386372
Search
387-
(index! [_t pom]
373+
(index! [_t version-details]
388374
(with-open [index-writer (index-writer index false)]
389-
(index-jar index-writer stats pom)))
375+
(index-jar index-writer stats version-details)))
390376
(search [_t query page]
391377
(-search index query page))
392378
(delete! [_t group-id]

src/clojars/web/search.clj

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,13 @@
1313
[hiccup.element :refer [link-to]]
1414
[ring.util.codec :refer [url-encode]]))
1515

16-
(defn- jar->json [jar]
17-
(let [m {:jar_name (:artifact-id jar)
18-
:group_name (:group-id jar)
19-
:version (:version jar)
20-
:description (:description jar)}
21-
created (:at jar)]
22-
(if created
23-
(assoc m :created created)
24-
m)))
16+
(defn- jar->json
17+
[{:keys [artifact-id at description group-id version]}]
18+
{:created at
19+
:description description
20+
:group_name group-id
21+
:jar_name artifact-id
22+
:version version})
2523

2624
(defn json-search [search query page]
2725
(let [response {:status 200
@@ -43,15 +41,14 @@
4341
:error-message (format "Invalid search syntax for query `%s`" query)}
4442
(log/trace-id))))))
4543

46-
(defn- jar->xml [jar]
47-
(let [attrs {:jar_name (:artifact-id jar)
48-
:group_name (:group-id jar)
49-
:version (:version jar)
50-
:description (some-> (:description jar) xml-escape)}
51-
created (:at jar)]
52-
{:tag :result :attrs (if created
53-
(assoc attrs :created created)
54-
attrs)}))
44+
(defn- jar->xml
45+
[{:keys [artifact-id at description group-id version]}]
46+
{:tag :result
47+
:attrs {:created at
48+
:description (some-> description xml-escape)
49+
:group_name group-id
50+
:jar_name artifact-id
51+
:version version}})
5552

5653
(defn xml-search [search query page]
5754
(let [response {:status 200

test/clojars/integration/search_test.clj

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
[clj-http.client :as client]
55
[clojars.search :as search]
66
[clojars.test-helper :as help]
7-
[clojure.set :as set]
87
[clojure.test :refer [deftest is testing use-fixtures]]
98
[clojure.xml :as xml]
10-
[matcher-combinators.test]))
9+
[matcher-combinators.test])
10+
(:import
11+
(java.util
12+
Date)))
1113

1214
(use-fixtures :each
1315
help/default-fixture
@@ -28,15 +30,21 @@
2830
page)
2931
(client/get opts)))
3032

33+
(defn created-as-str
34+
[data]
35+
(update data :created #(str (.getTime %))))
36+
3137
(deftest search-test
3238
(let [base-data {:description "foo" :version "1.0"}
3339
search-data (map (partial merge base-data)
34-
[{:jar_name "test" :group_name "test"}
35-
{:jar_name "test" :group_name "testing"}])]
40+
[{:created (Date.)
41+
:group_name "test"
42+
:jar_name "test"}
43+
{:created (Date.)
44+
:group_name "testing"
45+
:jar_name "test"}])]
3646
(doseq [data search-data]
37-
(search/index! (:search help/system)
38-
(set/rename-keys data {:jar_name :artifact-id
39-
:group_name :group-id})))
47+
(search/index! (:search help/system) data))
4048

4149
(testing "json request returns json"
4250
(let [resp (do-search :json "test")]
@@ -81,7 +89,7 @@
8189
result (json/parse-string (:body resp) true)]
8290
(is (= 200 (:status resp)))
8391
(is (= 2 (:count result)))
84-
(is (match? search-data (:results result)))))
92+
(is (match? (mapv created-as-str search-data) (:results result)))))
8593

8694
(testing "json request with invalid page returns error"
8795
(let [resp (do-search-with-page :json "test" "a" {:throw-exceptions false})

test/clojars/test_helper.clj

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -163,15 +163,6 @@
163163
m))
164164
new-pom))
165165

166-
(defn at-as-time-str
167-
"Adjusts the :at (or :created) Date to a millis-since-epoch string to
168-
match the search results."
169-
[data]
170-
(let [date->time-str #(str (.getTime %))]
171-
(cond-> data
172-
(:at data) (update :at date->time-str)
173-
(:created data) (update :created date->time-str))))
174-
175166
(defn add-verified-group
176167
[account group]
177168
(db/add-group *db* account group)

0 commit comments

Comments
 (0)