
Commit 4c2b07e

WIP: write import rake task
1 parent 56e8537 commit 4c2b07e

5 files changed: 541 additions, 1 deletion


app/components/admin/editions/editorial_remark_component.html.erb

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
   <h4 class="govuk-heading-s govuk-!-margin-bottom-1">Internal note</h4>

   <p class="govuk-body govuk-!-margin-bottom-0 govuk-!-margin-top-0">
-    <%= sanitize(editorial_remark.body, tags: %w[a]) %>
+    <%= sanitize(editorial_remark.body, tags: %w[a br]) %>
   </p>

   <p class="app-view-editions-editorial-remark__list-item-datetime govuk-body govuk-body-s govuk-!-margin-bottom-0 govuk-!-margin-top-0">

app/models/standard_edition.rb

Lines changed: 7 additions & 0 deletions
@@ -4,6 +4,13 @@ class StandardEdition < Edition
   include ::Attachable
   include Edition::AlternativeFormatProvider

+  # Associations
+  include Edition::Organisations
+  include Edition::RoleAppointments
+  include Edition::TopicalEvents
+  include Edition::WorldLocations
+  include Edition::WorldwideOrganisations
+
   validates :configurable_document_type, presence: true, inclusion: { in: -> { ConfigurableDocumentType.all_keys } }
   validate :content_conforms_to_schema
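
These includes give StandardEdition the association writers that the importer below assigns in bulk. A minimal sketch, assuming the modules expose the usual Whitehall association attributes (the content IDs are placeholders):

  edition = StandardEdition.new(
    lead_organisations: [Organisation.find_by(content_id: "placeholder-org-content-id")],
    topical_events: TopicalEvent.where(content_id: ["placeholder-topical-event-id"]),
    world_locations: WorldLocation.where(content_id: ["placeholder-world-location-id"]),
  )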

lib/tasks/import.rake

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
require_relative "../whitehall/document_importer"

namespace :import do
  desc "Import a news article via its JSON representation (exported via content-publisher#3311)"
  task :news_article, %i[path_to_import_file] => :environment do |_, args|
    data = JSON.parse(File.read(args[:path_to_import_file]))

    # Automatically roll back the import if we encounter any errors
    ApplicationRecord.transaction do
      puts "Importing document..."
      document = Whitehall::DocumentImporter.import!(data)
      puts "...document imported (/government/admin/standard-editions/#{document.live_edition.id})"

      puts "Re-claiming the route (from Content Publisher) in Publishing API..."
      Services.publishing_api.put_path(
        document.live_edition.base_path,
        {
          publishing_app: "whitehall",
          override_existing: true,
        },
      )
      puts "...route claimed."

      puts "Republishing document..."
      PublishingApiDocumentRepublishingWorker.new.perform(document.id)
      puts "...document published."
    end
  rescue JSON::ParserError
    puts "Failed to parse JSON for #{args[:path_to_import_file]}"
  end
end
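
A usage sketch: the task takes the path to the export file as a bracketed rake argument (the path below is illustrative):

  bundle exec rake "import:news_article[tmp/exports/news_article.json]"

Because every step runs inside ApplicationRecord.transaction, a failure during the import, route claim, or republish rolls the whole document back rather than leaving it half-created.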

lib/whitehall/document_importer.rb

Lines changed: 221 additions & 0 deletions
@@ -0,0 +1,221 @@
require "open-uri"
require "pdf-reader"
require "timecop"

class Whitehall::DocumentImporter
  def self.import!(data)
    edition = create_base_edition!(data)

    AuditTrail.acting_as(robot_user) do
      EditorialRemark.create!(
        edition: edition,
        body: internal_history_summary(data["internal_history"]),
        author: robot_user,
        created_at: Time.zone.now,
        updated_at: Time.zone.now,
      )
    end

    save_attachments(data, edition)
    save_images(data, edition)

    edition.document.update_columns(
      content_id: data["content_id"],
      slug: data["base_path"].split("/").last,
    )
    edition.document
  end

  def self.create_base_edition!(data)
    user = User.find_by(email: data["created_by"]) || Whitehall::DocumentImporter.robot_user
    AuditTrail.acting_as(user) do
      # Override time, otherwise "Document created" timestamp in sidebar reflects current date
      Timecop.travel(Time.zone.parse(data["first_published_at"])) do
        edition = StandardEdition.new(
          # TODO: we're blocked on running the import in full until press_release has
          # been migrated to StandardEdition (one of the two `document_type`s we need to support)
          configurable_document_type: "news_story", # data["document_type"],
          created_at: data["created_at"],
          state: derived_state(data["state"]),
          title: data["title"],
          summary: data["summary"],
          block_content: {
            "body" => pre_process_body(data["body"]),
          },
          political: data["political"],
          government_id: data["government_id"],
          change_note: combined_change_notes(data["change_notes"]),
          alternative_format_provider_id: Organisation.find_by(content_id: data["tags"]["primary_publishing_organisation"]).id,
          lead_organisations: [Organisation.find_by(content_id: data["tags"]["primary_publishing_organisation"])],
          supporting_organisations: Organisation.where(content_id: data["tags"]["organisations"]),
          topical_events: TopicalEvent.where(content_id: data["tags"]["topical_events"]),
          world_locations: WorldLocation.where(content_id: data["tags"]["world_locations"]),
          role_appointments: RoleAppointment.where(content_id: data["tags"]["role_appointments"]),
        )
        edition.creator = user
        set_publish_timestamps(edition, data)
        edition.save!
        return edition
      end
    end
  end

  def self.derived_state(state)
    case state
    when "published", "published_but_needs_2i"
      "published"
    when "withdrawn"
      "withdrawn"
    else
      raise "Unsupported state: #{state}"
    end
  end

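  # Rationale for the method below (an inference from its logic, not from the
  # export spec): if the Content Publisher record was created after its
  # first_published_at, the content must have been first published earlier,
  # elsewhere, so we backdate first_published_at and mark the edition as
  # previously published.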
  def self.set_publish_timestamps(edition, data)
    if Time.zone.parse(data["created_at"]) > Time.zone.parse(data["first_published_at"])
      edition.first_published_at = data["first_published_at"]
      edition.previously_published = true
    else
      edition.previously_published = false
    end
    edition.major_change_published_at = data["first_published_at"]
  end

  def self.pre_process_body(body)
    # Content Publisher has embeds like `[Contact: c1f13fd8-9feb-4028-9323-7cb3383323b4]`.
    # Here we find-and-replace for Whitehall's equivalent: `[Contact:171]`
    body.gsub!(/\[Contact: ?(.+?)\]/) do |_match|
      id = Contact.find_by(content_id: ::Regexp.last_match(1)).id
      "[Contact:#{id}]"
    end
    body
  end

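  # Example (hypothetical data): a change note of
  #   { "public_timestamp" => "2023-05-02T09:30:00Z", "note" => "Updated guidance" }
  # becomes "2 May 2023: Updated guidance"; multiple notes are joined with "; ".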
  def self.combined_change_notes(change_notes)
    return nil if change_notes.empty?

    change_notes.map { |cn|
      "#{Time.zone.parse(cn['public_timestamp']).strftime('%-d %B %Y')}: #{cn['note']}"
    }.join("; ")
  end

  def self.internal_history_summary(internal_history)
    return "No internal history available" if internal_history.empty?

    lines = [
      "Imported from Content Publisher on #{Time.zone.now.strftime('%-d %B %Y at %H:%M')}. Document history:<br>",
    ]
    internal_history.each do |entry|
      line = "#{entry['date']} #{entry['time']}: #{entry['entry_type'].to_s.humanize} by #{entry['user']}"
      line += ". Details: #{entry['entry_content']}" if entry["entry_content"].present?
      lines << "• #{line}"
    end

    lines.join("<br>")
  end

  def self.save_attachments(data, edition)
    # Temporarily disable callbacks that try to read the file to get its size and content type,
    # since we don't have a local file, just the Asset Manager reference.
    AttachmentData.skip_callback(:save, :before, :update_file_attributes)

    data["attachments"].each do |attachment_hash|
      uploader_identifier = File.basename(attachment_hash["file_url"])
      response = URI.parse(attachment_hash["file_url"]).open
      attachment_data = AttachmentData.new(
        carrierwave_file: uploader_identifier,
        content_type: response.content_type,
        file_size: response.size,
        number_of_pages: response.content_type == "application/pdf" ? PDF::Reader.new(response).page_count : nil,
        created_at: attachment_hash["created_at"],
        updated_at: attachment_hash["created_at"],
      )
      attachment_data.save!(validate: false) # no local file, so have to skip validation
      Asset.create!(
        variant: "original",
        filename: File.basename(attachment_hash["file_url"]),
        asset_manager_id: attachment_hash["file_url"].match(%r{media/([^/]+)}).captures.first,
        assetable: attachment_data,
      )

      attachment = FileAttachment.create!(
        attachable: edition,
        title: attachment_hash["title"],
        attachment_data: attachment_data,
        accessible: false,
        isbn: "",
        unique_reference: "",
        command_paper_number: "",
        hoc_paper_number: "",
        parliamentary_session: "",
        unnumbered_command_paper: false,
        unnumbered_hoc_paper: false,
        created_at: attachment_hash["created_at"],
        updated_at: attachment_hash["created_at"],
      )
      edition.attachments << attachment
    end
  end

  def self.save_images(data, edition)
    lead_image = nil
    data["images"].each do |image_hash|
      image = import_image_and_its_variants(image_hash, edition)
      edition.images << image
      lead_image = image if image_hash["lead_image"]
    end
    edition.block_content["image"] = lead_image&.image_data_id.to_s || ""
  end

  def self.import_image_and_its_variants(image_hash, edition)
    original_variant = image_hash["variants"].find { |v| v["variant"] == "high_resolution" }
    uploader_identifier = File.basename(original_variant["file_url"])
    image_data = ImageData.new(
      image_kind: "default",
      carrierwave_image: uploader_identifier,
    )
    image_data.save!(validate: false) # no local file, so have to skip validation
    image_hash["variants"].each do |variant|
      import_image_variant(variant, image_data)
    end
    # Content Publisher exports a caption and a credit, but there is no credit in Whitehall.
    # We'll append the credit onto the caption as follows:
    #   `caption` => "Foo"
    #   `credit`  => "Bar"
    #   result    => "Foo. Credit: Bar"
    caption_parts = [
      image_hash["caption"].presence,
      image_hash["credit"].blank? ? nil : "Credit: #{image_hash['credit']}",
    ].compact
    caption = caption_parts.join(". ")
    Image.create!(
      alt_text: image_hash["alt_text"],
      caption: caption,
      edition: edition,
      image_data: image_data,
      created_at: image_hash["created_at"],
      updated_at: image_hash["updated_at"],
    )
  end

  def self.import_image_variant(variant, image_data)
    variant_mappings = {
      "high_resolution" => "original",
      "960" => "s960",
      "300" => "s300",
    }
    variant_name = variant_mappings[variant["variant"]]
    raise "Unknown variant: #{variant['variant']}" unless variant_name

    Asset.create!(
      variant: variant_name,
      filename: File.basename(variant["file_url"]),
      asset_manager_id: variant["file_url"].match(%r{media/([^/]+)}).captures.first,
      assetable: image_data,
    )
  end

  def self.robot_user
    User.find_by(name: "Scheduled Publishing Robot")
  end
end
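
Judging purely from the keys the importer reads, the export file's shape is roughly as sketched below; every value is a placeholder, and the authoritative format is whatever content-publisher#3311 produces:

{
  "content_id": "…",
  "base_path": "/government/news/…",
  "created_by": "editor@example.gov.uk",
  "created_at": "…",
  "first_published_at": "…",
  "state": "published",
  "document_type": "news_story",
  "title": "…",
  "summary": "…",
  "body": "…",
  "political": false,
  "government_id": "…",
  "change_notes": [{ "public_timestamp": "…", "note": "…" }],
  "tags": {
    "primary_publishing_organisation": "…",
    "organisations": ["…"],
    "topical_events": ["…"],
    "world_locations": ["…"],
    "role_appointments": ["…"]
  },
  "internal_history": [{ "date": "…", "time": "…", "entry_type": "…", "user": "…", "entry_content": "…" }],
  "attachments": [{ "title": "…", "file_url": "https://…/media/<asset-id>/document.pdf", "created_at": "…" }],
  "images": [{
    "lead_image": true,
    "alt_text": "…",
    "caption": "…",
    "credit": "…",
    "created_at": "…",
    "updated_at": "…",
    "variants": [
      { "variant": "high_resolution", "file_url": "…" },
      { "variant": "960", "file_url": "…" },
      { "variant": "300", "file_url": "…" }
    ]
  }]
}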
