Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F86136
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
5 KB
Subscribers
None
View Options
diff --git a/lib/plugins/link/html.ex b/lib/plugins/link/html.ex
index 1173526..7df1bfc 100644
--- a/lib/plugins/link/html.ex
+++ b/lib/plugins/link/html.ex
@@ -1,153 +1,158 @@
defmodule Nola.Plugins.Link.HTML do
@behaviour Nola.Plugins.Link
# Nola.Plugins.Link callback. Always answers false, whatever the two arguments
# are; whether this plugin handles a link is decided in post_match/4 instead.
@impl true
def match(_, _) do
  false
end
# Nola.Plugins.Link callback. Returns {:body, nil} when the second argument
# (the content type) starts with "text/html", and false for anything else.
# NOTE(review): {:body, nil} presumably asks the caller to hand the response
# body to post_expand/4 — confirm against the Nola.Plugins.Link behaviour.
@impl true
def post_match(_url, content_type, _header, _opts) do
  case content_type do
    "text/html" <> _ -> {:body, nil}
    _ -> false
  end
end
# Nola.Plugins.Link callback. Parses `body` as an HTML document and returns
# {:ok, text}:
#   * when the page carries both an OpenGraph title and description, `text` is
#     whatever generate_text_from_opengraph/3 builds;
#   * otherwise `text` is the cleaned-up contents of the <title> element.
# Crashes (match error) if Floki cannot parse the body.
@impl true
def post_expand(url, body, _params, _opts) do
  {:ok, document} = Floki.parse_document(body)
  og = collect_open_graph(document)

  text =
    case has_sufficient_opengraph_data?(og) do
      true -> generate_text_from_opengraph(url, document, og)
      false -> document |> collect_title() |> clean_text()
    end

  {:ok, text}
end
# True when the collected OpenGraph map has both a "title" and a "description"
# key. Only key presence is checked, not the values.
defp has_sufficient_opengraph_data?(opengraph) do
  Enum.all?(["title", "description"], fn key -> Map.has_key?(opengraph, key) end)
end
# Builds the output for a page with usable OpenGraph data: a headline made of
# the site prefix plus the OpenGraph title (run through clean_text/1), followed
# by whatever collect_description returns. 400 is passed to collect_description
# as its max_length argument.
defp generate_text_from_opengraph(url, html, opengraph) do
itemprops = collect_itemprops(html)
prefix = collect_prefix_and_site_name(url, opengraph, itemprops)
- description = collect_description(opengraph, itemprops, 400)
+ title = Map.get(opengraph, "title")
+ description = collect_description(opengraph, itemprops, title, 400)
- [clean_text("#{prefix}#{Map.get(opengraph, "title")}")] ++ description
# NOTE(review): `++` only yields a proper list when `description` is itself a
# list — confirm that every branch of collect_description returns one.
+ [clean_text("#{prefix}#{title}")] ++ description
end
# Returns the trimmed text of the first <title> element, or "" when there is
# none or when it is not a plain, attribute-less element with a single text
# child.
defp collect_title(html) do
  html
  |> Floki.find("title")
  |> first_title_text()
end

# Picks the text out of the first node of a Floki.find/2 result for "title".
defp first_title_text([{"title", [], [text]} | _rest]), do: String.trim(text)
defp first_title_text(_no_usable_title), do: ""
# Collects the og:* / article:* <meta> tags found under <head> into a map
# (see extract_meta_tag/2 for the key and value rules).
defp collect_open_graph(html) do
  meta_tags = Floki.find(html, "head meta")

  # Folding from the right visits the last tag first, so when a property is
  # repeated the occurrence nearest the top of the document is written last
  # and is the one that ends up in the map.
  List.foldr(meta_tags, %{}, &extract_meta_tag/2)
end
# Reducer used while collecting OpenGraph data. Given one parsed node and the
# accumulator map, stores the tag's `content` under its `property` name when
#   * the node is a childless <meta> element,
#   * it has a `property` attribute starting with "og:" or "article:", and
#   * it has a `content` attribute.
# The "og:" prefix is stripped from the key; "article:" keys are kept whole.
# Any other node leaves the accumulator untouched.
defp extract_meta_tag({"meta", attrs, []}, acc) do
  # List.keyfind/3 returns nil when the attribute is absent, which falls
  # through to `else`. Requiring a binary `content` keeps tags without a
  # content attribute from being recorded as present with a nil value.
  with {_, property} when is_binary(property) <- List.keyfind(attrs, "property", 0),
       {_, content} when is_binary(content) <- List.keyfind(attrs, "content", 0),
       true <- valid_meta_tag?(property) do
    Map.put(acc, strip_prefix(property), content)
  else
    _ -> acc
  end
end

defp extract_meta_tag(_, acc), do: acc

# True for property names in the "og:" or "article:" namespaces.
defp valid_meta_tag?(name) when is_binary(name) do
  String.starts_with?(name, ["og:", "article:"])
end

defp valid_meta_tag?(_), do: false

# Drops a leading "og:" from a property name; anything else is returned as is.
defp strip_prefix("og:" <> key), do: key
defp strip_prefix(other), do: other
# Collects the article:* microdata found on elements carrying an `itemprop`
# attribute into a map (see extract_itemprop/2 for which nodes qualify).
# Nodes are visited in document order, so a later duplicate overwrites an
# earlier one.
defp collect_itemprops(html) do
  html
  |> Floki.find("[itemprop]")
  |> List.foldl(%{}, fn node, acc -> extract_itemprop(node, acc) end)
end
# Reducer used while collecting microdata. Stores `content` under the full
# `itemprop` name when the node is a childless <meta> element whose itemprop
# starts with "article:" and which has a `content` attribute. Every other node
# leaves the accumulator untouched.
defp extract_itemprop({"meta", attrs, []}, acc) do
  # Matching on the binary prefix (rather than calling String.starts_with?/2
  # on a possibly-nil name) means a missing itemprop attribute is skipped
  # instead of raising. Requiring a binary `content` keeps content-less tags
  # from being stored as nil, which would otherwise read as a non-"free"
  # content tier further down.
  with {_, "article:" <> _ = name} <- List.keyfind(attrs, "itemprop", 0),
       {_, content} when is_binary(content) <- List.keyfind(attrs, "content", 0) do
    Map.put(acc, name, content)
  else
    _ -> acc
  end
end

defp extract_itemprop(_, acc), do: acc
# Builds the headline prefix "<paywall marker><site name><section> — ".
# The site name is the OpenGraph "site_name", falling back to the URL's host.
# Returns "" when all three parts are empty, so no dangling dash is produced.
defp collect_prefix_and_site_name(url, opengraph, itemprops) do
  %URI{host: host} = URI.parse(url)

  parts = [
    get_paywall_status(opengraph, itemprops),
    Map.get(opengraph, "site_name", host),
    get_section(opengraph, itemprops)
  ]

  # Enum.join/1 stringifies each part the same way interpolation does
  # (nil becomes "").
  case Enum.join(parts) do
    "" -> ""
    label -> "#{label} — "
  end
end
# Returns "[paywall] " unless the article's content tier is "free".
# The tier is read from OpenGraph first, then from the itemprops, and is
# assumed to be "free" when neither source declares one.
defp get_paywall_status(opengraph, itemprops) do
  fallback = Map.get(itemprops, "article:content_tier", "free")

  case Map.get(opengraph, "article:content_tier", fallback) do
    "free" -> ""
    _other_tier -> "[paywall] "
  end
end
# Returns ": <section>" for the article's section, or "" when none is declared.
# OpenGraph is consulted first, then the itemprops.
defp get_section(opengraph, itemprops) do
  fallback = Map.get(itemprops, "article:section")

  case Map.get(opengraph, "article:section", fallback) do
    missing when missing in [nil, false] -> ""
    section -> ": #{section}"
  end
end
- defp collect_description(opengraph, itemprops, max_length) do
+ # Builds the lines shown under the headline: the formatted publication date
+ # followed by the OpenGraph description, truncated to `max_length` and split
+ # by Nola.Irc.Message.splitlong/1. When the description merely repeats
+ # `title`, only the date (if any) is kept.
+ # Always returns a list, because the caller appends the result with `++`.
+ defp collect_description(opengraph, itemprops, title, max_length) do
date = get_formatted_date(opengraph, itemprops)
description = transform_description(Map.get(opengraph, "description"), max_length)
- Nola.Irc.Message.splitlong(clean_text("#{date}#{description}"))
+ if redundant_description?(title, description) do
+   # A bare binary here would make `[headline] ++ result` an improper list,
+   # so wrap the date in a list and drop it entirely when there is no date.
+   case String.trim(date) do
+     "" -> []
+     date_only -> [date_only]
+   end
+ else
+   Nola.Irc.Message.splitlong(clean_text("#{date}#{description}"))
+ end
end
+
+ # True when the description adds nothing over the title: identical, or a
+ # Jaro similarity above 0.8. Non-binary values (e.g. a nil title or
+ # description) are never considered redundant, which also keeps
+ # String.jaro_distance/2 from raising on them.
+ defp redundant_description?(title, description)
+      when is_binary(title) and is_binary(description) do
+   title == description or String.jaro_distance(title, description) > 0.8
+ end
+
+ defp redundant_description?(_title, _description), do: false
# Returns the article's publication date as "dd/mm/yy. " (note the trailing
# period and space), or "" when no published time is declared or it is not a
# valid ISO 8601 datetime. OpenGraph is consulted first, then the itemprops.
defp get_formatted_date(opengraph, itemprops) do
  fallback = Map.get(itemprops, "article:published_time", "")
  raw_timestamp = Map.get(opengraph, "article:published_time", fallback)

  with {:ok, published_at, _utc_offset} <- DateTime.from_iso8601(raw_timestamp) do
    "#{Timex.format!(published_at, "%d/%m/%y", :strftime)}. "
  else
    _ -> ""
  end
end
# TODO: Swap with AI description instead of truncating.
# Truncates `string` to at most `length` graphemes, appending "…" when
# anything was cut off. Strings that already fit are returned unchanged, and
# `nil` (no description) is passed through as `nil`.
defp transform_description(nil, _), do: nil

defp transform_description(string, length) when is_binary(string) do
  if String.length(string) > length do
    # String.slice/3 takes a start and a count. The inclusive range
    # 0..length would keep length + 1 graphemes.
    "#{String.slice(string, 0, length)}…"
  else
    string
  end
end
# Flattens `text` to a single line and decodes HTML entities: newlines and the
# three spellings of <br> are each replaced by one space.
defp clean_text(text) do
  # The replacements run one after another, in this order, so that text
  # produced by an earlier replacement is still visible to the later ones.
  ["\n", "<br>", "<br/>", "<br />"]
  |> Enum.reduce(text, fn pattern, acc -> String.replace(acc, pattern, " ") end)
  |> HtmlEntities.decode()
end
end
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sun, Aug 31, 10:02 AM (1 d, 11 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
55332
Default Alt Text
(5 KB)
Attached To
rNOLA Nola
Event Timeline
Log In to Comment