diff --git a/lib/plugins/link/html.ex b/lib/plugins/link/html.ex
index aa78810..1173526 100644
--- a/lib/plugins/link/html.ex
+++ b/lib/plugins/link/html.ex
@@ -1,152 +1,153 @@
defmodule Nola.Plugins.Link.HTML do
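@moduledoc """
Link preview handler for HTML pages. Builds preview text from Open Graph
metadata when available, falling back to the page's <title> element.
"""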
@behaviour Nola.Plugins.Link
@impl true
def match(_, _), do: false
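# URL matching is deferred to post_match/4: only responses served as
# text/html are handled, and the response body is requested for parsing.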
@impl true
def post_match(_url, "text/html" <> _, _header, _opts), do: {:body, nil}
def post_match(_, _, _, _), do: false
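# Parse the HTML body and build the preview: an Open Graph summary (site
# name, paywall status, title, date, description) when enough data exists,
# otherwise the bare <title> text.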
@impl true
def post_expand(url, body, _params, _opts) do
{:ok, html} = Floki.parse_document(body)
opengraph = collect_open_graph(html)
text =
if has_sufficient_opengraph_data?(opengraph) do
generate_text_from_opengraph(url, html, opengraph)
else
clean_text(collect_title(html))
end
{:ok, text}
end
defp has_sufficient_opengraph_data?(opengraph) do
Map.has_key?(opengraph, "title") && Map.has_key?(opengraph, "description")
end
defp generate_text_from_opengraph(url, html, opengraph) do
itemprops = collect_itemprops(html)
prefix = collect_prefix_and_site_name(url, opengraph, itemprops)
description = collect_description(opengraph, itemprops, 400)
[clean_text("#{prefix}#{Map.get(opengraph, "title")}")] ++ description
end
defp collect_title(html) do
case Floki.find(html, "title") do
[{"title", [], [title]} | _] -> String.trim(title)
_ -> ""
end
end
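# Collect og:* and article:* meta tags into a map. The reduce uses
# Map.put/3, so later tags overwrite earlier ones; reversing first makes
# the first occurrence of a duplicated property win.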
defp collect_open_graph(html) do
Floki.find(html, "head meta")
+ |> Enum.reverse()
|> Enum.reduce(%{}, &extract_meta_tag/2)
end
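# Keep only meta tags carrying both a property attribute with an og: or
# article: prefix and a content attribute.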
defp extract_meta_tag({"meta", values, []}, acc) do
with {_, name} <- List.keyfind(values, "property", 0, {nil, nil}),
{_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
true <- is_valid_meta_tag?(name) do
Map.put(acc, strip_prefix(name), content)
else
_ -> acc
end
end
defp extract_meta_tag(_, acc), do: acc
defp is_valid_meta_tag?(nil), do: false
defp is_valid_meta_tag?(name) do
String.starts_with?(name, "og:") || String.starts_with?(name, "article:")
end
defp strip_prefix("og:" <> key), do: key
defp strip_prefix(other), do: other
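# Some sites publish article metadata as microdata (itemprop attributes)
# instead of meta properties; collect those as a fallback source.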
defp collect_itemprops(html) do
Floki.find(html, "[itemprop]")
|> Enum.reduce(%{}, &extract_itemprop/2)
end
defp extract_itemprop({"meta", values, []}, acc) do
with {_, name} <- List.keyfind(values, "itemprop", 0, {nil, nil}),
{_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
true <- String.starts_with?(name, "article:") do
Map.put(acc, name, content)
else
_ -> acc
end
end
defp extract_itemprop(_, acc), do: acc
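# Build the preview prefix (paywall flag, site name, section), falling
# back to the URL host when og:site_name is absent.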
defp collect_prefix_and_site_name(url, opengraph, itemprops) do
uri = URI.parse(url)
site_name = Map.get(opengraph, "site_name", uri.host)
paywall_status = get_paywall_status(opengraph, itemprops)
section = get_section(opengraph, itemprops)
prefix = "#{paywall_status}#{site_name}#{section}"
if prefix == "", do: "", else: "#{prefix} — "
end
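# An article:content_tier other than "free" (the default when the tag is
# absent) marks the link as paywalled.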
defp get_paywall_status(opengraph, itemprops) do
content_tier =
Map.get(
opengraph,
"article:content_tier",
Map.get(itemprops, "article:content_tier", "free")
)
if content_tier == "free", do: "", else: "[paywall] "
end
defp get_section(opengraph, itemprops) do
section = Map.get(opengraph, "article:section", Map.get(itemprops, "article:section"))
if section, do: ": #{section}", else: ""
end
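# Prepend the publication date to the truncated description and split the
# result into IRC-safe message chunks.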
defp collect_description(opengraph, itemprops, max_length) do
date = get_formatted_date(opengraph, itemprops)
description = transform_description(Map.get(opengraph, "description"), max_length)
Nola.Irc.Message.splitlong(clean_text("#{date}#{description}"))
end
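# Format article:published_time (ISO 8601) as dd/mm/yy; missing or invalid
# dates yield an empty string.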
defp get_formatted_date(opengraph, itemprops) do
published_time =
Map.get(
opengraph,
"article:published_time",
Map.get(itemprops, "article:published_time", "")
)
case DateTime.from_iso8601(published_time) do
{:ok, date, _} -> "#{Timex.format!(date, "%d/%m/%y", :strftime)}. "
_ -> ""
end
end
# TODO: Swap with AI description instead of truncating.
defp transform_description(nil, _), do: nil
defp transform_description(string, length) when is_binary(string) do
if String.length(string) > length, do: "#{String.slice(string, 0, length)}…", else: string
end
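# Collapse line breaks to spaces and decode HTML entities so the preview
# fits on a single line.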
defp clean_text(text) do
text
|> String.replace(["\n", "\r"], " ")
|> HtmlEntities.decode()
end
end