diff --git a/lib/plugins/link/html.ex b/lib/plugins/link/html.ex
index aa78810..1173526 100644
--- a/lib/plugins/link/html.ex
+++ b/lib/plugins/link/html.ex
@@ -1,152 +1,153 @@
 defmodule Nola.Plugins.Link.HTML do
   @behaviour Nola.Plugins.Link
 
   @impl true
   def match(_, _), do: false
 
   @impl true
   def post_match(_url, "text/html" <> _, _header, _opts), do: {:body, nil}
   def post_match(_, _, _, _), do: false
 
   @impl true
   def post_expand(url, body, _params, _opts) do
     {:ok, html} = Floki.parse_document(body)
     opengraph = collect_open_graph(html)
 
     text =
       if has_sufficient_opengraph_data?(opengraph) do
         generate_text_from_opengraph(url, html, opengraph)
       else
         clean_text(collect_title(html))
       end
 
     {:ok, text}
   end
 
   defp has_sufficient_opengraph_data?(opengraph) do
     Map.has_key?(opengraph, "title") && Map.has_key?(opengraph, "description")
   end
 
   defp generate_text_from_opengraph(url, html, opengraph) do
     itemprops = collect_itemprops(html)
     prefix = collect_prefix_and_site_name(url, opengraph, itemprops)
     description = collect_description(opengraph, itemprops, 400)
 
     [clean_text("#{prefix}#{Map.get(opengraph, "title")}")] ++ description
   end
 
   defp collect_title(html) do
     case Floki.find(html, "title") do
       [{"title", [], [title]} | _] -> String.trim(title)
       _ -> ""
     end
   end
 
   defp collect_open_graph(html) do
     Floki.find(html, "head meta")
+    |> Enum.reverse()
     |> Enum.reduce(%{}, &extract_meta_tag/2)
   end
 
   defp extract_meta_tag({"meta", values, []}, acc) do
     with {_, name} <- List.keyfind(values, "property", 0, {nil, nil}),
          {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
          true <- is_valid_meta_tag?(name) do
       Map.put(acc, strip_prefix(name), content)
     else
       _ -> acc
     end
   end
 
   defp extract_meta_tag(_, acc), do: acc
 
   defp is_valid_meta_tag?(nil) do
     false
   end
 
   defp is_valid_meta_tag?(name) do
     String.starts_with?(name, "og:") || String.starts_with?(name, "article:")
   end
 
   defp strip_prefix("og:" <> key), do: key
   defp strip_prefix(other), do: other
 
   defp collect_itemprops(html) do
     Floki.find(html, "[itemprop]")
     |> Enum.reduce(%{}, &extract_itemprop/2)
   end
 
   defp extract_itemprop({"meta", values, []}, acc) do
     with {_, name} <- List.keyfind(values, "itemprop", 0, {nil, nil}),
          {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
          true <- String.starts_with?(name, "article:") do
       Map.put(acc, name, content)
     else
       _ -> acc
     end
   end
 
   defp extract_itemprop(_, acc), do: acc
 
   defp collect_prefix_and_site_name(url, opengraph, itemprops) do
     uri = URI.parse(url)
     site_name = Map.get(opengraph, "site_name", uri.host)
     paywall_status = get_paywall_status(opengraph, itemprops)
     section = get_section(opengraph, itemprops)
     prefix = "#{paywall_status}#{site_name}#{section}"
 
     if prefix == "", do: "", else: "#{prefix} — "
   end
 
   defp get_paywall_status(opengraph, itemprops) do
     content_tier =
       Map.get(
         opengraph,
         "article:content_tier",
         Map.get(itemprops, "article:content_tier", "free")
       )
 
     if content_tier == "free", do: "", else: "[paywall] "
   end
 
   defp get_section(opengraph, itemprops) do
     section = Map.get(opengraph, "article:section", Map.get(itemprops, "article:section"))
     if section, do: ": #{section}", else: ""
   end
 
   defp collect_description(opengraph, itemprops, max_length) do
     date = get_formatted_date(opengraph, itemprops)
     description = transform_description(Map.get(opengraph, "description"), max_length)
 
     Nola.Irc.Message.splitlong(clean_text("#{date}#{description}"))
   end
 
   defp get_formatted_date(opengraph, itemprops) do
     published_time =
       Map.get(
         opengraph,
         "article:published_time",
         Map.get(itemprops, "article:published_time", "")
       )
 
     case DateTime.from_iso8601(published_time) do
       {:ok, date, _} -> "#{Timex.format!(date, "%d/%m/%y", :strftime)}. "
       _ -> ""
     end
   end
 
   # TODO: Swap with AI description instead of truncating.
   defp transform_description(nil, _), do: nil
 
   defp transform_description(string, length) when is_binary(string) do
     if String.length(string) > length, do: "#{String.slice(string, 0..length)}…", else: string
   end
 
   defp clean_text(text) do
     text
     |> String.replace("\n", " ")
     |> String.replace("\r", " ")
     |> String.replace("\u2028", " ")
     |> String.replace("\u2029", " ")
     |> HtmlEntities.decode()
   end
 end