diff --git a/config/config.exs b/config/config.exs
index bf52838..35ea22f 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -1,56 +1,89 @@
 import Config

 config :logger, level: :debug
+
 config :logger, :console,
   format: "$date $time [$level] $metadata$message\n",
   metadata: :all

 config :phoenix, :json_library, Jason

 # General application configuration
 config :nola, namespace: Nola

 config :nola, :data_path, "priv"

 config :nola, :brand,
   name: "Nola",
   source_url: "https://phab.random.sh/source/Nola/",
   owner: "Ashamed Owner",
   owner_email: "do@not.mail.him"

 config :ex_aws,
   region: "us-east-1",
   host: "s3.wasabisys.com",
   s3: [
     host: "s3.wasabisys.com",
     region: "us-east-1",
     scheme: "https://"
   ]

 # Configures the endpoint
 config :nola, NolaWeb.Endpoint,
   url: [host: "localhost"],
   secret_key_base: "cAFb7x2p/D7PdV8/C6Os18uygoD0FVQh3efNEFc5+5L529q3dofZtZye/BG12MRZ",
   render_errors: [view: NolaWeb.ErrorView, accepts: ~w(html json)],
   server: true,
   live_view: [signing_salt: "CHANGE_ME_FFS"],
-  pubsub: [name: NolaWeb.PubSub,
-           adapter: Phoenix.PubSub.PG2]
+  pubsub: [name: NolaWeb.PubSub, adapter: Phoenix.PubSub.PG2]

 config :mime, :types, %{"text/event-stream" => ["sse"]}

 config :nola, :lastfm, api_key: "x", api_secret: "x"

 config :nola, :youtube,
   api_key: "x",
   invidious: "yewtu.be"

 config :mnesia,
-  dir: '.mnesia/#{Mix.env}/#{node()}'
+  dir: '.mnesia/#{Mix.env()}/#{node()}'
+
+config :nola, Nola.Plugins.Link,
+  proxy: nil,
+  scraper: [
+    service: "usescraper",
+    config: [
+      api_key: "xxxx",
+      http_options: [
+        timeout: :timer.seconds(120),
+        recv_timeout: :timer.seconds(120)
+      ]
+    ]
+  ],
+  store: [
+    ttl: :timer.hours(24),
+    inhibit: :timer.hours(16),
+    interval: :timer.minutes(30)
+  ],
+  handlers: [
+    "Nola.Plugins.Link.Image": [],
+    "Nola.Plugins.Link.HTML": [],
+    "Nola.Plugins.Link.PDF": [],
+    "Nola.Plugins.Link.YouTube": [
+      invidious: true
+    ],
+    "Nola.Plugins.Link.Twitter": [
+      expand_quoted: true
+    ],
+    "Nola.Plugins.Link.Imgur": [],
+    "Nola.Plugins.Link.Github": [],
+    "Nola.Plugins.Link.Reddit": [],
+    "Nola.Plugins.Link.ImgDebridLink": []
+  ]

 # Import environment specific config. This must remain at the bottom
 # of this file so it overrides the configuration defined above.
-import_config "#{Mix.env}.exs"
+import_config "#{Mix.env()}.exs"
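The new `Nola.Plugins.Link` block drives the cache and anti-repeat timers introduced by this patch (all values are milliseconds, via the `:timer` helpers), and `handlers` is an ordered keyword list: the first module whose `match/2` returns `{true, params}` wins. A minimal sketch of reading these values at runtime; the module name and the fallback defaults are illustrative assumptions, only `Application.get_env/3` and the keys configured above come from the patch:

```elixir
defmodule StoreConfig do
  # Fetch the Link plugin's store settings, falling back to defaults
  # (the fallback values here are assumptions, not part of the patch).
  def ttl, do: get(:ttl, :timer.hours(24))
  def inhibit, do: get(:inhibit, :timer.hours(16))

  defp get(key, default) do
    :nola
    |> Application.get_env(Nola.Plugins.Link, [])
    |> Keyword.get(:store, [])
    |> Keyword.get(key, default)
  end
end
```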
-import_config "#{Mix.env}.exs" +import_config "#{Mix.env()}.exs" diff --git a/lib/nola/application.ex b/lib/nola/application.ex index fc51c2c..73cc80b 100644 --- a/lib/nola/application.ex +++ b/lib/nola/application.ex @@ -1,60 +1,61 @@ defmodule Nola.Application do use Application def start(_type, _args) do import Supervisor.Spec Nola.Plugins.setup() :ok = Nola.Matrix.setup() :ok = Nola.TelegramRoom.setup() # Define workers and child supervisors to be supervised children = [ supervisor(NolaWeb.Endpoint, []), worker(Registry, [[keys: :duplicate, name: Nola.BroadcastRegistry]], id: :registry_broadcast ), worker(Nola.IcecastAgent, []), worker(Nola.Token, []), worker(Nola.AuthToken, []), Nola.Subnet, {GenMagic.Pool, [name: Nola.GenMagic, pool_size: 2]}, worker(Registry, [[keys: :duplicate, name: Nola.PubSub]], id: :registry_nola_pubsub), worker(Nola.Membership, []), worker(Nola.Account, []), worker(Nola.UserTrack.Storage, []), worker(Nola.Plugins.Account, []), + worker(Nola.Plugins.Link.Store, []), supervisor(Nola.Plugins.Supervisor, [], name: Nola.Plugins.Supervisor) ] ++ Nola.Irc.application_childs() ++ Nola.Matrix.application_childs() opts = [strategy: :one_for_one, name: Nola.Supervisor] sup = Supervisor.start_link(children, opts) start_telegram() Nola.Plugins.start_all() spawn_link(fn -> Nola.Irc.after_start() end) spawn_link(fn -> Nola.Matrix.after_start() end) spawn_link(fn -> Nola.TelegramRoom.after_start() end) sup end def config_change(changed, _new, removed) do NolaWeb.Endpoint.config_change(changed, removed) :ok end defp start_telegram() do token = Keyword.get(Application.get_env(:nola, :telegram, []), :key) options = [ username: Keyword.get(Application.get_env(:nola, :telegram, []), :nick, "beauttebot"), purge: false ] telegram = Telegram.Bot.ChatBot.Supervisor.start_link({Nola.Telegram, token, options}) end end diff --git a/lib/plugins/link.ex b/lib/plugins/link.ex index 89fe944..bdc0fe9 100644 --- a/lib/plugins/link.ex +++ b/lib/plugins/link.ex @@ -1,357 +1,381 @@ defmodule Nola.Plugins.Link do @moduledoc """ # Link Previewer An extensible link previewer for IRC. To extend the supported sites, create a new handler implementing the callbacks. See `link/` directory. The first in list handler that returns true to the `match/2` callback will be used, and if the handler returns `:error` or crashes, will fallback to the default preview. Unsupported websites will use the default link preview method, which is for html document the title, otherwise it'll use the mimetype and size. ## Configuration: ``` config :nola, Nola.Plugins.Link, handlers: [ Nola.Plugins.Link.Youtube: [ invidious: true ], Nola.Plugins.Link.Twitter: [], Nola.Plugins.Link.Imgur: [], ] ``` """ @ircdoc """ # Link preview Previews links (just post a link!). Announces real URL after redirections and provides extended support for YouTube, Twitter and Imgur. 
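The new `Nola.Plugins.Link.Store` child uses the deprecated `Supervisor.Spec.worker/2` helper for consistency with the surrounding list; `worker(Mod, [])` ends up calling `Mod.start_link()`, which matches the zero-arity `start_link/0` the store defines. For reference, a sketch of the modern child-spec form this list could migrate to, assuming a `start_link/1` is added (the tuple form passes one argument through `child_spec/1`):

```elixir
children = [
  # Module-based child specs; each expands via Mod.child_spec/1.
  NolaWeb.Endpoint,
  {Registry, keys: :duplicate, name: Nola.PubSub},
  {Nola.Plugins.Link.Store, []}
]

Supervisor.start_link(children, strategy: :one_for_one, name: Nola.Supervisor)
```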
""" def short_irc_doc, do: false def irc_doc, do: @ircdoc require Logger alias __MODULE__.Store alias __MODULE__.Scraper def start_link() do GenServer.start_link(__MODULE__, [], name: __MODULE__) end @callback match(uri :: URI.t(), options :: Keyword.t()) :: {true, params :: Map.t()} | false @callback expand(uri :: URI.t(), params :: Map.t(), options :: Keyword.t()) :: {:ok, lines :: [] | String.t()} | :error @callback post_match(uri :: URI.t(), content_type :: binary, headers :: [], opts :: Keyword.t()) :: {:body | :file, params :: Map.t()} | false @callback post_expand( uri :: URI.t(), body :: binary() | Path.t(), params :: Map.t(), options :: Keyword.t() ) :: {:ok, lines :: [] | String.t()} | :error @optional_callbacks [expand: 3, post_expand: 4] defstruct [:client] def init([]) do - Store.setup() {:ok, _} = Registry.register(Nola.PubSub, "messages", plugin: __MODULE__) # {:ok, _} = Registry.register(Nola.PubSub, "messages:telegram", [plugin: __MODULE__]) Logger.info("Link handler started") {:ok, %__MODULE__{}} end def handle_info({:irc, :text, message = %{text: text}}, state) do String.split(text) |> Enum.map(fn word -> if String.starts_with?(word, "http://") || String.starts_with?(word, "https://") do uri = URI.parse(word) if uri.scheme && uri.host do - spawn(fn -> - :timer.kill_after(:timer.seconds(30)) - - case expand_link([uri]) do - {:ok, uris, text} -> - text = - case uris do - [uri] -> - text - - [luri | _] -> - if luri.host == uri.host && luri.path == uri.path do - text - else - ["-> #{URI.to_string(luri)}", text] - end - end - - case text do - lines when is_list(lines) -> - for text <- lines, do: message.replyfun.(text) - - text when is_binary(text) -> - message.replyfun.(text) - - nil -> - nil - end - - _ -> - nil - end - end) + if Store.inhibit_link?(word, {message.network, message.channel}) do + Logger.debug("link inhibited #{word}") + else + handle_link(word, uri, message) + end end end end) {:noreply, state} end def handle_info(msg, state) do {:noreply, state} end def terminate(_reason, state) do :ok end + def handle_link(url, uri, message) do + spawn(fn -> + :timer.kill_after(:timer.seconds(30)) + + store = Store.get_link(url) + + case store || expand_link([uri]) do + {:ok, uris, text} = save -> + text = + case uris do + [uri] -> + text + + [luri | _] -> + if luri.host == uri.host && luri.path == uri.path do + text + else + ["-> #{URI.to_string(luri)}", text] + end + end + + case text do + lines when is_list(lines) -> + for text <- lines, do: message.replyfun.(text) + if !store, do: Store.insert_link(url, save) + Store.witness_link(url, {message.network, message.channel}) + + text when is_binary(text) -> + message.replyfun.(text) + if !store, do: Store.insert_link(url, save) + Store.witness_link(url, {message.network, message.channel}) + + nil -> + nil + end + + _ -> + nil + end + end) + end + # 1. Match the first valid handler # 2. Try to run the handler # 3. If :error or crash, default link. # If :skip, nothing # 4. ? # Over five redirections: cancel. 
   def expand_link(acc = [_, _, _, _, _ | _]) do
     {:ok, acc, "link redirects more than five times"}
   end

   def expand_link(acc = [uri | _]) do
     Logger.debug("link: expanding: #{inspect(uri)}")
     handlers = Keyword.get(Application.get_env(:nola, __MODULE__, handlers: []), :handlers)

     handler =
       Enum.reduce_while(handlers, nil, fn {module, opts}, acc ->
-        Logger.debug("link: attempt expanding: #{inspect(module)} for #{inspect(uri)}")
         module = Module.concat([module])

         case module.match(uri, opts) do
-          {true, params} -> {:halt, {module, params, opts}}
-          false -> {:cont, acc}
+          {true, params} ->
+            Logger.debug("link: will expand with #{inspect(module)} for #{inspect(uri)}")
+            {:halt, {module, params, opts}}
+
+          false ->
+            {:cont, acc}
         end
       end)

     run_expand(acc, handler)
   end

   def run_expand(acc, nil) do
     expand_default(acc)
   end

   def run_expand(acc = [uri | _], {module, params, opts}) do
-    Logger.debug("link: expanding #{inspect(uri)} with #{inspect(module)}")
-
     case module.expand(uri, params, opts) do
-      {:ok, data} -> {:ok, acc, data}
-      :error -> expand_default(acc)
-      :skip -> nil
+      {:ok, data} ->
+        Logger.debug("link: expanded #{inspect(uri)} with #{inspect(module)}")
+        {:ok, acc, data}
+
+      :error ->
+        Logger.error("Error expanding URL #{uri} with #{inspect(module)}")
+        expand_default(acc)
+
+      :skip ->
+        nil
     end
   rescue
     e ->
       Logger.error("link: rescued #{inspect(uri)} with #{inspect(module)}: #{inspect(e)}")
       Logger.error(Exception.format(:error, e, __STACKTRACE__))
       expand_default(acc)
   catch
     e, b ->
       Logger.error("link: catched #{inspect(uri)} with #{inspect(module)}: #{inspect({e, b})}")
       expand_default(acc)
   end

   defp get(url, headers \\ [], options \\ []) do
     get_req(url, :hackney.get(url, headers, <<>>, options))
   end

   defp get_req(_, {:error, reason}) do
     {:error, reason}
   end

   defp get_req(url, {:ok, 200, headers, client}) do
     headers =
       Enum.reduce(headers, %{}, fn {key, value}, acc ->
         Map.put(acc, String.downcase(key), value)
       end)

     content_type = Map.get(headers, "content-type", "application/octect-stream")
     length = Map.get(headers, "content-length", "0")
     {length, _} = Integer.parse(length)

     handlers = Keyword.get(Application.get_env(:nola, __MODULE__, handlers: []), :handlers)

     handler =
       Enum.reduce_while(handlers, false, fn {module, opts}, acc ->
         module = Module.concat([module])

         try do
           case module.post_match(url, content_type, headers, opts) do
             {mode, params} when mode in [:body, :file] -> {:halt, {module, params, opts, mode}}
             false -> {:cont, acc}
           end
         rescue
           e ->
             Logger.error(inspect(e))
             {:cont, false}
         catch
           e, b ->
             Logger.error(inspect({b}))
             {:cont, false}
         end
       end)

     cond do
       handler != false and length <= 30_000_000 ->
         case get_body(url, 30_000_000, client, handler, <<>>) do
           {:ok, _} = ok ->
             ok

           :error ->
             {:ok, "file: #{content_type}, size: #{human_size(length)}"}
         end

       # String.starts_with?(content_type, "text/html") && length <= 30_000_000 ->
       #   get_body(url, 30_000_000, client, <<>>)
       true ->
         :hackney.close(client)
         {:ok, "file: #{content_type}, size: #{human_size(length)}"}
     end
   end

   defp get_req(_, {:ok, redirect, headers, client}) when redirect in 300..399 do
     headers =
       Enum.reduce(headers, %{}, fn {key, value}, acc ->
         Map.put(acc, String.downcase(key), value)
       end)

     location = Map.get(headers, "location")
     :hackney.close(client)
     {:redirect, location}
   end

-  defp get_req(_, {:ok, status, headers, client}) do
+  defp get_req(url, {:ok, status, headers, client}) do
+    Logger.error("Error fetching URL #{url} = #{status}")
     :hackney.close(client)
     {:error, status, headers}
   end
   defp get_body(url, len, client, {handler, params, opts, mode} = h, acc)
        when len >= byte_size(acc) do
     case :hackney.stream_body(client) do
       {:ok, data} ->
         get_body(url, len, client, h, <<acc::binary, data::binary>>)

       :done ->
         body =
           case mode do
             :body ->
               acc

             :file ->
               {:ok, tmpfile} = Plug.Upload.random_file("linkplugin")
               File.write!(tmpfile, acc)
               tmpfile
           end

+        Logger.debug("expanding body with #{inspect(handler)}: #{inspect(body)}")
         handler.post_expand(url, body, params, opts)

       {:error, reason} ->
         {:ok, "failed to fetch body: #{inspect(reason)}"}
     end
   end

   defp get_body(_, len, client, h, _acc) do
     :hackney.close(client)
-    IO.inspect(h)
     {:ok, "Error: file over 30"}
   end

   def expand_default(acc = [uri = %URI{scheme: scheme} | _]) when scheme in ["http", "https"] do
     Logger.debug("link: expanding #{uri} with default")

     headers = [
       {"user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"}
     ]

     proxy = Keyword.get(Application.get_env(:nola, __MODULE__, []), :proxy, nil)
     options = [follow_redirect: false, max_body_length: 30_000_000, proxy: proxy]

     url = URI.to_string(uri)

     case get(URI.to_string(uri), headers, options) do
       {:ok, text} ->
         {:ok, acc, text}

       {:redirect, link} ->
         new_uri = URI.parse(link)
-
-        # new_uri = %URI{new_uri | scheme: scheme, authority: uri.authority, host: uri.host, port: uri.port}
         expand_link([new_uri | acc])

-      {:error, status, _headers} ->
-        # text = Plug.Conn.Status.reason_phrase(status)
-        # {:ok, acc, "Error: HTTP #{text} (#{status})"}
+      {:error, status, _headers} when status in [400, 403] ->
+        Logger.warning("Was denied to fetch URL, using scraper #{url} = #{status}")
         retry_expand_with_scraper(acc, url)

+      {:error, status, _headers} ->
+        Logger.error("Error fetching URL #{url} = #{status}")
+        {:ok, acc, nil}
+
       {:error, {:tls_alert, {:handshake_failure, err}}} ->
-        # "TLS Error: #{to_string(err)}"}
+        Logger.error("Error fetching URL #{url} = TLS Error: #{to_string(err)}")
         {:ok, acc, nil}

       {:error, :timeout} ->
+        Logger.error("Error fetching URL #{url} = timeout")
         retry_expand_with_scraper(acc, url)

       {:error, reason} ->
-        # "Error: #{to_string(reason)}"}
+        Logger.error("Error fetching URL #{url} = #{to_string(reason)}")
         {:ok, acc, nil}
     end
   end

   # Unsupported scheme, came from a redirect.
   def expand_default(acc = [uri | _]) do
     {:ok, [uri], "-> #{URI.to_string(uri)}"}
   end

   # Last resort: scrape the page
   # We'll be mostly calling this when 403 or 500 or timeout because site blocks us.
   # An external service will scrape the page for us and return the body.
   # We'll call directly the HTML handler on the result.
   defp retry_expand_with_scraper(acc, url) do
     Logger.info("Attempting scraper")
     handlers = Keyword.get(Application.get_env(:nola, __MODULE__), :handlers)
     Logger.info("Attempting scraper #{inspect(handlers)}")

     with true <- Keyword.has_key?(handlers, :"Nola.Plugins.Link.HTML"),
          {:ok, body, _meta} <- Scraper.get(url),
          {:ok, text} <- __MODULE__.HTML.post_expand(url, body, nil, nil) do
       {:ok, acc, text}
     else
       error ->
         Logger.debug("Attempt with scraper failed: #{inspect(error)}")
         # We give up here. We don't return anything (the acc from caller `expand default`
         # does not matter anymore) and I see returning error messages as useless.
         {:ok, acc, nil}
     end
   end

   defp human_size(bytes) do
     bytes
     |> FileSize.new(:b)
     |> FileSize.scale()
     |> FileSize.format()
   end
 end
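The `@callback`s in this module define the handler contract the moduledoc describes: `match/2` decides whether a handler claims a URI, `expand/3` produces the preview lines. A minimal sketch of a third-party handler; the module name and the `example.org` matching are hypothetical, not part of the patch:

```elixir
defmodule Nola.Plugins.Link.Example do
  @behaviour Nola.Plugins.Link

  @impl true
  def match(%URI{host: "example.org", path: path}, _opts), do: {true, %{path: path}}
  def match(_uri, _opts), do: false

  @impl true
  def expand(_uri, %{path: path}, _opts) do
    # Return one line (or a list of lines) to post back to the channel;
    # returning :error falls back to the default preview.
    {:ok, "example.org — #{path}"}
  end
end
```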
diff --git a/lib/plugins/link/html.ex b/lib/plugins/link/html.ex
index bef9640..78f3192 100644
--- a/lib/plugins/link/html.ex
+++ b/lib/plugins/link/html.ex
@@ -1,149 +1,149 @@
 defmodule Nola.Plugins.Link.HTML do
   @behaviour Nola.Plugins.Link

   @impl true
   def match(_, _), do: false

   @impl true
   def post_match(_url, "text/html" <> _, _header, _opts), do: {:body, nil}
   def post_match(_, _, _, _), do: false

   @impl true
   def post_expand(url, body, _params, _opts) do
     {:ok, html} = Floki.parse_document(body)
     opengraph = collect_open_graph(html)

     text =
       if has_sufficient_opengraph_data?(opengraph) do
         generate_text_from_opengraph(url, html, opengraph)
       else
         clean_text(collect_title(html))
       end

     {:ok, text}
   end

   defp has_sufficient_opengraph_data?(opengraph) do
     Map.has_key?(opengraph, "title") && Map.has_key?(opengraph, "description")
   end

   defp generate_text_from_opengraph(url, html, opengraph) do
     itemprops = collect_itemprops(html)
     prefix = collect_prefix_and_site_name(url, opengraph, itemprops)
-    description = collect_description(opengraph, itemprops, 500)
+    description = collect_description(opengraph, itemprops, 400)

     [clean_text("#{prefix}#{Map.get(opengraph, "title")}")] ++ description
   end

   defp collect_title(html) do
     case Floki.find(html, "title") do
       [{"title", [], [title]} | _] -> String.trim(title)
       _ -> ""
     end
   end

   defp collect_open_graph(html) do
     Floki.find(html, "head meta")
     |> Enum.reduce(%{}, &extract_meta_tag/2)
   end

   defp extract_meta_tag({"meta", values, []}, acc) do
     with {_, name} <- List.keyfind(values, "property", 0, {nil, nil}),
          {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
          true <- is_valid_meta_tag?(name) do
       Map.put(acc, strip_prefix(name), content)
     else
       _ -> acc
     end
   end

   defp extract_meta_tag(_, acc), do: acc

   defp is_valid_meta_tag?(nil) do
     false
   end

   defp is_valid_meta_tag?(name) do
     String.starts_with?(name, "og:") || String.starts_with?(name, "article:")
   end

   defp strip_prefix("og:" <> key), do: key
   defp strip_prefix(other), do: other

   defp collect_itemprops(html) do
     Floki.find(html, "[itemprop]")
     |> Enum.reduce(%{}, &extract_itemprop/2)
   end

   defp extract_itemprop({"meta", values, []}, acc) do
     with {_, name} <- List.keyfind(values, "itemprop", 0, {nil, nil}),
          {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
          true <- String.starts_with?(name, "article:") do
       Map.put(acc, name, content)
     else
       _ -> acc
     end
   end

   defp extract_itemprop(_, acc), do: acc

   defp collect_prefix_and_site_name(url, opengraph, itemprops) do
     uri = URI.parse(url)
     site_name = Map.get(opengraph, "site_name", uri.host)
     paywall_status = get_paywall_status(opengraph, itemprops)
     section = get_section(opengraph, itemprops)

     prefix = "#{paywall_status}#{site_name}#{section}"
     if prefix == "", do: "", else: "#{prefix} — "
   end

   defp get_paywall_status(opengraph, itemprops) do
     content_tier =
       Map.get(
         opengraph,
         "article:content_tier",
         Map.get(itemprops, "article:content_tier", "free")
       )

     if content_tier == "free", do: "", else: "[paywall] "
   end

   defp get_section(opengraph, itemprops) do
     section = Map.get(opengraph, "article:section", Map.get(itemprops, "article:section"))
     if section, do: ": #{section}", else: ""
   end

   defp collect_description(opengraph, itemprops, max_length) do
     date = get_formatted_date(opengraph, itemprops)
     description = transform_description(Map.get(opengraph, "description"), max_length)

     Nola.Irc.Message.splitlong(clean_text("#{date}#{description}"))
   end

   defp get_formatted_date(opengraph, itemprops) do
     published_time =
       Map.get(
         opengraph,
         "article:published_time",
         Map.get(itemprops, "article:published_time", "")
       )

     case DateTime.from_iso8601(published_time) do
       {:ok, date, _} -> "#{Timex.format!(date, "%d/%m/%y", :strftime)}. "
       _ -> ""
     end
   end

   # TODO: Swap with AI description instead of truncating.
   defp transform_description(nil, _), do: nil

   defp transform_description(string, length) when is_binary(string) do
-    if String.length(string) >= length, do: String.truncate(string, length), else: string
+    if String.length(string) > length, do: "#{String.slice(string, 0, length)}…", else: string
   end

   defp clean_text(text) do
     text
     |> String.replace("\n", " ")
     |> HtmlEntities.decode()
   end
 end
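Two behaviour changes land in this hunk: the description cap drops from 500 to 400 characters, and over-long descriptions now get an explicit ellipsis via `String.slice/3` instead of `String.truncate/2`, which does not exist in Elixir's `String` module (the likely crash this replaces). Note that the range form `String.slice(string, 0..length)` would be end-inclusive and keep `length + 1` graphemes, hence the start/length form. A quick illustration of the boundary behaviour, using the same expressions as the hunk:

```elixir
long = String.duplicate("a", 500)

# 400-grapheme cap plus a single ellipsis character:
"#{String.slice(long, 0, 400)}…" |> String.length()
#=> 401

# Strings at or under the cap pass through untouched:
String.slice("short description", 0, 400)
#=> "short description"
```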
diff --git a/lib/plugins/link/store.ex b/lib/plugins/link/store.ex
index ea43070..4e2aa58 100644
--- a/lib/plugins/link/store.ex
+++ b/lib/plugins/link/store.ex
@@ -1,29 +1,95 @@
 defmodule Nola.Plugins.Link.Store do
+  use GenServer
+  require Logger
   require Record
   import Ex2ms

   @type url() :: String.t()

-  Record.defrecord(:link, link: nil, at: nil)
-  @type link :: record(:link, link: String.t(), at: nil)
+  Record.defrecord(:link, link: nil, result: nil, at: nil)
+  @type link :: record(:link, link: url(), result: any(), at: nil)

-  Record.defrecord(:link_entry, key: nil, at: nil)
-  @type link_entry :: record(:link_entry, key: {url(), String.t()}, at: nil)
+  Record.defrecord(:link_seen, key: nil, at: nil)
+
+  @typedoc "A `link_seen` record represents a link that has been seen at a specific time in a given context."
+  @type link_seen :: record(:link_seen, key: {url(), String.t()}, at: nil)

   def setup do
     :ets.new(:links, [:set, :public, :named_table, keypos: 2])
+    :ets.new(:links_witness, [:set, :public, :named_table, keypos: 2])
   end

-  @spec insert_link(url()) :: true
-  def insert_link(url) do
-    :ets.insert(:links, link(link: url, at: NaiveDateTime.utc_now() |> NaiveDateTime.to_unix()))
+  @spec insert_link(url(), any()) :: true
+  def insert_link(url, result) do
+    :ets.insert(
+      :links,
+      link(link: url, result: result, at: DateTime.utc_now() |> DateTime.to_unix())
+    )
   end

-  @spec get_link(url()) :: String.t() | nil
+  @spec get_link(url()) :: any() | nil
   def get_link(url) do
     case :ets.lookup(:links, url) do
-      [link] -> link
+      [link(result: result)] -> result
       [] -> nil
     end
   end
+
+  @spec inhibit_link?(url(), String.t()) :: boolean()
+  def inhibit_link?(url, key) do
+    case :ets.lookup(:links_witness, {url, key}) do
+      [_] -> true
+      [] -> false
+    end
+  end
+
+  @spec witness_link(url(), String.t()) :: :ok | :inhibit
+  def witness_link(url, key) do
+    if inhibit_link?(url, key) do
+      :inhibit
+    else
+      :ets.insert(
+        :links_witness,
+        link_seen(key: {url, key}, at: DateTime.utc_now() |> DateTime.to_unix())
+      )
+
+      :ok
+    end
+  end
+
+  def start_link(), do: GenServer.start_link(__MODULE__, [], name: __MODULE__)
+
+  @doc false
+  @impl true
+  def init(_) do
+    setup()
+    env = Keyword.fetch!(Application.get_env(:nola, Nola.Plugins.Link, []), :store)
+    :erlang.send_after(env[:interval], self(), :expire)
+    {:ok, nil}
+  end
+
+  @doc false
+  @impl true
+  def handle_info(:expire, state) do
+    env = Keyword.fetch!(Application.get_env(:nola, Nola.Plugins.Link, []), :store)
+    :erlang.send_after(env[:interval], self(), :expire)
+    ttl = env[:ttl] / 1000
+    inhibit = env[:inhibit] / 1000
+    now = DateTime.utc_now() |> DateTime.to_unix()
+
+    links_evicted =
+      :ets.select_delete(:links, [
+        {{:_, :_, :_, :"$1"}, [{:<, :"$1", now - ttl}], [true]}
+      ])
+
+    witness_evicted =
+      :ets.select_delete(:links_witness, [
+        {{:_, :_, :"$1"}, [{:<, :"$1", now - inhibit}], [true]}
+      ])
+
+    Logger.debug("evicted #{links_evicted} links and #{witness_evicted} witnesses")
+
+    {:noreply, state}
+  end
 end
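Taken together, the store gives the Link plugin a result cache (`:links`, evicted after `ttl`) and a per-context anti-repeat table (`:links_witness`, evicted after `inhibit`). A sketch of the expected call sequence, mirroring how `handle_link/3` uses it; the URL and the `{network, channel}` context are illustrative:

```elixir
url = "https://example.org/article"
context = {"libera", "#nola"}

Nola.Plugins.Link.Store.inhibit_link?(url, context)
#=> false — first sighting in this channel, so the preview runs

result = {:ok, [URI.parse(url)], "Example — An article"}
Nola.Plugins.Link.Store.insert_link(url, result)
Nola.Plugins.Link.Store.witness_link(url, context)
#=> :ok

Nola.Plugins.Link.Store.get_link(url)
#=> the cached result tuple, reused instead of re-fetching

Nola.Plugins.Link.Store.inhibit_link?(url, context)
#=> true — the same link is not previewed again here until eviction
```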