rich_media: don't reattempt parsing on rejected URLs
This commit is contained in:
		
							parent
							
								
									92544e8f99
								
							
						
					
					
						commit
						280652651c
					
				
					 3 changed files with 33 additions and 13 deletions
				
			
		|  | @ -57,6 +57,10 @@ def run(%{"url" => url, "url_hash" => url_hash} = args) do | ||||||
|         Logger.debug("Rich media error for #{url}: :content_type is #{type}") |         Logger.debug("Rich media error for #{url}: :content_type is #{type}") | ||||||
|         negative_cache(url_hash, :timer.minutes(30)) |         negative_cache(url_hash, :timer.minutes(30)) | ||||||
| 
 | 
 | ||||||
|  |       {:error, {:url, reason}} -> | ||||||
|  |         Logger.debug("Rich media error for #{url}: refusing URL #{inspect(reason)}") | ||||||
|  |         negative_cache(url_hash, :timer.minutes(180)) | ||||||
|  | 
 | ||||||
|       e -> |       e -> | ||||||
|         Logger.debug("Rich media error for #{url}: #{inspect(e)}") |         Logger.debug("Rich media error for #{url}: #{inspect(e)}") | ||||||
|         {:error, e} |         {:error, e} | ||||||
|  |  | ||||||
|  | @ -16,12 +16,13 @@ def parse(nil), do: nil | ||||||
|   @spec parse(String.t()) :: {:ok, map()} | {:error, any()} |   @spec parse(String.t()) :: {:ok, map()} | {:error, any()} | ||||||
|   def parse(url) do |   def parse(url) do | ||||||
|     with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])}, |     with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])}, | ||||||
|          :ok <- validate_page_url(url), |          {_, :ok} <- {:url, validate_page_url(url)}, | ||||||
|          {:ok, data} <- parse_url(url) do |          {:ok, data} <- parse_url(url) do | ||||||
|       data = Map.put(data, "url", url) |       data = Map.put(data, "url", url) | ||||||
|       {:ok, data} |       {:ok, data} | ||||||
|     else |     else | ||||||
|       {:config, _} -> {:error, :rich_media_disabled} |       {:config, _} -> {:error, :rich_media_disabled} | ||||||
|  |       {:url, {:error, reason}} -> {:error, {:url, reason}} | ||||||
|       e -> e |       e -> e | ||||||
|     end |     end | ||||||
|   end |   end | ||||||
|  | @ -62,7 +63,7 @@ defp clean_parsed_data(data) do | ||||||
|     |> Map.new() |     |> Map.new() | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   @spec validate_page_url(URI.t() | binary()) :: :ok | :error |   @spec validate_page_url(URI.t() | binary()) :: :ok | {:error, term()} | ||||||
|   defp validate_page_url(page_url) when is_binary(page_url) do |   defp validate_page_url(page_url) when is_binary(page_url) do | ||||||
|     validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld]) |     validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld]) | ||||||
| 
 | 
 | ||||||
|  | @ -74,20 +75,20 @@ defp validate_page_url(page_url) when is_binary(page_url) do | ||||||
|   defp validate_page_url(%URI{host: host, scheme: "https"}) do |   defp validate_page_url(%URI{host: host, scheme: "https"}) do | ||||||
|     cond do |     cond do | ||||||
|       Linkify.Parser.ip?(host) -> |       Linkify.Parser.ip?(host) -> | ||||||
|         :error |         {:error, :ip} | ||||||
| 
 | 
 | ||||||
|       host in @config_impl.get([:rich_media, :ignore_hosts], []) -> |       host in @config_impl.get([:rich_media, :ignore_hosts], []) -> | ||||||
|         :error |         {:error, :ignore_hosts} | ||||||
| 
 | 
 | ||||||
|       get_tld(host) in @config_impl.get([:rich_media, :ignore_tld], []) -> |       get_tld(host) in @config_impl.get([:rich_media, :ignore_tld], []) -> | ||||||
|         :error |         {:error, :ignore_tld} | ||||||
| 
 | 
 | ||||||
|       true -> |       true -> | ||||||
|         :ok |         :ok | ||||||
|     end |     end | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   defp validate_page_url(_), do: :error |   defp validate_page_url(_), do: {:error, "scheme mismatch"} | ||||||
| 
 | 
 | ||||||
|   defp parse_uri(true, url) do |   defp parse_uri(true, url) do | ||||||
|     url |     url | ||||||
|  | @ -95,7 +96,7 @@ defp parse_uri(true, url) do | ||||||
|     |> validate_page_url |     |> validate_page_url | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   defp parse_uri(_, _), do: :error |   defp parse_uri(_, _), do: {:error, "not an URL"} | ||||||
| 
 | 
 | ||||||
|   defp get_tld(host) do |   defp get_tld(host) do | ||||||
|     host |     host | ||||||
|  |  | ||||||
|  | @ -109,25 +109,40 @@ test "does a HEAD request to check if the body is html" do | ||||||
| 
 | 
 | ||||||
|   test "refuses to crawl incomplete URLs" do |   test "refuses to crawl incomplete URLs" do | ||||||
|     url = "example.com/ogp" |     url = "example.com/ogp" | ||||||
|     assert :error == Parser.parse(url) |     assert {:error, {:url, "scheme mismatch"}} == Parser.parse(url) | ||||||
|  |   end | ||||||
|  | 
 | ||||||
|  |   test "refuses to crawl plain HTTP and other scheme URL" do | ||||||
|  |     [ | ||||||
|  |       "http://example.com/ogp", | ||||||
|  |       "ftp://example.org/dist/" | ||||||
|  |     ] | ||||||
|  |     |> Enum.each(fn url -> | ||||||
|  |       res = Parser.parse(url) | ||||||
|  | 
 | ||||||
|  |       assert {:error, {:url, "scheme mismatch"}} == res or | ||||||
|  |                {:error, {:url, "not an URL"}} == res | ||||||
|  |     end) | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   test "refuses to crawl malformed URLs" do |   test "refuses to crawl malformed URLs" do | ||||||
|     url = "example.com[]/ogp" |     url = "example.com[]/ogp" | ||||||
|     assert :error == Parser.parse(url) |     assert {:error, {:url, "not an URL"}} == Parser.parse(url) | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   test "refuses to crawl URLs of private network from posts" do |   test "refuses to crawl URLs of private network from posts" do | ||||||
|     [ |     [ | ||||||
|       "http://127.0.0.1:4000/notice/9kCP7VNyPJXFOXDrgO", |       "https://127.0.0.1:4000/notice/9kCP7VNyPJXFOXDrgO", | ||||||
|       "https://10.111.10.1/notice/9kCP7V", |       "https://10.111.10.1/notice/9kCP7V", | ||||||
|       "https://172.16.32.40/notice/9kCP7V", |       "https://172.16.32.40/notice/9kCP7V", | ||||||
|       "https://192.168.10.40/notice/9kCP7V", |       "https://192.168.10.40/notice/9kCP7V" | ||||||
|       "https://pleroma.local/notice/9kCP7V" |  | ||||||
|     ] |     ] | ||||||
|     |> Enum.each(fn url -> |     |> Enum.each(fn url -> | ||||||
|       assert :error == Parser.parse(url) |       assert {:error, {:url, :ip}} == Parser.parse(url) | ||||||
|     end) |     end) | ||||||
|  | 
 | ||||||
|  |     url = "https://pleroma.local/notice/9kCP7V" | ||||||
|  |     assert {:error, {:url, :ignore_tld}} == Parser.parse(url) | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   test "returns error when disabled" do |   test "returns error when disabled" do | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Oneric
						Oneric