diff options
4 files changed, 97 insertions, 168 deletions
diff --git a/lib/pleroma/web/rich_media/parser/meta_tags.ex b/lib/pleroma/web/rich_media/parser/meta_tags.ex index 888ac3fc4..e5c6b448d 100644 --- a/lib/pleroma/web/rich_media/parser/meta_tags.ex +++ b/lib/pleroma/web/rich_media/parser/meta_tags.ex @@ -3,7 +3,7 @@ # SPDX-License-Identifier: AGPL-3.0-only defmodule Pleroma.Web.RichMedia.Parser.MetaTags do - @moduledoc """ + @doc """ Parses a `Floki.html_tree/0` and returns a map of raw `<meta>` tag values. """ @spec parse(html_tree :: Floki.html_tree()) :: map() diff --git a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex deleted file mode 100644 index 5375037b3..000000000 --- a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex +++ /dev/null @@ -1,41 +0,0 @@ -# Pleroma: A lightweight social networking server -# Copyright © 2017-2021 Pleroma Authors <https://pleroma.social/> -# SPDX-License-Identifier: AGPL-3.0-only - -defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do - def parse(html, prefix, key_name, value_name \\ "content") do - html - |> get_elements(key_name, prefix) - |> Enum.reduce(%{}, fn el, acc -> - attributes = normalize_attributes(el, key_name, value_name) - Map.merge(acc, attributes) - end) - end - - defp get_elements(html, key_names, prefix) when is_list(key_names) do - Enum.reduce(key_names, [], fn key_name, acc -> - acc ++ Floki.find(html, "meta[#{key_name}^='#{prefix}:']") - end) - end - - defp get_elements(html, key_name, prefix) do - get_elements(html, [key_name], prefix) - end - - defp normalize_attributes(html_node, key_names, value_name) when is_list(key_names) do - {_tag, attributes, _children} = html_node - data = Map.new(attributes) - - Enum.reduce(key_names, %{}, fn key_name, acc -> - if data[key_name], do: Map.put(acc, data[key_name], data[value_name]), else: acc - end) - end - - defp normalize_attributes(html_node, key_name, value_name) do - normalize_attributes(html_node, [key_name], value_name) - end - - def get_page_title(html) do - Floki.find(html, "html head title") |> List.first() |> Floki.text() - end -end diff --git a/lib/pleroma/web/rich_media/parsers/twitter_card.ex b/lib/pleroma/web/rich_media/parsers/twitter_card.ex index c10b9370e..a892d16ea 100644 --- a/lib/pleroma/web/rich_media/parsers/twitter_card.ex +++ b/lib/pleroma/web/rich_media/parsers/twitter_card.ex @@ -3,13 +3,21 @@ # SPDX-License-Identifier: AGPL-3.0-only defmodule Pleroma.Web.RichMedia.Parsers.TwitterCard do - alias Pleroma.Web.RichMedia.Parsers.MetaTagsParser + alias Pleroma.Web.RichMedia.Parser.MetaTags - @spec parse(list(), map()) :: map() + @spec parse(Floki.html_tree(), map()) :: map() def parse(html, data) do data - |> Map.put(:title, MetaTagsParser.get_page_title(html)) - |> Map.put(:opengraph, MetaTagsParser.parse(html, "og", "property")) - |> Map.put(:twitter, MetaTagsParser.parse(html, "twitter", ["name", "property"])) + |> Map.put(:title, get_page_title(html)) + |> Map.put(:meta, MetaTags.parse(html)) + end + + def get_page_title(html) do + with [node | _] <- Floki.find(html, "html head title"), + title when is_binary(title) and title != "" <- Floki.text(node) do + title + else + _ -> nil + end end end diff --git a/test/pleroma/web/rich_media/parsers/twitter_card_test.exs b/test/pleroma/web/rich_media/parsers/twitter_card_test.exs index 909ce636d..1d2aa558e 100644 --- a/test/pleroma/web/rich_media/parsers/twitter_card_test.exs +++ b/test/pleroma/web/rich_media/parsers/twitter_card_test.exs @@ -6,8 +6,10 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do use ExUnit.Case, async: true alias Pleroma.Web.RichMedia.Parsers.TwitterCard - test "returns error when html not contains twitter card" do - assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) == %{} + test "fails gracefully with barebones HTML" do + html = [{"html", [], [{"head", [], []}, {"body", [], []}]}] + expected = %{meta: %{}, title: nil} + assert TwitterCard.parse(html, %{}) == expected end test "parses twitter card with only name attributes" do @@ -15,29 +17,24 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html") |> Floki.parse_document!() - expected = %{ - title: - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", - twitter: %{ - "twitter:app:id:googleplay" => "com.nytimes.android", - "twitter:app:name:googleplay" => "NYTimes", - "twitter:app:url:googleplay" => "nytimes://reader/id/100000006583622", - "twitter:site" => nil - }, - opengraph: %{ - "og:description" => - "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - "og:image" => - "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", - "og:title" => - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - "og:type" => "article", - "og:url" => - "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" - } - } - - assert TwitterCard.parse(html, %{}) == expected + assert %{ + title: + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", + meta: %{ + "twitter:app:id:googleplay" => "com.nytimes.android", + "twitter:app:name:googleplay" => "NYTimes", + "twitter:app:url:googleplay" => "nytimes://reader/id/100000006583622", + "og:description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "og:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", + "og:title" => + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "og:type" => "article", + "og:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" + } + } = TwitterCard.parse(html, %{}) end test "parses twitter card with only property attributes" do @@ -45,35 +42,31 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html") |> Floki.parse_document!() - expected = %{ - title: - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", - twitter: %{ - "twitter:card" => "summary_large_image", - "twitter:description" => - "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - "twitter:image" => - "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", - "twitter:image:alt" => "", - "twitter:title" => - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - "twitter:url" => - "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" - }, - opengraph: %{ - "og:description" => - "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - "og:image" => - "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", - "og:title" => - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - "og:url" => - "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", - "og:type" => "article" - } - } - - assert TwitterCard.parse(html, %{}) == expected + assert %{ + title: + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", + meta: %{ + "twitter:card" => "summary_large_image", + "twitter:description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "twitter:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", + "twitter:image:alt" => "", + "twitter:title" => + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "twitter:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + "og:description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "og:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", + "og:title" => + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "og:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + "og:type" => "article" + } + } = TwitterCard.parse(html, %{}) end test "parses twitter card with name & property attributes" do @@ -81,84 +74,53 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html") |> Floki.parse_document!() - expected = %{ - title: - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", - twitter: %{ - "twitter:app:id:googleplay" => "com.nytimes.android", - "twitter:app:name:googleplay" => "NYTimes", - "twitter:app:url:googleplay" => "nytimes://reader/id/100000006583622", - "twitter:card" => "summary_large_image", - "twitter:description" => - "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - "twitter:image" => - "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", - "twitter:image:alt" => "", - "twitter:site" => nil, - "twitter:title" => - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - "twitter:url" => - "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" - }, - opengraph: %{ - "og:description" => - "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - "og:image" => - "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", - "og:title" => - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - "og:url" => - "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", - "og:type" => "article" - } - } - - assert TwitterCard.parse(html, %{}) == expected + assert %{ + title: + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", + meta: %{ + "twitter:app:id:googleplay" => "com.nytimes.android", + "twitter:app:name:googleplay" => "NYTimes", + "twitter:app:url:googleplay" => "nytimes://reader/id/100000006583622", + "twitter:card" => "summary_large_image", + "twitter:description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "twitter:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", + "twitter:image:alt" => "", + "twitter:title" => + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "twitter:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + "og:description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "og:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", + "og:title" => + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "og:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + "og:type" => "article" + } + } = TwitterCard.parse(html, %{}) end test "respect only first title tag on the page" do - image_path = - "https://assets.atlasobscura.com/media/W1siZiIsInVwbG9hZHMvYXNzZXRzLzkwYzgyMzI4LThlMDUtNGRiNS05MDg3LTUzMGUxZTM5N2RmMmVkOTM5ZDM4MGM4OTIx" <> - "YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <> - "yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg" - html = File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!() - assert TwitterCard.parse(html, %{}) == - %{ - "site" => "@atlasobscura", - "title" => "The Missing Grave of Margaret Corbin, Revolutionary War Veteran", - "card" => "summary_large_image", - "image" => image_path, - "description" => - "She's the only woman veteran honored with a monument at West Point. But where was she buried?", - "site_name" => "Atlas Obscura", - "type" => "article", - "url" => "http://www.atlasobscura.com/articles/margaret-corbin-grave-west-point" - } + expected = "The Missing Grave of Margaret Corbin, Revolutionary War Veteran - Atlas Obscura" + + assert %{title: ^expected} = TwitterCard.parse(html, %{}) end - test "takes first founded title in html head if there is html markup error" do + test "takes first title found in html head if there is an html markup error" do html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html") |> Floki.parse_document!() - assert TwitterCard.parse(html, %{}) == - %{ - "site" => nil, - "title" => - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - "app:id:googleplay" => "com.nytimes.android", - "app:name:googleplay" => "NYTimes", - "app:url:googleplay" => "nytimes://reader/id/100000006583622", - "description" => - "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - "image" => - "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", - "type" => "article", - "url" => - "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" - } + expected = + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times" + + assert %{title: ^expected} = TwitterCard.parse(html, %{}) end end |