From 63af654688e5f5bbd54e001a62e3fbc20f117dea Mon Sep 17 00:00:00 2001 From: Alex Gleason Date: Tue, 4 May 2021 13:31:02 -0500 Subject: Update TwitterCard tests and add Parser.MetaTags --- lib/pleroma/web/rich_media/parser.ex | 7 +- lib/pleroma/web/rich_media/parser/meta_tags.ex | 30 +++++ .../web/rich_media/parsers/meta_tags_parser.ex | 41 +++---- lib/pleroma/web/rich_media/parsers/o_embed.ex | 6 +- lib/pleroma/web/rich_media/parsers/twitter_card.ex | 7 +- .../web/rich_media/parser/meta_tags_test.exs | 81 +++++++++++++ .../web/rich_media/parsers/twitter_card_test.exs | 133 +++++++++++++-------- 7 files changed, 222 insertions(+), 83 deletions(-) create mode 100644 lib/pleroma/web/rich_media/parser/meta_tags.ex create mode 100644 test/pleroma/web/rich_media/parser/meta_tags_test.exs diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex index dee0f61ac..d628513be 100644 --- a/lib/pleroma/web/rich_media/parser.ex +++ b/lib/pleroma/web/rich_media/parser.ex @@ -164,11 +164,8 @@ defmodule Pleroma.Web.RichMedia.Parser do end defp maybe_parse(html) do - Enum.reduce_while(parsers(), %{}, fn parser, acc -> - case parser.parse(html, acc) do - data when data != %{} -> {:halt, data} - _ -> {:cont, acc} - end + Enum.reduce(parsers(), %{}, fn parser, acc -> + parser.parse(html, acc) end) end diff --git a/lib/pleroma/web/rich_media/parser/meta_tags.ex b/lib/pleroma/web/rich_media/parser/meta_tags.ex new file mode 100644 index 000000000..888ac3fc4 --- /dev/null +++ b/lib/pleroma/web/rich_media/parser/meta_tags.ex @@ -0,0 +1,30 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2021 Pleroma Authors +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Parser.MetaTags do + @moduledoc """ + Parses a `Floki.html_tree/0` and returns a map of raw `<meta>` tag values. 
+ """ + @spec parse(html_tree :: Floki.html_tree()) :: map() + def parse(html_tree) do + html_tree + |> Floki.find("meta") + |> Enum.reduce(%{}, fn html_node, acc -> + case parse_node(html_node) do + {:ok, {name, content}} -> Map.put(acc, name, content) + _ -> acc + end + end) + end + + defp parse_node({_tag, attrs, _children}) when is_list(attrs) do + case Map.new(attrs) do + %{"name" => name, "content" => content} -> {:ok, {name, content}} + %{"property" => name, "content" => content} -> {:ok, {name, content}} + _ -> {:error, :invalid_meta_tag} + end + end + + defp parse_node(_), do: {:error, :invalid_meta_tag} +end diff --git a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex index 31c3d1e33..5375037b3 100644 --- a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex +++ b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex @@ -3,44 +3,39 @@ # SPDX-License-Identifier: AGPL-3.0-only defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do - def parse(data, html, prefix, key_name, value_name \\ "content") do + def parse(html, prefix, key_name, value_name \\ "content") do html |> get_elements(key_name, prefix) - |> Enum.reduce(data, fn el, acc -> - attributes = normalize_attributes(el, prefix, key_name, value_name) - + |> Enum.reduce(%{}, fn el, acc -> + attributes = normalize_attributes(el, key_name, value_name) Map.merge(acc, attributes) end) - |> maybe_put_title(html) + end + + defp get_elements(html, key_names, prefix) when is_list(key_names) do + Enum.reduce(key_names, [], fn key_name, acc -> + acc ++ Floki.find(html, "meta[#{key_name}^='#{prefix}:']") + end) end defp get_elements(html, key_name, prefix) do - html |> Floki.find("meta[#{key_name}^='#{prefix}:']") + get_elements(html, [key_name], prefix) end - defp normalize_attributes(html_node, prefix, key_name, value_name) do + defp normalize_attributes(html_node, key_names, value_name) when is_list(key_names) do {_tag, attributes, 
_children} = html_node + data = Map.new(attributes) - data = - Map.new(attributes, fn {name, value} -> - {name, String.trim_leading(value, "#{prefix}:")} - end) - - %{data[key_name] => data[value_name]} + Enum.reduce(key_names, %{}, fn key_name, acc -> + if data[key_name], do: Map.put(acc, data[key_name], data[value_name]), else: acc + end) end - defp maybe_put_title(%{"title" => _} = meta, _), do: meta - - defp maybe_put_title(meta, html) when meta != %{} do - case get_page_title(html) do - "" -> meta - title -> Map.put_new(meta, "title", title) - end + defp normalize_attributes(html_node, key_name, value_name) do + normalize_attributes(html_node, [key_name], value_name) end - defp maybe_put_title(meta, _), do: meta - - defp get_page_title(html) do + def get_page_title(html) do Floki.find(html, "html head title") |> List.first() |> Floki.text() end end diff --git a/lib/pleroma/web/rich_media/parsers/o_embed.ex b/lib/pleroma/web/rich_media/parsers/o_embed.ex index 8dc378c3c..4f7cd1260 100644 --- a/lib/pleroma/web/rich_media/parsers/o_embed.ex +++ b/lib/pleroma/web/rich_media/parsers/o_embed.ex @@ -3,13 +3,13 @@ # SPDX-License-Identifier: AGPL-3.0-only defmodule Pleroma.Web.RichMedia.Parsers.OEmbed do - def parse(html, _data) do + def parse(html, data) do with elements = [_ | _] <- get_discovery_data(html), oembed_url when is_binary(oembed_url) <- get_oembed_url(elements), {:ok, oembed_data} <- get_oembed_data(oembed_url) do - oembed_data + Map.put(data, :oembed, oembed_data) else - _e -> %{} + _e -> data end end diff --git a/lib/pleroma/web/rich_media/parsers/twitter_card.ex b/lib/pleroma/web/rich_media/parsers/twitter_card.ex index 31546819e..c10b9370e 100644 --- a/lib/pleroma/web/rich_media/parsers/twitter_card.ex +++ b/lib/pleroma/web/rich_media/parsers/twitter_card.ex @@ -8,9 +8,8 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCard do @spec parse(list(), map()) :: map() def parse(html, data) do data - |> MetaTagsParser.parse(html, "og", "property") - |> 
MetaTagsParser.parse(html, "twitter", "name") - |> MetaTagsParser.parse(html, "twitter", "property") - |> Map.put("type", "link") + |> Map.put(:title, MetaTagsParser.get_page_title(html)) + |> Map.put(:opengraph, MetaTagsParser.parse(html, "og", "property")) + |> Map.put(:twitter, MetaTagsParser.parse(html, "twitter", ["name", "property"])) end end diff --git a/test/pleroma/web/rich_media/parser/meta_tags_test.exs b/test/pleroma/web/rich_media/parser/meta_tags_test.exs new file mode 100644 index 000000000..128c83a95 --- /dev/null +++ b/test/pleroma/web/rich_media/parser/meta_tags_test.exs @@ -0,0 +1,81 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2021 Pleroma Authors +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Parser.MetaTagsTest do + use ExUnit.Case, async: true + alias Pleroma.Web.RichMedia.Parser.MetaTags + + test "returns a map of values" do + html = + File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html") + |> Floki.parse_document!() + + expected = %{ + "CG" => "nyregion", + "CN" => "experience-tech-and-society", + "CT" => "spotlight", + "PST" => "News", + "PT" => "article", + "SCG" => "", + "al:android:app_name" => "NYTimes", + "al:android:package" => "com.nytimes.android", + "al:android:url" => "nytimes://reader/id/100000006583622", + "al:ipad:app_name" => "NYTimes", + "al:ipad:app_store_id" => "357066198", + "al:ipad:url" => + "nytimes://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + "al:iphone:app_name" => "NYTimes", + "al:iphone:app_store_id" => "284862083", + "al:iphone:url" => + "nytimes://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + "article:modified" => "2019-08-02T09:30:23.000Z", + "article:published" => "2019-08-01T17:15:31.000Z", + "article:section" => "New York", + "article:tag" => "New York City", + "articleid" => "100000006583622", + "byl" => "By Joseph Goldstein and Ali Watkins", 
+ "description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "fb:app_id" => "9869919170", + "image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", + "msapplication-starturl" => "https://www.nytimes.com", + "news_keywords" => + "NYPD,Juvenile delinquency,Facial Recognition,Privacy,Government Surveillance,Police,Civil Rights,NYC", + "nyt_uri" => "nyt://article/9da58246-2495-505f-9abd-b5fda8e67b56", + "og:description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "og:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", + "og:title" => + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "og:type" => "article", + "og:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + "pdate" => "20190801", + "pubp_event_id" => "pubp://event/47a657bafa8a476bb36832f90ee5ac6e", + "robots" => "noarchive", + "thumbnail" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-thumbStandard.jpg", + "twitter:app:id:googleplay" => "com.nytimes.android", + "twitter:app:name:googleplay" => "NYTimes", + "twitter:app:url:googleplay" => "nytimes://reader/id/100000006583622", + "twitter:card" => "summary_large_image", + "twitter:description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "twitter:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", + "twitter:image:alt" => "", + "twitter:title" => + "She Was Arrested at 14. 
Then Her Photo Went to a Facial Recognition Database.", + "twitter:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + "url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + "viewport" => "width=device-width, initial-scale=1, maximum-scale=1" + } + + assert MetaTags.parse(html) == expected + end +end diff --git a/test/pleroma/web/rich_media/parsers/twitter_card_test.exs b/test/pleroma/web/rich_media/parsers/twitter_card_test.exs index 2aacd29a3..909ce636d 100644 --- a/test/pleroma/web/rich_media/parsers/twitter_card_test.exs +++ b/test/pleroma/web/rich_media/parsers/twitter_card_test.exs @@ -15,22 +15,29 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html") |> Floki.parse_document!() - assert TwitterCard.parse(html, %{}) == - %{ - "app:id:googleplay" => "com.nytimes.android", - "app:name:googleplay" => "NYTimes", - "app:url:googleplay" => "nytimes://reader/id/100000006583622", - "site" => nil, - "description" => - "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - "image" => - "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", - "type" => "article", - "url" => - "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", - "title" => - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database." - } + expected = %{ + title: + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. 
- The New York Times", + twitter: %{ + "twitter:app:id:googleplay" => "com.nytimes.android", + "twitter:app:name:googleplay" => "NYTimes", + "twitter:app:url:googleplay" => "nytimes://reader/id/100000006583622", + "twitter:site" => nil + }, + opengraph: %{ + "og:description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "og:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", + "og:title" => + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "og:type" => "article", + "og:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" + } + } + + assert TwitterCard.parse(html, %{}) == expected end test "parses twitter card with only property attributes" do @@ -38,20 +45,35 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html") |> Floki.parse_document!() - assert TwitterCard.parse(html, %{}) == - %{ - "card" => "summary_large_image", - "description" => - "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - "image" => - "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", - "image:alt" => "", - "title" => - "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", - "url" => - "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", - "type" => "article" - } + expected = %{ + title: + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", + twitter: %{ + "twitter:card" => "summary_large_image", + "twitter:description" => + "With little oversight, the N.Y.P.D. 
has been using powerful surveillance technology on photos of children and teenagers.", + "twitter:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", + "twitter:image:alt" => "", + "twitter:title" => + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "twitter:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" + }, + opengraph: %{ + "og:description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "og:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", + "og:title" => + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "og:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + "og:type" => "article" + } + } + + assert TwitterCard.parse(html, %{}) == expected end test "parses twitter card with name & property attributes" do @@ -59,24 +81,39 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html") |> Floki.parse_document!() - assert TwitterCard.parse(html, %{}) == - %{ - "app:id:googleplay" => "com.nytimes.android", - "app:name:googleplay" => "NYTimes", - "app:url:googleplay" => "nytimes://reader/id/100000006583622", - "card" => "summary_large_image", - "description" => - "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", - "image" => - "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", - "image:alt" => "", - "site" => nil, - "title" => - "She Was Arrested at 14. 
Then Her Photo Went to a Facial Recognition Database.", - "url" => - "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", - "type" => "article" - } + expected = %{ + title: + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database. - The New York Times", + twitter: %{ + "twitter:app:id:googleplay" => "com.nytimes.android", + "twitter:app:name:googleplay" => "NYTimes", + "twitter:app:url:googleplay" => "nytimes://reader/id/100000006583622", + "twitter:card" => "summary_large_image", + "twitter:description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "twitter:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-videoSixteenByNineJumbo1600.jpg", + "twitter:image:alt" => "", + "twitter:site" => nil, + "twitter:title" => + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "twitter:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html" + }, + opengraph: %{ + "og:description" => + "With little oversight, the N.Y.P.D. has been using powerful surveillance technology on photos of children and teenagers.", + "og:image" => + "https://static01.nyt.com/images/2019/08/01/nyregion/01nypd-juveniles-promo/01nypd-juveniles-promo-facebookJumbo.jpg", + "og:title" => + "She Was Arrested at 14. Then Her Photo Went to a Facial Recognition Database.", + "og:url" => + "https://www.nytimes.com/2019/08/01/nyregion/nypd-facial-recognition-children-teenagers.html", + "og:type" => "article" + } + } + + assert TwitterCard.parse(html, %{}) == expected end test "respect only first title tag on the page" do -- cgit v1.2.3