Bye opengraph_parser, hi open_graph_reader

opengraph_parser is basically unmainted, issues are ignored
or deliberately closed without fixing. It pollutes the global
namespace and has no verification of correctness.

The opengraph gem has basically the same issues, not really maintained,
unreleased patches on master since over a year, not really smart either.

So I created my own version and while at it, why not strive try to be
complete and robust, although it's still a work in progress.

This also improves general URL detection by parsing them
from the message after stripping markdown.

An additional dependency was added to support
fetching sites that require cookies to work at all.
For the same reason Faraday's default redirect limit was
bumped.
This commit is contained in:
Jonne Haß 2014-12-06 17:53:48 +01:00
parent 60a5d2a5ba
commit 74a6f42501
8 changed files with 47 additions and 24 deletions

13
Gemfile
View file

@ -110,12 +110,12 @@ gem 'messagebus_ruby_api', '1.0.3'
# Parsing # Parsing
gem 'nokogiri', '1.6.4.1' gem 'nokogiri', '1.6.4.1'
gem 'redcarpet', '3.2.0' gem 'redcarpet', '3.2.0'
gem 'twitter-text', '1.10.0' gem 'twitter-text', '1.10.0'
gem 'roxml', '3.1.6' gem 'roxml', '3.1.6'
gem 'ruby-oembed', '0.8.11' gem 'ruby-oembed', '0.8.11'
gem 'opengraph_parser', '0.2.3' gem 'open_graph_reader', '0.3.1'
# Services # Services
@ -140,6 +140,7 @@ gem 'acts-as-taggable-on', '3.4.2'
gem 'addressable', '2.3.6', :require => 'addressable/uri' gem 'addressable', '2.3.6', :require => 'addressable/uri'
gem 'faraday', '0.9.0' gem 'faraday', '0.9.0'
gem 'faraday_middleware', '0.9.1' gem 'faraday_middleware', '0.9.1'
gem 'faraday-cookie_jar', '0.0.6'
gem 'typhoeus', '0.6.9' gem 'typhoeus', '0.6.9'
# Views # Views

View file

@ -141,6 +141,8 @@ GEM
http_parser.rb (~> 0.6) http_parser.rb (~> 0.6)
nokogiri (~> 1.6) nokogiri (~> 1.6)
diff-lcs (1.2.5) diff-lcs (1.2.5)
domain_name (0.5.22)
unf (>= 0.0.5, < 1.0.0)
eco (1.0.0) eco (1.0.0)
coffee-script coffee-script
eco-source eco-source
@ -165,6 +167,9 @@ GEM
railties (>= 3.0.0) railties (>= 3.0.0)
faraday (0.9.0) faraday (0.9.0)
multipart-post (>= 1.2, < 3) multipart-post (>= 1.2, < 3)
faraday-cookie_jar (0.0.6)
faraday (>= 0.7.4)
http-cookie (~> 1.0.0)
faraday_middleware (0.9.1) faraday_middleware (0.9.1)
faraday (>= 0.7.4, < 0.10) faraday (>= 0.7.4, < 0.10)
ffi (1.9.6) ffi (1.9.6)
@ -268,6 +273,8 @@ GEM
hike (1.2.3) hike (1.2.3)
hiredis (0.5.2) hiredis (0.5.2)
hitimes (1.2.2) hitimes (1.2.2)
http-cookie (1.0.2)
domain_name (~> 0.5)
http_accept_language (2.0.2) http_accept_language (2.0.2)
http_parser.rb (0.6.0) http_parser.rb (0.6.0)
i18n (0.6.11) i18n (0.6.11)
@ -361,9 +368,9 @@ GEM
omniauth-oauth (~> 1.0) omniauth-oauth (~> 1.0)
omniauth-wordpress (0.2.1) omniauth-wordpress (0.2.1)
omniauth-oauth2 (~> 1.1.0) omniauth-oauth2 (~> 1.1.0)
opengraph_parser (0.2.3) open_graph_reader (0.3.1)
addressable faraday (~> 0.9.0)
nokogiri nokogiri (~> 1.6)
opennebula (4.10.1) opennebula (4.10.1)
json json
nokogiri nokogiri
@ -618,6 +625,7 @@ DEPENDENCIES
entypo-rails (= 2.2.2) entypo-rails (= 2.2.2)
factory_girl_rails (= 4.5.0) factory_girl_rails (= 4.5.0)
faraday (= 0.9.0) faraday (= 0.9.0)
faraday-cookie_jar (= 0.0.6)
faraday_middleware (= 0.9.1) faraday_middleware (= 0.9.1)
fixture_builder (= 0.3.6) fixture_builder (= 0.3.6)
fog (= 1.25.0) fog (= 1.25.0)
@ -650,7 +658,7 @@ DEPENDENCIES
omniauth-tumblr (= 1.1) omniauth-tumblr (= 1.1)
omniauth-twitter (= 1.0.1) omniauth-twitter (= 1.0.1)
omniauth-wordpress (= 0.2.1) omniauth-wordpress (= 0.2.1)
opengraph_parser (= 0.2.3) open_graph_reader (= 0.3.1)
pry pry
pry-byebug pry-byebug
pry-debundle pry-debundle

View file

@ -30,16 +30,17 @@ class OpenGraphCache < ActiveRecord::Base
end end
def fetch_and_save_opengraph_data! def fetch_and_save_opengraph_data!
response = OpenGraph.new(self.url) object = OpenGraphReader.fetch!(self.url)
return if response.blank? || response.type.blank? return unless object
self.title = response.title.truncate(255) self.title = object.og.title.truncate(255)
self.ob_type = response.type self.ob_type = object.og.type
self.image = response.images[0] self.image = object.og.image.url
self.url = response.url self.url = object.og.url
self.description = response.description self.description = object.og.description
self.save self.save
rescue OpenGraphReader::NoOpenGraphDataError
end end
end end

View file

@ -18,6 +18,7 @@ options = {
} }
Faraday.default_connection = Faraday::Connection.new(options) do |b| Faraday.default_connection = Faraday::Connection.new(options) do |b|
b.use FaradayMiddleware::FollowRedirects b.use FaradayMiddleware::FollowRedirects, limit: 8
b.use :cookie_jar
b.adapter Faraday.default_adapter b.adapter Faraday.default_adapter
end end

View file

@ -0,0 +1,4 @@
OpenGraphReader.configure do |config|
config.synthesize_title = true
config.synthesize_image_url = true
end

View file

@ -228,7 +228,7 @@ module Diaspora
# Extracts all the urls from the raw message and return them in the form of a string # Extracts all the urls from the raw message and return them in the form of a string
# Different URLs are seperated with a space # Different URLs are seperated with a space
def urls def urls
@urls ||= Twitter::Extractor.extract_urls(@raw_message) @urls ||= Twitter::Extractor.extract_urls(plain_text_without_markdown)
end end
def raw def raw

View file

@ -178,5 +178,10 @@ describe Diaspora::MessageRenderer do
text = "[Perdu](http://perdu.com/) and [DuckDuckGo](https://duckduckgo.com/) can help you" text = "[Perdu](http://perdu.com/) and [DuckDuckGo](https://duckduckgo.com/) can help you"
expect(message(text).urls).to eql ["http://perdu.com/", "https://duckduckgo.com/"] expect(message(text).urls).to eql ["http://perdu.com/", "https://duckduckgo.com/"]
end end
it "extracts urls from continous markdown correctly" do
text = "[![Image](https://www.antifainfoblatt.de/sites/default/files/public/styles/front_full/public/jockpalfreeman.png?itok=OPjHKpmt)](https://www.antifainfoblatt.de/artikel/%E2%80%9Eschlie%C3%9Flich-waren-es-zu-viele%E2%80%9C)"
expect(message(text).urls).to eq ["https://www.antifainfoblatt.de/sites/default/files/public/styles/front_full/public/jockpalfreeman.png?itok=OPjHKpmt", "https://www.antifainfoblatt.de/artikel/%E2%80%9Eschlie%C3%9Flich-waren-es-zu-viele%E2%80%9C"]
end
end end
end end

View file

@ -3,7 +3,7 @@ describe Workers::GatherOpenGraphData do
before do before do
@ogsite_title = 'Homepage' @ogsite_title = 'Homepage'
@ogsite_type = 'website' @ogsite_type = 'website'
@ogsite_image = '/img/something.png' @ogsite_image = 'http://www.we-support-open-graph.com/img/something.png'
@ogsite_url = 'http://www.we-support-open-graph.com' @ogsite_url = 'http://www.we-support-open-graph.com'
@ogsite_description = 'Homepage' @ogsite_description = 'Homepage'
@ -31,9 +31,12 @@ describe Workers::GatherOpenGraphData do
@status_message = FactoryGirl.create(:status_message) @status_message = FactoryGirl.create(:status_message)
stub_request(:get, @ogsite_url).to_return(:status => 200, :body => @ogsite_body) stub_request(:head, @ogsite_url).to_return(status: 200, body: "", headers: {'Content-Type' => 'text/html; utf-8'})
stub_request(:get, @no_open_graph_url).to_return(:status => 200, :body => '<html><body>hello there</body></html>') stub_request(:get, @ogsite_url).to_return(status: 200, body: @ogsite_body, headers: {'Content-Type' => 'text/html; utf-8'})
stub_request(:get, @oglong_url).to_return(:status => 200, :body => @oglong_body) stub_request(:head, @no_open_graph_url).to_return(status: 200, body: "", headers: {'Content-Type' => 'text/html; utf-8'})
stub_request(:get, @no_open_graph_url).to_return(:status => 200, :body => '<html><head><title>Hi</title><body>hello there</body></html>', headers: {'Content-Type' => 'text/html; utf-8'})
stub_request(:head, @oglong_url).to_return(status: 200, body: "", headers: {'Content-Type' => 'text/html; utf-8'})
stub_request(:get, @oglong_url).to_return(status: 200, body: @oglong_body, headers: {'Content-Type' => 'text/html; utf-8'})
end end
@ -59,7 +62,7 @@ describe Workers::GatherOpenGraphData do
expect(ogc.title).to eq(@ogsite_title) expect(ogc.title).to eq(@ogsite_title)
expect(ogc.ob_type).to eq(@ogsite_type) expect(ogc.ob_type).to eq(@ogsite_type)
expect(ogc.image).to eq(@ogsite_url + @ogsite_image) expect(ogc.image).to eq(@ogsite_image)
expect(ogc.url).to eq(@ogsite_url) expect(ogc.url).to eq(@ogsite_url)
expect(ogc.description).to eq(@ogsite_description) expect(ogc.description).to eq(@ogsite_description)