== Extract Content inside HTML
cache database link: string html: text content: text error_type: integer error_msg: text error_msg2: text
error_type: 0, ok, 1, html_error, 2. readability_error, 3. readability_error && extraction_error
== sample
require 'rubygems'
require 'json'
require 'net/http'
def htmcont(link)
base_url = "http://htmcont.heroku.com/conts/g?format=json&"
url = "#{base_url}&link=#{link}"
resp = Net::HTTP.get_response(URI.parse(url))
data = resp.body
# we convert the returned JSON data to native Ruby
# data structure - a hash
result = JSON.parse(data)
# if the hash has 'Error' as a key, we raise an error
if result.has_key? 'Error'
raise "web service error"
end
if result['error_type'] != 0
raise 'htmcont can not extract'
end
return result['content']
end