/haskell-html-parser

Primary language: Haskell. License: The Unlicense.

haskell-html-parser

Libraries required:

  • http-conduit
  • parsec
import qualified Network.HTTP.Simple as Net
import qualified Data.ByteString as BS
import Data.Either
import Control.Monad

import Text.Parsec
import Text.Parsec.Char
import Text.Parsec.Combinator
import Text.Parsec.ByteString

-- | Result of parsing one piece of an HTML document.
data HTMLExpr
    = Link String -- ^ value of an anchor's @href@ attribute
    | Other ()    -- ^ placeholder for any input that is not a link
    deriving (Eq, Show)

-- | Skip zero or more spaces, newlines or tabs; never fails.
whitespace :: Parser ()
whitespace = skipMany (oneOf " \n\t")

-- | Run @p@, then discard any trailing 'whitespace', returning @p@'s result.
lexeme :: Parser a -> Parser a
lexeme p = p <* whitespace

-- | Run a parser over a 'BS.ByteString', using an empty source name for
-- error messages.
simpleParser :: Parser a -> BS.ByteString -> Either ParseError a
simpleParser p input = runParser p () "" input

-- | Parse the opening tag of an anchor element and extract its @href@.
--
-- Matches @<a@ followed by at least one whitespace character, skips ahead
-- to the first @href=@ inside the same tag, reads the double-quoted value,
-- and then consumes the rest of the tag up to and including @>@.
--
-- Fails when the tag is not an anchor, when no @href=@ occurs before the
-- tag's closing @>@, when the value is not double-quoted, or when the value
-- is empty. A @>@ inside an earlier quoted attribute value also ends the
-- search, so such tags are not recognised as links.
parseHTMLLink :: Parser HTMLExpr
parseHTMLLink = do
    void $ string "<a"
    -- Mandatory whitespace: without it "<abbr", "<article", "<aside", ...
    -- would all be accepted as anchors.
    skipMany1 space
    -- Stop at '>' so an href belonging to a later tag is never picked up.
    void $ manyTill (noneOf ">") (try (string "href="))
    link <- between (char '"') (char '"') parseLink
    void $ manyTill anyChar (char '>') -- close element
    return $ Link link
  where
    -- Everything up to (not including) the closing double quote.
    parseLink = many1 $ satisfy (/= '"')

-- | Consume exactly one character and report it as 'Other'.
-- Fails only at end of input.
parseHTMLOther :: Parser HTMLExpr
parseHTMLOther = Other () <$ anyChar

-- | Parse a whole document into a list of 'HTMLExpr'.
--
-- At each position a link is attempted first; if that fails without
-- committing (hence 'try'), a single character is consumed as 'Other'.
parseHTML :: Parser [HTMLExpr]
parseHTML = many element
  where
    element = try parseHTMLLink <|> parseHTMLOther

-- | Keep only the non-'Other' entries of a parse result.
--
-- A failed parse ('Left') yields an empty list; the error is discarded.
getLinksFromParsedHTML :: Either ParseError [HTMLExpr] -> [HTMLExpr]
getLinksFromParsedHTML = either (const []) (filter isLink)
  where
    isLink expr = expr /= Other ()

-- | Request @https://haskell.org@, parse the response body and print the
-- links found in it.
main :: IO ()
main = do
    let url = "https://haskell.org"
    let request = Net.parseRequest_ url
    response <- Net.httpBS request
    let bsResponseBody = Net.getResponseBody response
    let parsedHTML = simpleParser parseHTML bsResponseBody
    -- 'print' already applies 'show'; showing twice would emit the list as
    -- a quoted string literal with every inner quote escaped.
    print $ getLinksFromParsedHTML parsedHTML