patternhtmlMinor
Transform a Japanese word to furigana
Viewed 0 times
transformwordjapanesefurigana
Problem
I developed a script to transform a Japanese word in Furigana thanks to the jisho website. But the code is very ugly, especially the
extractHiraganaFromHTML and filterBetween functions. What kind of improvements are possible?module Main
where
import Network.HTTP
import Network.URI
import Text.HTML.TagSoup
import Data.Text (unpack)
import Data.Text.Encoding (decodeUtf8)
import Data.ByteString.Char8 (pack)
import System.Environment (getArgs)
jishoUrlFor :: String -> String
jishoUrlFor word = (escapeURIString isUnescapedInURI) $ "http://jisho.org/word/"++word
getJishoHTML :: String -> IO String
getJishoHTML w = simpleHTTP (getRequest $ jishoUrlFor w) >>= getResponseBody
extractHiraganaFromHTML :: String -> String
extractHiraganaFromHTML = unicodeToString . strip . innerText . filter(~== TagText "") . filterBetween . head . partitions (~== "") . parseTags
convertWordToHiragana :: String -> IO String
convertWordToHiragana w = do
html [Tag String]
filterBetween p = fst $ sfoldl step ([],0) $ p
where step (x,a) tag | tag ~== "" = (tag:x,a+1)
step (x,a) tag | tag ~== TagClose "span" = (tag:x,a-1)
step (x,a) tag = (tag:x,a)
sfoldl step (a,1) (TagClose "span":xs) = (reverse (TagClose "span":a),0) -- stop condition
sfoldl step (a,b) (x:xs) = sfoldl step (step (a,b) x) xs
sfoldl step (a,b) [] = (a,b)
wchars = " \t\r\n"
strip :: String -> String
strip = lstrip . rstrip
lstrip :: String -> String
lstrip = dropWhile (`elem` wchars)
rstrip :: String -> String
rstrip = reverse . lstrip . reverse
unicodeToString = Data.Text.unpack . Data.Text.Encoding.decodeUtf8 . Data.ByteString.Char8.pack
main = do
(word:_) <- getArgs
a <- convertWordToHiragana word
putStrLn aSolution
Let's inline stuff that's only used once and replace recursion with library combinators.
I left the `
Edit: More existing combinators :D
import Control.Monad.State (evalState, modify, gets)
import Control.Monad.Loops (takeWhileM)
import Control.Applicative ((*>))
import Data.Char (isSpace)
filterBetween :: [Tag String] -> [Tag String]
filterBetween = (`evalState` 1) . takeWhileM step where
step tag = modify (+ bracket tag) *> gets (>0)
bracket tag
| tag ~== "" = 1
| tag ~== TagClose "span" = -1
| otherwise = 0
main = do
[word] ") $ parseTags htmlI left the `
and ` tags themselves out of what is printed, that's easier to code up so might be the correct way.Edit: More existing combinators :D
{-# LANGUAGE OverloadedStrings #-}
import Text.HTML.TagSoup.Match (getTagContent)
import Data.Text (strip)
import Data.Text.Encoding (decodeUtf8)
import qualified Data.Text.IO as T
main = do
[word] <- getArgs
http <- simpleHTTP $ getRequest $ escapeURIString isUnescapedInURI
$ "http://jisho.org/word/" ++ word
html <- getResponseBody http
T.putStrLn $ strip $ decodeUtf8 $ innerText
$ getTagContent "span" (== [("class","furigana")]) $ parseTags htmlCode Snippets
import Control.Monad.State (evalState, modify, gets)
import Control.Monad.Loops (takeWhileM)
import Control.Applicative ((*>))
import Data.Char (isSpace)
filterBetween :: [Tag String] -> [Tag String]
filterBetween = (`evalState` 1) . takeWhileM step where
step tag = modify (+ bracket tag) *> gets (>0)
bracket tag
| tag ~== "<span>" = 1
| tag ~== TagClose "span" = -1
| otherwise = 0
main = do
[word] <- getArgs
http <- simpleHTTP $ getRequest $ escapeURIString isUnescapedInURI $ "http://jisho.org/word/" ++ word
html <- getResponseBody http
putStrLn
$ Data.Text.unpack $ Data.Text.Encoding.decodeUtf8 $ Data.ByteString.Char8.pack
$ dropWhile isSpace $ reverse $ dropWhile isSpace $ reverse
$ innerText $ filter (~== TagText "")
$ filterBetween
$ tail $ head $ partitions (~== "<span class=\"furigana\">") $ parseTags html{-# LANGUAGE OverloadedStrings #-}
import Text.HTML.TagSoup.Match (getTagContent)
import Data.Text (strip)
import Data.Text.Encoding (decodeUtf8)
import qualified Data.Text.IO as T
main = do
[word] <- getArgs
http <- simpleHTTP $ getRequest $ escapeURIString isUnescapedInURI
$ "http://jisho.org/word/" ++ word
html <- getResponseBody http
T.putStrLn $ strip $ decodeUtf8 $ innerText
$ getTagContent "span" (== [("class","furigana")]) $ parseTags htmlContext
StackExchange Code Review Q#149171, answer score: 3
Revisions (0)
No revisions yet.