patternhtmlMinor

Transform a Japanese word to furigana

Submitted by: @import:stackexchange-codereview·Mar 10, 2026·

Viewed 0 times

transformwordjapanesefurigana

Problem

I developed a script to transform a Japanese word in Furigana thanks to the jisho website. But the code is very ugly, especially the extractHiraganaFromHTML and filterBetween functions. What kind of improvements are possible?

module Main
where

import Network.HTTP
import Network.URI
import Text.HTML.TagSoup
import Data.Text (unpack)
import Data.Text.Encoding (decodeUtf8)
import Data.ByteString.Char8 (pack)
import System.Environment (getArgs)

jishoUrlFor :: String -> String 
jishoUrlFor word = (escapeURIString isUnescapedInURI) $ "http://jisho.org/word/"++word

getJishoHTML :: String -> IO String
getJishoHTML w  = simpleHTTP (getRequest $ jishoUrlFor w) >>=  getResponseBody

extractHiraganaFromHTML :: String -> String
extractHiraganaFromHTML = unicodeToString . strip . innerText . filter(~== TagText "") . filterBetween . head . partitions  (~== "") . parseTags

convertWordToHiragana :: String -> IO String 
convertWordToHiragana w = do 
    html  [Tag String]
filterBetween p = fst $ sfoldl step ([],0) $  p
where step (x,a) tag | tag ~== "" = (tag:x,a+1)             
      step (x,a) tag | tag ~== TagClose "span" = (tag:x,a-1)
      step (x,a) tag = (tag:x,a)

      sfoldl step (a,1) (TagClose "span":xs) = (reverse (TagClose "span":a),0) -- stop condition
      sfoldl step (a,b) (x:xs) = sfoldl step (step (a,b) x) xs 
      sfoldl step (a,b) [] = (a,b)

wchars = " \t\r\n"

strip :: String -> String
strip = lstrip . rstrip

lstrip :: String -> String
lstrip = dropWhile (`elem` wchars)

rstrip :: String -> String
rstrip = reverse . lstrip . reverse

unicodeToString = Data.Text.unpack . Data.Text.Encoding.decodeUtf8 . Data.ByteString.Char8.pack

main = do
    (word:_) <- getArgs
    a <- convertWordToHiragana word
    putStrLn a

Solution

Let's inline stuff that's only used once and replace recursion with library combinators.

import Control.Monad.State (evalState, modify, gets)
import Control.Monad.Loops (takeWhileM)
import Control.Applicative ((*>))
import Data.Char (isSpace)

filterBetween :: [Tag String] -> [Tag String]
filterBetween = (`evalState` 1) . takeWhileM step where
  step tag = modify (+ bracket tag) *> gets (>0)
  bracket tag
    | tag ~== "" = 1
    | tag ~== TagClose "span" = -1
    | otherwise = 0

main = do
  [word] ") $ parseTags html

I left the ` and ` tags themselves out of what is printed, that's easier to code up so might be the correct way.

Edit: More existing combinators :D

{-# LANGUAGE OverloadedStrings #-}
import Text.HTML.TagSoup.Match (getTagContent)
import Data.Text (strip)
import Data.Text.Encoding (decodeUtf8)
import qualified Data.Text.IO as T

main = do
  [word] <- getArgs
  http <- simpleHTTP $ getRequest $ escapeURIString isUnescapedInURI
    $ "http://jisho.org/word/" ++ word
  html <- getResponseBody http
  T.putStrLn $ strip $ decodeUtf8 $ innerText
    $ getTagContent "span" (== [("class","furigana")]) $ parseTags html

Code Snippets

import Control.Monad.State (evalState, modify, gets)
import Control.Monad.Loops (takeWhileM)
import Control.Applicative ((*>))
import Data.Char (isSpace)

filterBetween :: [Tag String] -> [Tag String]
filterBetween = (`evalState` 1) . takeWhileM step where
  step tag = modify (+ bracket tag) *> gets (>0)
  bracket tag
    | tag ~== "<span>" = 1
    | tag ~== TagClose "span" = -1
    | otherwise = 0

main = do
  [word] <- getArgs
  http <- simpleHTTP $ getRequest $ escapeURIString isUnescapedInURI $ "http://jisho.org/word/" ++ word
  html <- getResponseBody http
  putStrLn
    $ Data.Text.unpack $ Data.Text.Encoding.decodeUtf8 $ Data.ByteString.Char8.pack
    $ dropWhile isSpace $ reverse $ dropWhile isSpace $ reverse
    $ innerText $ filter (~== TagText "")
    $ filterBetween
    $ tail $ head $ partitions (~== "<span class=\"furigana\">") $ parseTags html

{-# LANGUAGE OverloadedStrings #-}
import Text.HTML.TagSoup.Match (getTagContent)
import Data.Text (strip)
import Data.Text.Encoding (decodeUtf8)
import qualified Data.Text.IO as T

main = do
  [word] <- getArgs
  http <- simpleHTTP $ getRequest $ escapeURIString isUnescapedInURI
    $ "http://jisho.org/word/" ++ word
  html <- getResponseBody http
  T.putStrLn $ strip $ decodeUtf8 $ innerText
    $ getTagContent "span" (== [("class","furigana")]) $ parseTags html

Context

StackExchange Code Review Q#149171, answer score: 3

Revisions (0)

No revisions yet.