https-everywhere-rules/src/Data/Text/ICU/Extras.hs

84 lines
2.8 KiB
Haskell

{-# LANGUAGE CPP #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
module Data.Text.ICU.Extras (
match,
findAndReplace,
#ifdef TEST
Segment(..),
parseReplacement
#endif
) where
import Prelude hiding (span)
import Control.Applicative ((<$>), (<*>), (*>), (<|>))
import Control.Error (hush)
import Data.Attoparsec.Text (Parser, char, digit, takeWhile1, string, many', parseOnly)
import Data.Char (digitToInt)
import Data.Maybe (isJust)
import Data.Monoid (Monoid(mappend, mconcat))
import Data.Text (Text)
import Data.Text.ICU (Regex, regex', find, findAll, group, span, Match, suffix, groupCount)
regex :: Text -> Maybe Regex
regex = hush . regex' []
match :: Text -> Maybe (Text -> Bool)
match = fmap (fmap isJust <$> find) <$> regex
findAndReplace :: Text -> Text -> Maybe (Text -> Maybe Text)
findAndReplace pattern replacement = (.)
<$> runReplacement `fmap` parseReplacement replacement
<*> findAll `fmap` regex pattern
type Replacement = [Segment]
data Segment = Reference Int | Literal Text deriving (Show, Eq)
-- JavaScript admits at most nine capture groups, so this is the correct
-- (and likely intended) interpretation.
parseReference :: Parser Segment
parseReference = (Reference . digitToInt) <$> (char '$' *> digit)
parseLiteral :: Parser Segment
parseLiteral = Literal <$> takeWhile1 (/= '$')
parseLiteralDollar :: Parser Segment
parseLiteralDollar = Literal <$> string "$"
parseSegment :: Parser Segment
parseSegment = parseLiteral <|> parseReference <|> parseLiteralDollar
parseReplacement :: Text -> Maybe Replacement
parseReplacement = hush . parseOnly (many' parseSegment)
-- In the language of ICU, given a match:
--
-- * 'span' : Text preceding the given match.
-- * 'suffix n' : Suffix of the /n/th capturing group in the match.
--
-- To reconstitute strings in replacement, we 'prefix' all matches
-- with their spans, use their groups to 'dereference' replacements
-- (references are replaced with corresponding captures, literals are
-- inserted as is), cat the results in order, and finally append the
-- suffix of the terminal capturing group ('getSuffix'). When there are
-- no explicit groups, we use the zeroeth; that is, the text following
-- the final match.
runReplacement :: Replacement -> [Match] -> Maybe Text
runReplacement replacement matches = do
prefix <- sequence $ matches >>= \match ->
Just (span match) : dereference match replacement
mappend (mconcat prefix) <$> getSuffix matches
dereference :: Match -> Replacement -> [Maybe Text]
dereference match = fmap $ \case
Reference n -> group n match
Literal str -> Just str
getSuffix :: [Match] -> Maybe Text
getSuffix [] = Just mempty
getSuffix ms = flip suffix <*> groupCount $ last ms