https-everywhere-rules/src/Data/Text/ICU/Extras.hs

85 lines
2.8 KiB
Haskell

{-# LANGUAGE CPP #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
module Data.Text.ICU.Extras (
match,
findAndReplace,
#ifdef TEST
Segment(..),
parseReplacement
#endif
) where
import Prelude hiding (span)
import Control.Applicative ((<$>), (<*>), (*>), (<|>))
import Control.Error (hush)
import Data.Attoparsec.Text (Parser, char, digit, takeWhile1, string, many', parseOnly)
import Data.Char (digitToInt)
import Data.Functor.Infix ((<$$>), (<&>))
import Data.Maybe (isJust)
import Data.Monoid (Monoid(mappend, mconcat))
import Data.Text (Text)
import Data.Text.ICU (Regex, regex', find, findAll, group, span, Match, suffix, groupCount)
regex :: Text -> Maybe Regex
regex = hush . regex' []
match :: Text -> Maybe (Text -> Bool)
match = (isJust <$$> find) <$$> regex
findAndReplace :: Text -> Text -> Maybe (Text -> Maybe Text)
findAndReplace pattern replacement = (.)
<$> runReplacement `fmap` parseReplacement replacement
<*> findAll `fmap` regex pattern
type Replacement = [Segment]
data Segment = Reference Int | Literal Text deriving (Show, Eq)
-- JavaScript admits at most nine capture groups, so this is the correct
-- (and likely intended) interpretation.
parseReference :: Parser Segment
parseReference = char '$' *> digit <&> Reference . digitToInt
parseLiteral :: Parser Segment
parseLiteral = Literal <$> takeWhile1 (/= '$')
parseLiteralDollar :: Parser Segment
parseLiteralDollar = Literal <$> string "$"
parseSegment :: Parser Segment
parseSegment = parseLiteral <|> parseReference <|> parseLiteralDollar
parseReplacement :: Text -> Maybe Replacement
parseReplacement = hush . parseOnly (many' parseSegment)
-- In the language of ICU, given a match:
--
-- * 'span' : Text preceding the given match.
-- * 'suffix n' : Suffix of the /n/th capturing group in the match.
--
-- To reconstitute strings in replacement, we 'prefix' all matches
-- with their spans, use their groups to 'dereference' replacements
-- (references are replaced with corresponding captures, literals are
-- inserted as is), cat the results in order, and finally append the
-- suffix of the terminal capturing group ('getSuffix'). When there are
-- no explicit groups, we use the zeroeth; that is, the text following
-- the final match.
runReplacement :: Replacement -> [Match] -> Maybe Text
runReplacement replacement matches = do
prefix <- sequence $ matches >>= \match ->
Just (span match) : dereference match replacement
mappend (mconcat prefix) <$> getSuffix matches
dereference :: Match -> Replacement -> [Maybe Text]
dereference match = fmap $ \case
Reference n -> group n match
Literal str -> Just str
getSuffix :: [Match] -> Maybe Text
getSuffix [] = Just mempty
getSuffix ms = flip suffix <*> groupCount $ last ms