Wrap up the Phonotactics type into a Language type including both the phonotactics and a distribution of the number of syllables per word, so we can generate arbitrary words in addition to just arbitrary syllables Change-Id: I8a37ce9c0eec019c9b84d21b0f2b3b9f5fd319eb Reviewed-on: https://cl.tvl.fyi/c/depot/+/3203 Reviewed-by: grfn <grfn@gws.fyi> Tested-by: BuildkiteCI
181 lines
6.2 KiB
Haskell
181 lines
6.2 KiB
Haskell
{-# LANGUAGE TemplateHaskell #-}
|
||
{-# LANGUAGE OverloadedLists #-}
|
||
--------------------------------------------------------------------------------
|
||
module Xanthous.Generators.Speech
|
||
( -- * Language definition
|
||
Language(..)
|
||
-- ** Lenses
|
||
, phonotactics
|
||
, syllablesPerWord
|
||
|
||
-- ** Phonotactics
|
||
, Phonotactics(..)
|
||
-- *** Lenses
|
||
, onsets
|
||
, nuclei
|
||
, codas
|
||
, numOnsets
|
||
, numNuclei
|
||
, numCodas
|
||
|
||
-- * Language generation
|
||
, syllable
|
||
, word
|
||
|
||
-- * Languages
|
||
, english
|
||
, gormlak
|
||
|
||
) where
|
||
--------------------------------------------------------------------------------
|
||
import Xanthous.Prelude hiding (replicateM)
|
||
import Data.Interval (Interval, (<=..<=))
|
||
import qualified Data.Interval as Interval
|
||
import Control.Monad.Random.Class (MonadRandom)
|
||
import Xanthous.Random (chooseRange, choose, ChooseElement (..), Weighted (Weighted))
|
||
import Control.Monad (replicateM)
|
||
import Test.QuickCheck (Arbitrary, CoArbitrary, Function)
|
||
import Test.QuickCheck.Instances.Text ()
|
||
import Data.List.NonEmpty (NonEmpty)
|
||
--------------------------------------------------------------------------------
|
||
|
||
newtype Phoneme = Phoneme Text
|
||
deriving stock (Show, Eq, Generic)
|
||
deriving anyclass (NFData, CoArbitrary, Function)
|
||
deriving newtype (IsString, Semigroup, Monoid, Arbitrary)
|
||
|
||
-- | The phonotactics of a language
|
||
--
|
||
-- The phonotactics of a language represent the restriction on the phonemes in
|
||
-- the syllables of a language.
|
||
--
|
||
-- Syllables in a language consist of an onset, a nucleus, and a coda (the
|
||
-- nucleus and the coda together representing the "rhyme" of the syllable).
|
||
data Phonotactics = Phonotactics
|
||
{ _onsets :: [Phoneme] -- ^ The permissible onsets, or consonant clusters
|
||
-- at the beginning of a syllable
|
||
, _nuclei :: [Phoneme] -- ^ The permissible nuclei, or vowel clusters in
|
||
-- the middle of a syllable
|
||
, _codas :: [Phoneme] -- ^ The permissible codas, or consonant clusters at
|
||
-- the end of a syllable
|
||
, _numOnsets :: Interval Word -- ^ The range of number of allowable onsets
|
||
, _numNuclei :: Interval Word -- ^ The range of number of allowable nuclei
|
||
, _numCodas :: Interval Word -- ^ The range of number of allowable codas
|
||
}
|
||
deriving stock (Show, Eq, Generic)
|
||
deriving anyclass (NFData)
|
||
makeLenses ''Phonotactics
|
||
|
||
-- | Randomly generate a syllable with the given 'Phonotactics'
|
||
syllable :: MonadRandom m => Phonotactics -> m Text
|
||
syllable phonotactics = do
|
||
let genPart num choices = do
|
||
n <- fromIntegral . fromMaybe 0 <$> chooseRange (phonotactics ^. num)
|
||
fmap (fromMaybe mempty . mconcat)
|
||
. replicateM n
|
||
. choose . ChooseElement
|
||
$ phonotactics ^. choices
|
||
|
||
(Phoneme onset) <- genPart numOnsets onsets
|
||
(Phoneme nucleus) <- genPart numNuclei nuclei
|
||
(Phoneme coda) <- genPart numCodas codas
|
||
|
||
pure $ onset <> nucleus <> coda
|
||
|
||
-- | A definition for a language
|
||
--
|
||
-- Currently this provides enough information to generate multi-syllabic words,
|
||
-- but in the future will likely also include grammar-related things.
|
||
data Language = Language
|
||
{ _phonotactics :: Phonotactics
|
||
, _syllablesPerWord :: Weighted Int NonEmpty Int
|
||
}
|
||
deriving stock (Show, Eq, Generic)
|
||
deriving anyclass (NFData)
|
||
makeLenses ''Language
|
||
|
||
word :: MonadRandom m => Language -> m Text
|
||
word lang = do
|
||
numSyllables <- choose $ lang ^. syllablesPerWord
|
||
mconcat <$> replicateM numSyllables (syllable $ lang ^. phonotactics)
|
||
|
||
--------------------------------------------------------------------------------
|
||
|
||
-- <https://en.wikipedia.org/wiki/English_phonology#Phonotactics>
|
||
englishPhonotactics :: Phonotactics
|
||
englishPhonotactics = Phonotactics
|
||
{ _onsets = [ "pl" , "bl" , "kl" , "gl" , "pr" , "br" , "tr" , "dr" , "kr"
|
||
, "gr" , "tw" , "dw" , "gw" , "kw" , "pw"
|
||
|
||
, "fl" , "sl" , {- "thl", -} "shl" {- , "vl" -}
|
||
, "p", "b", "t", "d", "k", "ɡ", "m", "n", "f", "v", "th", "s"
|
||
, "z", "h", "l", "w"
|
||
|
||
, "sp", "st", "sk"
|
||
|
||
, "sm", "sn"
|
||
|
||
, "sf", "sth"
|
||
|
||
, "spl", "skl", "spr", "str", "skr", "skw", "sm", "sp", "st", "sk"
|
||
]
|
||
, _nuclei = [ "a", "e", "i", "o", "u", "ur", "ar", "or", "ear", "are", "ure"
|
||
, "oa", "ee", "oo", "ei", "ie", "oi", "ou"
|
||
]
|
||
, _codas = [ "m", "n", "ng", "p", "t", "tsh", "k", "f", "sh", "s", "th", "x"
|
||
, "v", "z", "zh", "l", "r", "w"
|
||
|
||
, "lk", "lb", "lt", "ld", "ltsh", "ldsh", "lk"
|
||
, "rp", "rb", "rt", "rd", "rtsh", "rdsh", "rk", "rɡ"
|
||
, "lf", "lv", "lth", "ls", "lz", "lsh", "lth"
|
||
, "rf", "rv", "rth", "rs", "rz", "rth"
|
||
, "lm", "ln"
|
||
, "rm", "rn", "rl"
|
||
, "mp", "nt", "nd", "nth", "nsh", "nk"
|
||
, "mf", "ms", "mth", "nf", "nth", "ns", "nz", "nth"
|
||
, "ft", "sp", "st", "sk"
|
||
, "fth"
|
||
, "pt", "kt"
|
||
, "pth", "ps", "th", "ts", "dth", "dz", "ks"
|
||
, "lpt", "lps", "lfth", "lts", "lst", "lkt", "lks"
|
||
, "rmth", "rpt", "rps", "rts", "rst", "rkt"
|
||
, "mpt", "mps", "ndth", "nkt", "nks", "nkth"
|
||
, "ksth", "kst"
|
||
]
|
||
, _numOnsets = 0 <=..<= 1
|
||
, _numNuclei = Interval.singleton 1
|
||
, _numCodas = 0 <=..<= 1
|
||
}
|
||
|
||
english :: Language
|
||
english = Language
|
||
{ _phonotactics = englishPhonotactics
|
||
, _syllablesPerWord = Weighted [(20, 1),
|
||
(7, 2),
|
||
(2, 3),
|
||
(1, 4)]
|
||
}
|
||
|
||
gormlakPhonotactics :: Phonotactics
|
||
gormlakPhonotactics = Phonotactics
|
||
{ _onsets = [ "h", "l", "g", "b", "m", "n", "ng"
|
||
, "gl", "bl", "fl"
|
||
]
|
||
, _numOnsets = Interval.singleton 1
|
||
, _nuclei = [ "a", "o", "aa", "u" ]
|
||
, _numNuclei = Interval.singleton 1
|
||
, _codas = [ "r", "l", "g", "m", "n"
|
||
, "rl", "gl", "ml", "rm"
|
||
, "n", "k"
|
||
]
|
||
, _numCodas = Interval.singleton 1
|
||
}
|
||
|
||
gormlak :: Language
|
||
gormlak = Language
|
||
{ _phonotactics = gormlakPhonotactics
|
||
, _syllablesPerWord = Weighted [ (5, 2)
|
||
, (5, 1)
|
||
, (1, 3)
|
||
]
|
||
}
|