diff --git a/README.md b/README.md index ec47cc1..c028593 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,159 @@ human-friendly-id-gen New Haskell project to generate Human Friendly Ids. -Those ids should be easy to read, write and to remember. +Those ids should be easier to read / write and remember than classical random +base64 ids. + +The package provides both a library and an executable `hfig` (for Human Friendly + Identifier Generator). + +## Strategies + +There are different strategies depending on your preferences. + +### Short strategy + +We generate random phonemes that should not be too hard to pronounce while at the +same time being sufficiently different from each other that words do not need to be +too long to prevent collisions. + +~~~ +rupomdovi +waziridro +moplaloxo +kankujochplu +drubrusadka +dripuxmopbi +jotchibluzuv +plotabrprabudr +zopranblokplab +tirbrozprakow +~~~ + +Here is the probability of collision if you generate a sample of n of those words: + +| n | % | +|------|--------| +| 1000 | 2.5e-8 | +| 10k | 2.5e-6 | +| 100k | 2.5e-4 | +| 1M | 2.5e-2 | + +You can also ask for a different number of phonemes. If you only use 2 phonemes, this generates words like: + +~~~ +blilwa +wirpa +winupl +tani +ludu +probrip +pichprox +joprux +drudibl +zibrku +~~~ + +The probability of collision becomes: + +| n | % | +|-----|------| +| 10 | 1e-5 | +| 100 | 1e-3 | +| 1k | 0.11 | +| 10k | 1.0 | + +### Lovecraftian strategy + +My nickname isn't yogsototh for nothing so why not generate ids as if Lovecraft +could have invented them. + +~~~ +ymhiovhotl +zhaobritl +v'odher +neltha +ucnouthlaxr +kola +adavhig +ctuthrilbh +yakthembru +athoubr'murh +~~~ + +The collision probability table looks like: + +| n | % | +|------|----------------------| +| 10 | 6.669334400426838e-8 | +| 100 | 6.669334400426838e-6 | +| 1k | 6.669334400426837e-4 | +| 10k | 6.669334400426838e-2 | +| 100k | 1.0 | + +If you generate two names for an id, you should be safe. 
+ +| n | % | +|------|---------| +| 10 | 8.8e-17 | +| 100 | 8.8e-15 | +| 1k | 8.8e-13 | +| 10k | 8.8e-11 | +| 100k | 8.8e-9 | +| 1M | 8.8e-7 | + +### Dictionary Strategy + +You can read any file and each line will be considered as a word. +We then take a few random words. + +You can find some word lists to use in this repository. + +There is an English dictionary with approximately 370k words; select a dictionary file with the `-d` or `--dict` option. + +Here is an example: + +~~~ +shuckins-digitinerved-microspectrophotometrical +indeterminableness-getaways-sceloporus +diverts-okayed-cast +semirhythmically-thasian-thrawart +smashups-phototherapeutics-swollenness +bindingness-phoenicia-ringy +execs-axes-barotaxis +monimiaceous-presutural-submembers +heterodyned-pourparley-zecchino +fragmentate-contrude-taeniae +~~~ + +And here are the collision probability tables. + +Use 1 word to make the identifier: + +| n | % | +|---------|--------| +| 10 | 1.3e-4 | +| 100 | 1.3e-2 | +| 1k | 1.0 | + +Combine 2 words to make the identifier: + +| n | % | +|------|---------| +| 10 | 3.6e-10 | +| 100 | 3.6e-8 | +| 1k | 3.6e-6 | +| 10k | 3.6e-4 | +| 100k | 3.6e-2 | +| 1M | 1.0 | + +Combine 3 words to make the identifier: + +| n | % | +|------|---------| +| 10 | 9.8e-16 | +| 100 | 9.8e-14 | +| 1k | 9.8e-12 | +| 10k | 9.8e-10 | +| 100k | 9.8e-8 | +| 1M | 9.8e-6 | diff --git a/src-exe/Main.hs b/src-exe/Main.hs index 67c32a2..2cfaf6d 100644 --- a/src-exe/Main.hs +++ b/src-exe/Main.hs @@ -17,13 +17,9 @@ main = do Short -> Short.idgen (fromMaybe 4 (optLen opts)) >>= putText Lovecraftian -> Lovecraftian.idgen (fromMaybe 1 (optLen opts)) >>= putText Dict -> do - let file = case optDict opts of - Just "english" -> "dictionaries/english.txt" - Just "first-names" -> "dictionaries/first-names.txt" - Just "generic" -> "dictionaries/generic.txt" - Just "literary" -> "dictionaries/literary.txt" - Just filepath -> toS (format fp filepath) - Nothing -> "dictionaries/english.txt" + file <- case optDict opts of + Just filepath -> 
return $ toS (format fp filepath) + Nothing -> die "Please select a dictionary file with the -d or --dict options" dict <- Dict.dictionaryFromFile file Dict.idgen dict (fromMaybe 3 (optLen opts)) >>= putText diff --git a/src/HFIG/Dictionary.hs b/src/HFIG/Dictionary.hs index d5339fb..ef652b7 100644 --- a/src/HFIG/Dictionary.hs +++ b/src/HFIG/Dictionary.hs @@ -13,58 +13,23 @@ Yet not the best for preventing collision. module HFIG.Dictionary ( idgen , dictionaryFromFile - , collisionProbability ) where import Protolude -import qualified System.Random.MWC as Random -import qualified Control.Monad.Primitive as Prim import qualified Data.Vector as V import qualified Data.Text as T +import qualified HFIG.Helpers as Helpers + type Dictionary = V.Vector Text + -- | Will generate readable short names The integer parameter determine the -- length in number of syllabus of the name idgen :: Dictionary -> Int -> IO Text -idgen allwords n = - Random.withSystemRandom $ \gen -> - T.intercalate "-" <$> replicateM n (genWord gen allwords) - --- | Approximate collision probability other n generated name with complexity --- parameter equal to l --- --- For example if you generate 1000 words randomly with complexity parameter 4 --- We estimate the probability of collision to 3.85% --- --- This is a nice helper function to use when you want to estimate the optimal --- length of your ids --- --- @ --- > collisionProbability 1000 4 --- 3.8580246913580245e-2 --- --- > collisionProbability 10000 5 --- 6.430041152263374e-2 --- --- > collisionProbability 10000 6 --- 1.0716735253772291e-3 --- @ -collisionProbability :: V.Vector Text -- ^ The dictionary - -> Double -- ^ nb of generated names - -> Double -- ^ length parameter used - -> Double -collisionProbability dict n l = min ((n**2) / (2 * (nbWords dict ** l))) 1 - -nbWords :: V.Vector Text -> Double -nbWords ws = fromIntegral $ V.length ws - -genWord :: Random.Gen (Prim.PrimState IO) -> V.Vector Text -> IO Text -genWord gen allwords = do - 
(k :: Int) <- Random.uniformR (0, V.length allwords - 1) gen - return (allwords V.! k) +idgen d = Helpers.idgen "-" [d] dictionaryFromFile :: FilePath -> IO (V.Vector Text) dictionaryFromFile dictName = (V.fromList . T.lines) <$> readFile dictName diff --git a/src/HFIG/Lovecraftian.hs b/src/HFIG/Lovecraftian.hs index 3f5fc1b..9f0cda6 100644 --- a/src/HFIG/Lovecraftian.hs +++ b/src/HFIG/Lovecraftian.hs @@ -12,7 +12,6 @@ Yet not the best for preventing collision. -} module HFIG.Lovecraftian ( idgen - , collisionProbability ) where @@ -43,16 +42,3 @@ nameparts = [ , V.fromList ["a","e","i","u","o"] , V.fromList ["","","","","","","","","","","d","g","h","l","lb","lbh","n","r","rc","rh","s","sh","ss","st","sz","th","tl","x","xr","xz"] ] - --- | Approximate collision probability other n generated name with complexity --- parameter equal to l --- --- For example if you generate 1000 words randomly with complexity parameter 4 --- We estimate the probability of collision to 3.85% --- --- This is a nice helper function to use when you want to estimate the optimal --- length of your ids -collisionProbability :: Double -- ^ nb of generated names - -> Double -- ^ length parameter used - -> Double -collisionProbability = Helpers.collisionProbability nameparts