words

Build Status

Haskell code that:

todo

import Control.Lens
import Data.Char (isSpace)
import Pipes (runEffect, (>->), Producer)
import Pipes.HTTP
import Pipes.Text.Encoding (utf8)
import Pipes.Text.IO (fromHandle)
import Protolude

import qualified Control.Foldl as L
import qualified Data.Map.Strict as Map
import qualified Data.Text as Text
import qualified Data.Text.IO as Text
import qualified Pipes.Prelude as Pipes

A fold that takes a stream of Text and produces a word count.

fwords :: L.Fold Text (Map Text Int)
fwords = L.Fold step begin done
  where
    step (acc,rem0) a = (Map.unionWith (+) acc (toMap . words $ txt),rem1)
      where
        (txt, rem1) = splitLastSpace (rem0<>a)
        unpunct = Text.split (not . (`Protolude.elem` ['a'..'z']))
        words x =
            (filter (/="") . concat)
            ((unpunct . Text.toLower) <$> Text.words x)
        toMap x = Map.unionsWith (+) (map (`Map.singleton` 1) x)
    begin  = (Map.empty, mempty)
    done (acc,rem) = case rem of
        "" -> acc
        _  -> Map.unionWith (+) acc (Map.singleton rem 1)

splitLastSpace is needed to differentiate the final bit of a text chunk in case a word has split between chunks. None of the lower level functions in Data.Text quite cuts it.

splitLastSpace x = case Text.findIndex isSpace (Text.reverse x) of
    Nothing -> (x,"")
    Just i  -> (Text.reverse revx, Text.reverse revrem)
      where
        (revrem,revx) = Text.splitAt i (Text.reverse x)

Take a chunk of text and output a list of words and the last non-space text chunk

toWords :: Text -> ([Text], Text)
toWords t = (words txt, rem)
  where
    (txt, rem) = splitLastSpace t
    unpunct = Text.split (not . (`Protolude.elem` ['a'..'z']))
    words x =
        (filter (/="") . concat)
        ((unpunct . Text.toLower) <$> Text.words x)
wordsFromFile :: FilePath -> IO (Map Text Int)
wordsFromFile f =
    withFile f ReadMode $ \h ->
        runEffect $ L.purely Pipes.fold fwords
        (fromHandle h)
faves :: Int -> Map Text Int -> [(Text,Int)]
faves n =
    take n .
    sortBy (\(_,x) (_,y) -> compare y x) .
    Map.toList
httpWords f = do
    req <- parseRequest f
    man <- newManager tlsManagerSettings
    withHTTP req man $ \resp ->
        L.purely Pipes.fold fwords (void $ view utf8 (responseBody resp))
mkTable :: [(Text,Int)] -> Text
mkTable ws = h <> sep <> b <> sep <> t
  where
    sep = "\n"
    h = "\n<table>"
    t = "</table>"
    b = Text.concat
        ((\x -> "<tr>\n" <> x <> "\n</tr>\n") .
          (\(w,n) -> "<th>" <> w <> "</th>\n" <> "<th>\n" <>
            show n <> "\n</th>\n") <$> ws)
main :: IO ()
main = do
    ws <- httpWords "http://www.gutenberg.org/files/4300/4300-0.txt"
    putStrLn ("Top 100 words being calced ..." :: Text)
    -- print $ faves 100 ws
    Text.writeFile "other/table.md" $
      mkTable $ faves 100 ws

Word Frequency Output

the 15130
of 8262
and 7287
a 6582
to 5042
in 5006
he 4227
his 3331
i 2995
s 2840
that 2795
with 2562
it 2530
was 2134
on 2129
you 2084
for 1963
her 1785
him 1525
is 1462
all 1340
at 1305
by 1291
said 1208
as 1198
she 1190
from 1103
they 1053
or 1039
bloom 1000
what 976
me 947
not 914
out 899
be 897
my 838
up 830
had 814
there 793
one 743
like 730
their 720
mr 719
but 702
have 699
no 691
t 678
them 672
an 659
so 619
o 603
then 579
stephen 571
if 564
when 555
are 552
about 542
which 525
who 521
were 511
your 496
this 493
old 492
we 474
says 473
do 454
down 452
man 451
over 443
too 442
now 441
see 435
after 428
did 403
two 391
would 385
time 380
off 371
back 362
yes 359
will 357
other 334
where 334
into 330
eyes 329
know 328
good 321
more 317
those 317
some 316
could 312
hand 309
its 305
m 303
father 298
here 296
our 295
street 293
has 291
little 290

usage

To build & run:

stack install && example-words

To build, run & render:

stack build --copy-bins --exec "example-words" --exec "pandoc -f markdown+lhs -i src/words.lhs -t html -o index.html --filter pandoc-include --mathjax" --exec "pandoc -f markdown+lhs -i src/words.lhs -t markdown -o readme.md --filter pandoc-include --mathjax" --exec "echo Yah, it succeeded" --file-watch