We present, for pedagogical purposes, a simple Haskell implementation of MapReduce. MapReduce is a technology idea popularized by Google. The polymorphic type signature specifies exactly what the input functions passed in by the user (mapFn
and reduceFn
) are supposed to do. Of course, a real MapReduce implementation would be parallel across multiple machines, fault tolerant, use external sort etc.
{-# LANGUAGE ScopedTypeVariables #-}
import Data.List(groupBy, sortOn);
mapReduce :: forall a key value b . Ord key => (a -> [(key,value)]) -> (key -> [(a,value)] -> b) -> [a] -> [b];
mapReduce mapFn reduceFn input = map (uncurry reduceFn) $ shuffle $ do { -- list monad
x :: a <- input;
y :: (key,value) <- mapFn x;
return (x,y); -- all pairs
};
-- profiling reveals the majority of the computation time is spent here, not too surprising.
shuffle :: forall a key value . Ord key => [(a,(key,value))] -> [(key,[(a,value)])];
shuffle = let {
get_a :: (a,(key,value)) -> a;
get_a (a1,_) = a1;
get_key :: (a,(key,value)) -> key;
get_key (_,(k,_)) = k;
get_value :: (a,(key,value)) -> value;
get_value (_,(_,v)) = v;
rearrange :: [(a,(key,value))] -> (key,[(a,value)]);
rearrange l = (get_key $ head l, zip (map get_a l) (map get_value l));
} in map rearrange . groupBy (equating get_key) . sortOn get_key; -- point-free style
-- regular list sort 364.75 sec
-- Seq.unstableSortBy = 440.98 sec
-- cf Data.Ord.comparing
equating :: Eq b => (a -> b) -> a -> a -> Bool;
equating f x y = (f x) == (f y);
No comments :
Post a Comment