module Hag
( module Hag
) where
import qualified Data.ByteString.Lazy as L
import Data.Char
import Data.Csv
import Data.Either
import Data.List
import qualified Data.Map as M
import qualified Data.PSQueue as PS
import qualified Data.Text as T
import Data.Text.Encoding
import qualified Data.Text.IO as TI
import qualified Data.Vector as V
import Debug.Trace
import Helpers
import NLP.Tokenize
import Preprocess
import qualified System.Directory as S
import System.Environment
import Tweets
type FeatureMap = M.Map String Float
parseCsv :: T.Text -> Either String (V.Vector Tweet)
parseCsv text = decodeWith
defaultDecodeOptions {decDelimiter = fromIntegral $ ord '\t' }
NoHeader
(L.fromStrict $
encodeUtf8 text)
getFiles :: FilePath -> IO [FilePath]
getFiles dir = S.getDirectoryContents dir
>>= return . map (dir ++) . filter (`notElem` [".", ".."])
extractFeatures :: Tweet -> FeatureMap
extractFeatures tweet = bagOfWords
where
preprocess = V.filter (`notElem` ["USER", "RT"])
. V.fromList
. tokenize
. tMessage
bagOfWords = frequency
. V.filter (/= "")
. V.filter (`notElem` stopWords)
. V.map (map toLower)
. preprocess
$ tweet
frequency :: V.Vector String -> FeatureMap
frequency = V.foldl' countItem M.empty
countItem :: M.Map String Float -> String -> FeatureMap
countItem myMap item = M.insertWith (+) item 1 myMap
insertInMap :: M.Map Tweet FeatureMap
-> Tweet
-> M.Map Tweet FeatureMap
insertInMap oldMap tweet = M.insert tweet val oldMap
where val = extractFeatures tweet
getNeighbors :: (V.Vector Tweet, V.Vector Tweet)
-> V.Vector (Tweet, PS.PSQ Tweet Float)
getNeighbors (v1,v2) = V.map (featureIntersection dictionary) v1
where dictionary = V.foldl insertInMap M.empty v2 :: M.Map Tweet FeatureMap
featureIntersection :: M.Map Tweet FeatureMap
-> Tweet
-> (Tweet, PS.PSQ Tweet Float)
featureIntersection tweetMap tweet = (tweet, mini)
where
mini = PS.fromList
$ M.elems
$ M.mapWithKey (mergeTweetFeatures cosineDistance tweet) tweetMap
mergeTweetFeatures :: (FeatureMap -> FeatureMap -> Float)
-> Tweet
-> Tweet
-> FeatureMap
-> PS.Binding Tweet Float
mergeTweetFeatures distF t1 t2 dictionary = queue
where featuresT1 = extractFeatures t1
featuresT2 = extractFeatures t2
distance = distF featuresT1 featuresT2
queue = t2 PS.:-> distance
cosineDistance :: FeatureMap -> FeatureMap -> Float
cosineDistance t1 t2 = 1000 * (mySum / (wordsInT1 * wordsInT2))
where
wordsInT1 = M.foldl (+) 0 t1
wordsInT2 = M.foldl (+) 0 t2
intersection = M.elems $ M.intersectionWith (*) t1 t2
mySum = foldl (+) 0 intersection
idftf :: FeatureMap -> FeatureMap -> FeatureMap
idftf grandDict miniDict = M.mapWithKey (iFrequency grandDict) miniDict
iFrequency :: FeatureMap -> String -> Float -> Float
iFrequency dict word freq = freq * (log (totalNumberOfWords / freqWord))
where freqWord = M.findWithDefault 1 word dict
totalNumberOfWords = M.foldl (+) 0 dict
compareLabels :: Int -> V.Vector (Tweet,PS.PSQ Tweet Float) -> V.Vector Float
compareLabels k vec = V.map
(\(a,b) -> if (tLabel a) == getLabel k b then 1 else 0)
vec
compareLabelsForScheme :: [V.Vector (Tweet,PS.PSQ Tweet Float)] -> Int -> [Float]
compareLabelsForScheme vecs k = map (getAccuracy . compareLabels k) vecs
getLabel :: Int -> PS.PSQ Tweet Float -> String
getLabel k queue = if agg >= nonAgg then "aggressive" else "non_aggressive"
where tweets = queueTake k queue
labels = map tLabel tweets
agg = length $ filter (== "aggressive") labels
nonAgg = length $ filter (== "non_aggressive") labels
getAccuracy :: V.Vector Float -> Float
getAccuracy vec = (V.foldl (+) 0 vec) / fromIntegral (V.length vec)
main :: IO ()
main = do
(dir:_) <- getArgs
files <- getFiles dir
csvs <- mapM TI.readFile $ sort files
let
processedCsvs = map preprocess csvs
r = map parseCsv processedCsvs
tweets = rights r
scheme = mkCrossValScheme tweets
allNeighbors = map getNeighbors scheme
ks = [1..100]
comparedTweets = map (compareLabelsForScheme allNeighbors) ks
results = encode comparedTweets
header = encode
$ map (("fold_" ++) . show) ([1..10] :: [Integer])
L.writeFile "resultsK.csv" $ header `L.append` results