{-# LANGUAGE RecordPuns #-} {-# OPTIONS -fno-warn-name-shadowing #-} module Text.PageIO.Index where import Codec.Text.IConv import Control.Exception (try) import Control.Monad (unless) import Data.Char (isSpace) import Data.List (nub, sort, intersperse) import Data.Monoid (mappend) import Database.SQLite import System.Environment import System.Time import Debug.Trace import Text.PageIO.Extract import Text.PageIO.Parser (packLBS) import Text.PageIO.Transform import Text.PageIO.Types import Text.Printf import qualified Data.ByteString.Lazy.Char8 as L import qualified Data.ByteString.UTF8 as UTF8 import qualified Text.PageIO.LabelMap as LM addTables dbh = mapM_ (execStatement_ dbh) [ "CREATE VIRTUAL TABLE doc USING FTS3 ( head TEXT, body TEXT )" , "CREATE TABLE idx ( pri_nid TEXT, sec_nid TEXT, doc_name INT )" , "CREATE UNIQUE INDEX doc_name_idx ON idx (doc_name)" , "CREATE TABLE txt ( body TEXT )" ] addColumn dbh cols = (`mapM_` cols) $ \col -> mapM_ (execStatement_ dbh) [ "ALTER TABLE idx ADD COLUMN '" ++ col ++ "' COLLATE NOCASE" , "CREATE INDEX '" ++ col ++ "_idx' ON idx ('" ++ col ++ "' COLLATE NOCASE)" ] indexDocs :: Sheet -> [Doc] -> IO () indexDocs MkSheet{ sheetName, sheetFields, sheetFrames, sheetBox = MkBox{ boxRight, boxBottom } } docs = do env <- getEnvironment let dbName = case lookup "PIO_DB" env of Just n -> n _ -> LM.fromLabel sheetName putStr dbName dbh <- openConnection $ dbName ++ ".db" -- XXX - Suspicious overload: pri_nid means "cols" and sec_nid means "rows" addTables dbh addColumn dbh columns dbhDetails <- case lookup "PIO_DETAILS_DB" env of Just n -> do h <- openConnection $ n ++ ".db" addTables h addColumn h ("fulltext":detailColumns) execStatement_ h "BEGIN EXCLUSIVE TRANSACTION" return (Just h) _ -> return Nothing execStatement_ dbh "BEGIN EXCLUSIVE TRANSACTION" (`mapM_` ([1..] `zip` docs)) $ \(docIndex, MkDoc meta contents) -> try $ do putStr "." let fields = resultFields meta body = UTF8.toString . packLBS $ convert "CP950" "UTF-8" contents let valOf x = LM.lookup (LM.toLabel x) fields year = case valOf "year_roc" of Just year_roc -> valToInt year_roc + 11 _ -> case valOf "year" of Just year -> valToInt year _ -> 0 r_date | year == 0 = "0" | otherwise = let month = maybe 1 valToInt (valOf "month") day = maybe 1 valToInt (valOf "day") in fromYMD (fixY2K year) month day fixY2K year | year >= 1900 = year | year >= 70 = 1900 + year | otherwise = 2000 + year let attrsVanilla = [ (LM.fromLabel lbl, dropWhile isSpace (decode val)) | (lbl, val) <- LM.toList fields ] -- XXX - Blob? decode bound = UTF8.toString . packLBS $ convert "CP950" "UTF-8" (L.fromChunks [bound]) attrs = (("r_id", LM.fromLabel sheetName):("r_date", r_date):attrsVanilla) attrHead = unlines (map snd attrs) cols = concatMap ((++ "'") . (", '" ++) . fst) attrs prms = concatMap ((++ "'") . (", '" ++) . snd) attrs execStatement_ dbh $ concat [ "INSERT INTO doc (head, body) VALUES ('" , attrHead , "', '" , body , "')" ] execStatement_ dbh $ concat [ "INSERT INTO idx (doc_name, pri_nid, sec_nid" , cols , ") VALUES (last_insert_rowid(), " , show boxRight , ", " , show boxBottom , prms , ")" ] let blocks = LM.elems $ resultBlocks meta case dbhDetails of Just h -> do execStatement_ h $ concat [ "INSERT INTO txt (rowid, body) VALUES (" , show docIndex , ", '" , body , "')" ] (`mapM_` foldl blockProduct [] blocks) $ \(area, vals) -> try $ unless (LM.null vals) $ do let attrsBlock = [ (LM.fromLabel lbl, dropWhile isSpace (decode val)) | (lbl, val) <- LM.toList vals ] attrs' = attrs ++ map maybeFix attrsBlock attrHead' = unlines (map snd attrs') cols' = concatMap ((++ "'") . (", '" ++) . fst) attrs' prms' = concatMap ((++ "'") . (", '" ++) . snd) attrs' maybeFix ("expiry_date", date) = ("expiry_date", parseDate date) maybeFix (c@['F','D','S','D',x,'D'], date@(_:_)) | x == 'B' || x == 'E' || x == 'O' = (c, parseDate date) maybeFix x = x parseDate date = fromYMD y m d where i = read date y = 1911 + (i `div` 10000) m = i `mod` 10000 `div` 100 d = i `mod` 100 execStatement_ h $ concat [ "INSERT INTO doc (head, body) VALUES ('" , attrHead' , "', '" , concatMap decode (pageLines area) , "')" ] execStatement_ h $ concat [ "INSERT INTO idx (doc_name, fulltext, pri_nid, sec_nid" , cols' , ") VALUES (last_insert_rowid(), '" , show docIndex , "', " , show boxRight , ", " , show boxBottom , prms' , ")" ] return () _ -> return () execStatement_ dbh "COMMIT" case dbhDetails of Just h -> execStatement_ h "COMMIT" _ -> return Nothing closeConnection dbh putStrLn "done!" where columns = nub $ sort ("r_date":"r_id":map LM.fromLabel (LM.keys sheetFields)) detailColumns = nub $ sort ("r_date":"r_id":map LM.fromLabel (concatMap LM.keys (sheetFields : frameFields))) frameFields = concatMap (map blockFields . LM.elems . frameBlocks) sheetFrames blockProduct :: [(Area, LabelMap Bound)] -> BlockResult -> [(Area, LabelMap Bound)] blockProduct [] (MkBlockResult ys) = ys blockProduct xs (MkBlockResult []) = xs blockProduct xs (MkBlockResult ys) = [ (xa `mappend` ya, xb `mappend` yb) | (xa, xb) <- xs , (ya, yb) <- ys ] fromYMD :: Int -> Int -> Int -> String fromYMD y m d = case toClockTime cal of TOD sec _ -> show (succ (sec `div` 86400)) where cal = CalendarTime { ctYear = y , ctMonth = toEnum (m-1) , ctDay = d , ctHour = 0 , ctMin = 0 , ctSec = 0 , ctPicosec = 0 , ctWDay = Sunday , ctYDay = 0 , ctTZName = "UTC" , ctTZ = 0 , ctIsDST = False } {- import Codec.Binary.Base64.String import Data.Maybe (fromJust) import Data.UUID (generate, toStringUpper) import Network.URI import Text.HyperEstraier import Text.PageIO.Extract import Text.PageIO.Parser (packLBS) import Text.PageIO.Transform import Text.PageIO.Types import System.Environment import qualified Text.PageIO.LabelMap as LM import qualified Data.ByteString.Char8 as S import qualified Data.ByteString.Lazy.Char8 as L indexDocs :: Sheet -> [Doc] -> IO () indexDocs MkSheet{ sheetName, sheetBox = MkBox{ boxRight, boxBottom } } docs = do -- removeDirectoryRecursive "casket" withDatabase "casket" (Writer [Create [], WriteLock NoLock]) $ \db -> do withDatabase "Details" (Writer [Create[], Truncate [], WriteLock NoLock]) $ \db2 -> do (`mapM_` docs) $ \(MkDoc meta contents) -> try $ do doc <- newDocument uuid <- generate let fields = resultFields meta lns = S.lines . packLBS $ convert "CP950" "UTF-8" contents uri = parseURI $ "urn:uuid:" ++ (toStringUpper uuid) setURI doc uri (`mapM_` lns) $ addText doc . UTF8.toString let valOf x = LM.lookup (LM.toLabel x) fields year = case valOf "year_roc" of Just year_roc -> valToInt year_roc + 11 _ -> case valOf "year" of Just year -> valToInt year _ -> 0 r_date | year == 0 = Nothing | otherwise = let month = maybe 1 valToInt (valOf "month") day = maybe 1 valToInt (valOf "day") in Just $ printf "%04d-%02d-%02d 00:00:00" (fixY2K year) (month) (day) fixY2K year | year >= 1900 = year | year >= 70 = 1900 + year | otherwise = 2000 + year let addAttributes d = do (`mapM_` LM.toList fields) $ \(lbl, bound) -> try $ do setAttribute d (LM.fromLabel lbl) $ Just ( UTF8.toString . packLBS $ convert "CP950" "UTF-8" (L.fromChunks [bound]) ) setAttribute d "__text__" $ Just (encode . L.unpack $ contents) setAttribute d "__cols__" $ Just (show boxRight) setAttribute d "__rows__" $ Just (show boxBottom) setAttribute d "r_date" r_date setAttribute d "r_id" $ Just (LM.fromLabel sheetName) setAttribute d "__index_code__" $ Just "" maybe (return ()) (addHiddenText doc) r_date addHiddenText doc (LM.fromLabel sheetName) addAttributes doc putDocument db doc [] docID <- getId doc print (docID, fromJust uri) env <- getEnvironment unless (lookup "PIO" env == Just "1") $ do let blocks = LM.elems $ resultBlocks meta (`mapM_` foldl blockProduct [] blocks) $ \(area, vals) -> try . unless (LM.null vals) $ do doc2 <- newDocument uuid2 <- generate let uri2 = parseURI $ "urn:uuid:" ++ (toStringUpper uuid2) setURI doc2 uri2 (`mapM_` pageLines area) $ \val -> do addText doc2 $ ( UTF8.toString . packLBS $ convert "CP950" "UTF-8" (L.fromChunks [val]) ) (`mapM_` LM.toList vals) $ \(lbl, val) -> do let str = UTF8.toString . packLBS $ convert "CP950" "UTF-8" (L.fromChunks [val]) addHiddenText doc2 str setAttribute doc2 (LM.fromLabel lbl) $ Just str setAttribute doc2 "__index_code__" $ Just "" addAttributes doc2 putDocument db2 doc2 [] docID2 <- getId doc2 print (docID2, fromJust uri2) -}