-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMain.hs
97 lines (79 loc) · 3.43 KB
/
Main.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
import Network.HTTP.Client
import Network.HTTP.Client.TLS
import Network.URI
import System.Environment
import System.Exit (die)
import Text.HTML.Parser
import Text.Read (readMaybe)
import Data.Text.Encoding
import Data.Text.Encoding.Error (lenientDecode)
import qualified Data.Text as T
import qualified Data.Text.IO as T.IO
import qualified Data.ByteString.Lazy as BSL
import Control.Concurrent
import Control.Concurrent.Async
import Control.Monad
import Control.Exception
import Data.Maybe
import Data.List
import qualified Data.Map.Strict as Map
-- | A URL kept as a plain 'String'.
type URL = String
-- | Label attached to a URL: the parent's label, a @-@, then the text that
-- directly follows the opening @<a>@ tag (see 'findAllHrefs').
type InnerText = T.Text
-- | Link distance from the start URL; the start URL itself has depth 0.
type Depth = Int
-- | What is recorded about a URL: its label and its depth.
type Info = (InnerText, Depth)
-- | A URL paired with its recorded information.
type URLInfo = (URL, Info)
-- | Crawl a website level by level and print every URL that was discovered.
--
-- Expects exactly two command-line arguments: the start URL and the maximum
-- depth (an integer). With any other arguments the program exits with a
-- usage message instead of failing on a pattern match or a bad 'read'.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [websiteURL, maxDepth]
      | Just d <- readMaybe maxDepth -> crawl websiteURL d
    _ -> do
      prog <- getProgName
      die ("Usage: " ++ prog ++ " <url> <max-depth>")
  where
    crawl :: URL -> Depth -> IO ()
    crawl websiteURL d = do
      -- The start page is labelled "首页" ("home page") and has depth 0.
      let rootURL = (websiteURL, ("首页", 0)) :: URLInfo
      -- Every URL seen so far, shared by all concurrent fetches.
      globalMap <- newMVar (Map.fromList [rootURL])
      -- One crawl level: fetch every frontier page that is still below the
      -- depth limit and collect the links that were new at the time.
      let expand frontier _ =
            concat <$> mapConcurrently (`testHTTP` globalMap)
                         (filter ((< d) . snd . snd) frontier)
      foldM_ expand [rootURL] [1 .. d]
      visited <- readMVar globalMap
      mapM_ (`printURLInfo` d) (Map.toList visited)
-------------- Print URL Info ---------------
-- | Print one crawled entry as a line: the URL, a tab, then its label.
--
-- The entry's depth and the maximum-depth argument are currently ignored.
printURLInfo :: URLInfo -> Int -> IO ()
printURLInfo (url, (label, _)) _ =
  putStr (url ++ "\t") >> T.IO.putStrLn label
------- Global Variables and Settings --------
-- | Connection-manager settings for plain @http://@ URLs, with the proxy
-- configured through 'proxyEnvironment' (no fallback proxy given).
globalHTTP :: ManagerSettings
globalHTTP = managerSetProxy envProxy defaultManagerSettings
  where
    envProxy = proxyEnvironment Nothing

-- | Connection-manager settings for @https://@ URLs, with the proxy
-- configured through 'proxyEnvironment' (no fallback proxy given).
globalHTTPS :: ManagerSettings
globalHTTPS = managerSetProxy envProxy tlsManagerSettings
  where
    envProxy = proxyEnvironment Nothing
-------------- URL Processing --------------
-- | Resolve a link found on a page against that page's URL.
--
-- If @rel@ is a relative reference and @root@ parses as an absolute URI, the
-- result is @rel@ resolved relative to @root@. In every other case (@rel@ is
-- already absolute, or either argument does not parse) @rel@ is returned
-- unchanged, so the function never fails.
makeAbsoluteURL :: String -> String -> String
makeAbsoluteURL root rel =
  case (parseRelativeReference rel, parseURI root) of
    (Just ref, Just base) ->
      -- 'uriToString id' rather than 'show': the 'Show' instance of 'URI'
      -- masks the password of a userinfo component, which would corrupt
      -- such URLs.
      uriToString id (ref `relativeTo` base) ""
    _ -> rel
-------------- Processing --------------
-- | Value of the first @href@ attribute in the list, or the empty string
-- when there is none.
findHref :: [Attr] -> String
findHref attrs =
  fromMaybe "" (listToMaybe [T.unpack target | Attr "href" target <- attrs])
-- | Collect the links of a page from its token stream.
--
-- Only an opening @<a>@ tag that is immediately followed by a text token
-- counts as a link, and only when its @href@ is non-empty. Each link is
-- made absolute against the page URL, labelled with the page's label, a
-- @-@ and the link text, and given the page's depth plus one.
findAllHrefs :: URLInfo -> [Token] -> [URLInfo]
findAllHrefs page@(pageURL, (pageLabel, depth)) (TagOpen "a" attrs : ContentText label : rest)
  | null href = findAllHrefs page rest
  | otherwise = child : findAllHrefs page rest
  where
    href = findHref attrs
    child =
      ( makeAbsoluteURL pageURL href
      , (T.concat [pageLabel, T.singleton '-', label], depth + 1)
      )
findAllHrefs _ [] = []
findAllHrefs page (_ : rest) = findAllHrefs page rest
-------------- Testing --------
-- | Fetch one page and register the links on it that were not seen before.
--
-- The page is downloaded, decoded as UTF-8 (invalid bytes are replaced
-- rather than raising an error) and scanned with 'findAllHrefs'. Every link
-- whose URL is not yet a key of the shared map is inserted into the map and
-- returned, in page order. A URL that occurs several times on the same page
-- is registered and returned once, with the label of its first occurrence.
--
-- Crawling is best-effort: any exception (bad URL, network failure, ...)
-- makes this page contribute no links instead of aborting the crawl.
testHTTP :: URLInfo -> MVar (Map.Map URL Info) -> IO [URLInfo]
testHTTP url_info@(url, _) gMap =
  processResponse `catch` \(_ :: SomeException) -> return []
  where
    processResponse = do
      let setting = if "https://" `isPrefixOf` url then globalHTTPS else globalHTTP
      -- NOTE(review): a fresh manager is created for every request; sharing
      -- one manager would need a change to this function's interface.
      man <- newManager setting
      req <- parseRequest url
      response <- httpLbs req man
      let body = decodeUtf8With lenientDecode (BSL.toStrict (responseBody response))
          pageURLs = findAllHrefs url_info (parseTokens body)
      -- Force decoding and tokenising here: outside the lock, and inside
      -- the exception handler rather than lazily in some other thread.
      _ <- evaluate (length pageURLs)
      -- 'modifyMVar' restores the map if anything below throws.
      modifyMVar gMap $ \seen -> do
        let (seen', revFresh) = foldl' register (seen, []) pageURLs
        -- Do not leave an unevaluated chain of inserts inside the MVar.
        merged <- evaluate seen'
        return (merged, reverse revFresh)

    -- Add a link to the known set unless its URL is already present.
    register (known, acc) entry@(link, info)
      | link `Map.member` known = (known, acc)
      | otherwise = (Map.insert link info known, entry : acc)