31 #include <QNetworkReply>
51 QQueue<QUrl> *urlQueue,
53 const QStringList &urlPatternsIncluded,
54 const QStringList &urlPatternsExcluded,
55 const QStringList &linkClasses,
57 const int &maxLinksPerPage,
58 const bool &intLinks =
true,
59 const bool &childLinks =
true,
60 const bool &parentLinks =
false,
61 const bool &selfLinks =
false,
62 const bool &extLinksIncluded =
false,
63 const bool &extLinksCrawl =
false,
64 const bool &socialLinks =
false,
65 const int &delayBetween = 0
71 void parse(QNetworkReply *reply);
72 void newLink(
int s, QUrl target,
bool enqueue_to_frontier);
77 const bool &signalMW=
false);
The WebCrawler class parses the HTML code it receives, locates URLs inside it and puts them into a URL queue...
Definition: webcrawler.h:45
QStringList m_urlPatternsIncluded
Definition: webcrawler.h:101
bool m_urlPatternNotAllowed
Definition: webcrawler.h:108
QStringList m_socialLinksExcluded
Definition: webcrawler.h:105
QString urlPattern
Definition: webcrawler.h:102
bool m_childLinks
Definition: webcrawler.h:91
int m_maxUrls
Definition: webcrawler.h:86
bool m_extLinksCrawl
Definition: webcrawler.h:95
QUrl m_initialUrl
Definition: webcrawler.h:85
int m_maxLinksPerPage
Definition: webcrawler.h:88
bool m_parentLinks
Definition: webcrawler.h:92
int m_discoveredNodes
Definition: webcrawler.h:87
bool m_urlPatternAllowed
Definition: webcrawler.h:107
int m_delayBetween
Definition: webcrawler.h:99
bool m_extLinksIncluded
Definition: webcrawler.h:94
QStringList m_linkClasses
Definition: webcrawler.h:104
bool m_urlIsSocial
Definition: webcrawler.h:97
QQueue< QUrl > * m_urlQueue
Definition: webcrawler.h:83
bool m_intLinks
Definition: webcrawler.h:90
bool m_socialLinks
Definition: webcrawler.h:96
void signalCreateNode(const int &no, const QString &url, const bool &signalMW=false)
QStringList m_urlPatternsExcluded
Definition: webcrawler.h:103
QStringList::const_iterator constIterator
Definition: webcrawler.h:106
void signalCreateEdge(const int &source, const int &target)
bool m_selfLinks
Definition: webcrawler.h:93
QMap< QUrl, int > knownUrls
Definition: webcrawler.h:84
bool m_linkClassAllowed
Definition: webcrawler.h:109