The WebCrawler class parses the HTML code it receives, locates URLs inside it and puts them into a URL queue (passed from the parent), while emitting signals to the parent to create new nodes and edges between them.
#include <webcrawler.h>
Public Slots

void parse (QNetworkReply *reply)
    Called from Graph when a network reply for a new page download has finished, to do the actual parsing of that page's HTML source from the reply byte array. First, it reads everything from the HTTP reply into a QString called page; then it parses the page string, searching for URL substrings.

void newLink (int s, QUrl target, bool enqueue_to_frontier)
    Handles a new link found while parsing a page: emits signals to the parent to create the corresponding node and edge and, if enqueue_to_frontier is true, adds the target URL to the URL queue for further crawling.

Public Member Functions

WebCrawler (QQueue< QUrl > *urlQueue, const QUrl &startUrl, const QStringList &urlPatternsIncluded, const QStringList &urlPatternsExcluded, const QStringList &linkClasses, const int &maxNodes, const int &maxLinksPerPage, const bool &intLinks=true, const bool &childLinks=true, const bool &parentLinks=false, const bool &selfLinks=false, const bool &extLinksIncluded=false, const bool &extLinksCrawl=false, const bool &socialLinks=false, const int &delayBetween=0)
    Constructor, called from the parent Graph thread. Initializes the crawler's variables.

~WebCrawler ()
Detailed Description

The WebCrawler class parses the HTML code it receives, locates URLs inside it and puts them into a URL queue (passed from the parent), while emitting signals to the parent to create new nodes and edges between them.
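How the crawler is driven lies outside this class, but as a rough illustration of the flow described above, a parent object could dequeue URLs from the shared queue, download them, and feed each finished reply to parse(). This is a minimal sketch under assumptions: the crawlNext helper, the download step and its ownership rules are made up for illustration; only the parse() slot comes from this API.

// Hypothetical parent-side crawl step (illustration only; not part of WebCrawler).
#include <QNetworkAccessManager>
#include <QNetworkReply>
#include <QNetworkRequest>
#include <QObject>
#include <QQueue>
#include <QUrl>
#include "webcrawler.h"

void crawlNext(QQueue<QUrl> *frontier, QNetworkAccessManager *nam, WebCrawler *crawler)
{
    if (frontier->isEmpty())
        return;                                        // nothing left in the frontier

    const QUrl url = frontier->dequeue();              // next URL to visit
    QNetworkReply *reply = nam->get(QNetworkRequest(url));

    // Once the download has finished, hand the reply to the crawler's parse() slot,
    // which will extract URLs and push new ones back into the frontier.
    QObject::connect(reply, &QNetworkReply::finished, crawler, [crawler, reply]() {
        crawler->parse(reply);
        reply->deleteLater();
    });
}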
◆ WebCrawler()

WebCrawler::WebCrawler (QQueue< QUrl > *urlQueue,
                        const QUrl &startUrl,
                        const QStringList &urlPatternsIncluded,
                        const QStringList &urlPatternsExcluded,
                        const QStringList &linkClasses,
                        const int &maxNodes,
                        const int &maxLinksPerPage,
                        const bool &intLinks = true,
                        const bool &childLinks = true,
                        const bool &parentLinks = false,
                        const bool &selfLinks = false,
                        const bool &extLinksIncluded = false,
                        const bool &extLinksCrawl = false,
                        const bool &socialLinks = false,
                        const int &delayBetween = 0)

Constructor, called from the parent Graph thread. Initializes the crawler's variables from the given arguments.
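As a usage illustration, the parent Graph thread could construct the crawler roughly as below and connect its signals to its own node/edge handling. Everything except the constructor and signal signatures (the setup helper, the argument values, the lambda bodies, the threading choice) is an assumption, not part of this API.

// Hypothetical parent-side setup (illustration only).
#include <QObject>
#include <QQueue>
#include <QString>
#include <QStringList>
#include <QThread>
#include <QUrl>
#include "webcrawler.h"

void setupCrawler(QObject *graph, QQueue<QUrl> *frontier, QThread *workerThread)
{
    WebCrawler *crawler = new WebCrawler(
        frontier,                     // urlQueue: crawl frontier shared with the parent
        QUrl("https://example.org"),  // startUrl (example value)
        QStringList(),                // urlPatternsIncluded
        QStringList(),                // urlPatternsExcluded
        QStringList(),                // linkClasses
        600,                          // maxNodes (example value)
        10,                           // maxLinksPerPage (example value)
        true,                         // intLinks
        true,                         // childLinks
        false,                        // parentLinks
        false,                        // selfLinks
        false,                        // extLinksIncluded
        false,                        // extLinksCrawl
        false,                        // socialLinks
        0);                           // delayBetween

    crawler->moveToThread(workerThread);

    // The crawler asks the parent to create nodes and edges while it parses pages.
    QObject::connect(crawler, &WebCrawler::signalCreateNode, graph,
                     [](const int &no, const QString &url, const bool &signalMW) {
                         // parent-side handling would create node 'no' labelled with 'url'
                         Q_UNUSED(no); Q_UNUSED(url); Q_UNUSED(signalMW);
                     });
    QObject::connect(crawler, &WebCrawler::signalCreateEdge, graph,
                     [](const int &source, const int &target) {
                         // parent-side handling would create an edge source -> target
                         Q_UNUSED(source); Q_UNUSED(target);
                     });

    workerThread->start();
}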
◆ ~WebCrawler()

WebCrawler::~WebCrawler ()
◆ finished

void WebCrawler::finished (QString)   [signal]
◆ newLink

void WebCrawler::newLink (int s, QUrl target, bool enqueue_to_frontier)   [slot]

Handles a new link found while parsing a page: emits signals to the parent to create the corresponding node and edge and, if enqueue_to_frontier is true, adds the target URL to the URL queue for further crawling.

Parameters
    s                       number of the node (page) on which the link was found
    target                  the URL the link points to
    enqueue_to_frontier     whether to add target to the URL queue so that it is crawled as well
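The body of this slot is not reproduced in the generated docs; the following rough sketch, built only on the class description and the private members listed further below (knownUrls, m_discoveredNodes, m_maxUrls, m_urlQueue), indicates what such a slot typically does. The limit check, the finished message and the exact order of signals are assumptions.

// Rough sketch only -- not the actual WebCrawler::newLink implementation.
void WebCrawler::newLink(int s, QUrl target, bool enqueue_to_frontier)
{
    // Assumed semantics: stop discovering once the node limit is reached.
    if (m_discoveredNodes >= m_maxUrls) {
        emit finished("maximum nodes reached");        // message is an assumption
        return;
    }

    // A URL seen before only gets a new edge, not a new node.
    if (knownUrls.contains(target)) {
        emit signalCreateEdge(s, knownUrls.value(target));
        return;
    }

    // Register the new URL and ask the parent to create a node and an edge for it.
    m_discoveredNodes++;
    knownUrls[target] = m_discoveredNodes;
    emit signalCreateNode(m_discoveredNodes, target.toString(), false);
    emit signalCreateEdge(s, m_discoveredNodes);

    // Optionally add the URL to the crawl frontier so it gets downloaded and parsed too.
    if (enqueue_to_frontier) {
        m_urlQueue->enqueue(target);
        emit signalStartSpider();                      // assumed: nudge the parent to keep crawling
    }
}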
◆ parse

void WebCrawler::parse (QNetworkReply *reply)   [slot]

Called from Graph when a network reply for a new page download has finished, to do the actual parsing of that page's HTML source from the reply byte array. First, it reads everything from the HTTP reply into a QString called page; then it parses the page string, searching for URL substrings.

Parameters
    reply    the network reply holding the downloaded page
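A condensed sketch of the approach described above: read the whole reply into a QString and scan it for href substrings, resolving each against the page URL and handing it to newLink(). The naive href scan, the currentNode lookup and the omission of all pattern and link-type filtering are simplifications made for illustration.

// Condensed sketch of the described approach -- not the real implementation.
#include <QNetworkReply>
#include <QString>
#include <QUrl>
#include "webcrawler.h"

void WebCrawler::parse(QNetworkReply *reply)
{
    // Read everything from the HTTP reply into a QString called page.
    QString page = QString::fromUtf8(reply->readAll());
    const QUrl baseUrl = reply->url();                    // page the links were found on
    const int currentNode = knownUrls.value(baseUrl, 1);  // assumed: node number of this page

    int linksFound = 0;
    int pos = 0;
    // Parse the page string, searching for URL substrings inside href="..." attributes.
    while ((pos = page.indexOf("href=\"", pos)) != -1 && linksFound < m_maxLinksPerPage) {
        pos += 6;                                         // skip past href="
        const int end = page.indexOf('"', pos);
        if (end == -1)
            break;
        const QString link = page.mid(pos, end - pos);
        pos = end + 1;

        // Resolve relative links against the page URL and hand them over to newLink().
        const QUrl target = baseUrl.resolved(QUrl(link));
        if (!target.isValid())
            continue;
        newLink(currentNode, target, true);
        ++linksFound;
    }
    reply->deleteLater();
}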
◆ signalCreateEdge

void WebCrawler::signalCreateEdge (const int &source, const int &target)   [signal]

Signal emitted to the parent to create a new edge from node source to node target.
◆ signalCreateNode

void WebCrawler::signalCreateNode (const int &no, const QString &url, const bool &signalMW = false)   [signal]

Signal emitted to the parent to create a new node numbered no for the given url.
◆ signalStartSpider

void WebCrawler::signalStartSpider ()   [signal]
◆ constIterator

QStringList::const_iterator WebCrawler::constIterator   [private]

◆ knownUrls

QMap<QUrl, int> WebCrawler::knownUrls   [private]

◆ m_childLinks

bool WebCrawler::m_childLinks   [private]

◆ m_delayBetween

int WebCrawler::m_delayBetween   [private]

◆ m_discoveredNodes

int WebCrawler::m_discoveredNodes   [private]

◆ m_extLinksCrawl

bool WebCrawler::m_extLinksCrawl   [private]

◆ m_extLinksIncluded

bool WebCrawler::m_extLinksIncluded   [private]

◆ m_initialUrl

QUrl WebCrawler::m_initialUrl   [private]

◆ m_intLinks

bool WebCrawler::m_intLinks   [private]

◆ m_linkClassAllowed

bool WebCrawler::m_linkClassAllowed   [private]

◆ m_linkClasses

QStringList WebCrawler::m_linkClasses   [private]

◆ m_maxLinksPerPage

int WebCrawler::m_maxLinksPerPage   [private]

◆ m_maxUrls

int WebCrawler::m_maxUrls   [private]

◆ m_parentLinks

bool WebCrawler::m_parentLinks   [private]

◆ m_selfLinks

bool WebCrawler::m_selfLinks   [private]

◆ m_socialLinks

bool WebCrawler::m_socialLinks   [private]

◆ m_socialLinksExcluded

QStringList WebCrawler::m_socialLinksExcluded   [private]

◆ m_urlIsSocial

bool WebCrawler::m_urlIsSocial   [private]

◆ m_urlPatternAllowed

bool WebCrawler::m_urlPatternAllowed   [private]

◆ m_urlPatternNotAllowed

bool WebCrawler::m_urlPatternNotAllowed   [private]

◆ m_urlPatternsExcluded

QStringList WebCrawler::m_urlPatternsExcluded   [private]

◆ m_urlPatternsIncluded

QStringList WebCrawler::m_urlPatternsIncluded   [private]

◆ m_urlQueue

QQueue<QUrl>* WebCrawler::m_urlQueue   [private]

◆ urlPattern

QString WebCrawler::urlPattern   [private]
The documentation for this class was generated from the following files: