The WebCrawler class parses the HTML code it receives, locates URLs inside it, and puts them into a URL queue (passed from the parent), while emitting signals to the parent to create new nodes and edges between them.
More...
#include <webcrawler.h>
|
void | parse (QNetworkReply *reply) |
| Called from Graph when the network reply for a new page download has finished, to do the actual parsing of that page's HTML source from the reply byte array. First, the whole HTTP reply is read into a QString called page. Then the page string is parsed, searching for URL substrings.
|
|
void | newLink (int s, QUrl target, bool enqueue_to_frontier) |
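| Undocumented in the source. Judging from the signature only: handles a newly found link from node s to the URL target; enqueue_to_frontier presumably controls whether target is also added to the URL queue for crawling (to be confirmed against the implementation).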
|
|
|
| WebCrawler (QQueue< QUrl > *urlQueue, const QUrl &startUrl, const QStringList &urlPatternsIncluded, const QStringList &urlPatternsExcluded, const QStringList &linkClasses, const int &maxNodes, const int &maxLinksPerPage, const bool &intLinks=true, const bool &childLinks=true, const bool &parentLinks=false, const bool &selfLinks=false, const bool &extLinksIncluded=false, const bool &extLinksCrawl=false, const bool &socialLinks=false, const int &delayBetween=0) |
| Constructor, called from the parent Graph thread. Initializes the member variables.
|
|
| ~WebCrawler () |
|
The WebCrawler class parses the HTML code it receives, locates URLs inside it, and puts them into a URL queue (passed from the parent), while emitting signals to the parent to create new nodes and edges between them.
◆ WebCrawler()
WebCrawler::WebCrawler |
( |
QQueue< QUrl > * |
urlQueue, |
|
|
const QUrl & |
startUrl, |
|
|
const QStringList & |
urlPatternsIncluded, |
|
|
const QStringList & |
urlPatternsExcluded, |
|
|
const QStringList & |
linkClasses, |
|
|
const int & |
maxN, |
|
|
const int & |
maxLinksPerPage, |
|
|
const bool & |
intLinks = true , |
|
|
const bool & |
childLinks = true , |
|
|
const bool & |
parentLinks = false , |
|
|
const bool & |
selfLinks = false , |
|
|
const bool & |
extLinksIncluded = false , |
|
|
const bool & |
extLinksCrawl = false , |
|
|
const bool & |
socialLinks = false , |
|
|
const int & |
delayBetween = 0 |
|
) |
| |
Constructor, called from the parent Graph thread. Initializes the member variables.
- Parameters
-
startUrl | |
maxN | |
maxLinksPerPage | |
extLinks | |
intLinks | |
◆ ~WebCrawler()
WebCrawler::~WebCrawler |
( |
| ) |
|
◆ finished
void WebCrawler::finished |
( |
QString |
| ) |
|
|
signal |
◆ newLink
void WebCrawler::newLink |
( |
int |
s, |
|
|
QUrl |
target, |
|
|
bool |
enqueue_to_frontier |
|
) |
| |
|
slot |
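Undocumented in the source. Judging from the signature only: handles a newly found link from node s to the URL target; enqueue_to_frontier presumably controls whether target is also added to the URL queue for crawling (to be confirmed against the implementation).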
- Parameters
-
s | |
target | |
enqueue_to_frontier | |
◆ parse
void WebCrawler::parse |
( |
QNetworkReply * |
reply | ) |
|
|
slot |
Called from Graph when the network reply for a new page download has finished, to do the actual parsing of that page's HTML source from the reply byte array. First, the whole HTTP reply is read into a QString called page. Then the page string is parsed, searching for URL substrings.
- Parameters
-
◆ signalCreateEdge
void WebCrawler::signalCreateEdge |
( |
const int & |
source, |
|
|
const int & |
target |
|
) |
| |
|
signal |
◆ signalCreateNode
void WebCrawler::signalCreateNode |
( |
const int & |
no, |
|
|
const QString & |
url, |
|
|
const bool & |
signalMW = false |
|
) |
| |
|
signal |
◆ signalStartSpider
void WebCrawler::signalStartSpider |
( |
| ) |
|
|
signal |
◆ constIterator
QStringList::const_iterator WebCrawler::constIterator |
|
private |
◆ knownUrls
QMap<QUrl, int> WebCrawler::knownUrls |
|
private |
◆ m_childLinks
bool WebCrawler::m_childLinks |
|
private |
◆ m_delayBetween
int WebCrawler::m_delayBetween |
|
private |
◆ m_discoveredNodes
int WebCrawler::m_discoveredNodes |
|
private |
◆ m_extLinksCrawl
bool WebCrawler::m_extLinksCrawl |
|
private |
◆ m_extLinksIncluded
bool WebCrawler::m_extLinksIncluded |
|
private |
◆ m_initialUrl
QUrl WebCrawler::m_initialUrl |
|
private |
◆ m_intLinks
bool WebCrawler::m_intLinks |
|
private |
◆ m_linkClassAllowed
bool WebCrawler::m_linkClassAllowed |
|
private |
◆ m_linkClasses
QStringList WebCrawler::m_linkClasses |
|
private |
◆ m_maxLinksPerPage
int WebCrawler::m_maxLinksPerPage |
|
private |
◆ m_maxUrls
int WebCrawler::m_maxUrls |
|
private |
◆ m_parentLinks
bool WebCrawler::m_parentLinks |
|
private |
◆ m_selfLinks
bool WebCrawler::m_selfLinks |
|
private |
◆ m_socialLinks
bool WebCrawler::m_socialLinks |
|
private |
◆ m_socialLinksExcluded
QStringList WebCrawler::m_socialLinksExcluded |
|
private |
◆ m_urlIsSocial
bool WebCrawler::m_urlIsSocial |
|
private |
◆ m_urlPatternAllowed
bool WebCrawler::m_urlPatternAllowed |
|
private |
◆ m_urlPatternNotAllowed
bool WebCrawler::m_urlPatternNotAllowed |
|
private |
◆ m_urlPatternsExcluded
QStringList WebCrawler::m_urlPatternsExcluded |
|
private |
◆ m_urlPatternsIncluded
QStringList WebCrawler::m_urlPatternsIncluded |
|
private |
◆ m_urlQueue
QQueue<QUrl>* WebCrawler::m_urlQueue |
|
private |
◆ urlPattern
QString WebCrawler::urlPattern |
|
private |
The documentation for this class was generated from the following files: