The WebCrawler class parses the HTML code it receives, locates URLs inside it, and puts them into a URL queue (passed from the parent), while emitting signals to the parent to create new nodes and edges between them. More...

#include <webcrawler.h>

+ Inheritance diagram for WebCrawler:
+ Collaboration diagram for WebCrawler:

Public Slots

void parse (QNetworkReply *reply)
 Called from Graph when the network reply for a new page download has finished, to do the actual parsing of that page's HTML source from the reply byte array. First, the whole HTTP reply is read into a QString called page. Then the page string is parsed, searching for URL substrings. More...
 
void newLink (int s, QUrl target, bool enqueue_to_frontier)
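 Undocumented in the source (the original brief is "??"). Judging from the signature only, this slot handles a link from node s to the URL target, with enqueue_to_frontier presumably controlling whether target is also queued for crawling; confirm against the implementation. More...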
 

Signals

void signalCreateNode (const int &no, const QString &url, const bool &signalMW=false)
 
void signalCreateEdge (const int &source, const int &target)
 
void signalStartSpider ()
 
void finished (QString)
 

Public Member Functions

 WebCrawler (QQueue< QUrl > *urlQueue, const QUrl &startUrl, const QStringList &urlPatternsIncluded, const QStringList &urlPatternsExcluded, const QStringList &linkClasses, const int &maxNodes, const int &maxLinksPerPage, const bool &intLinks=true, const bool &childLinks=true, const bool &parentLinks=false, const bool &selfLinks=false, const bool &extLinksIncluded=false, const bool &extLinksCrawl=false, const bool &socialLinks=false, const int &delayBetween=0)
 Constructor, called from the parent Graph thread. Initializes the member variables. More...
 
 ~WebCrawler ()
 

Private Attributes

QQueue< QUrl > * m_urlQueue
 
QMap< QUrl, int > knownUrls
 
QUrl m_initialUrl
 
int m_maxUrls
 
int m_discoveredNodes
 
int m_maxLinksPerPage
 
bool m_intLinks
 
bool m_childLinks
 
bool m_parentLinks
 
bool m_selfLinks
 
bool m_extLinksIncluded
 
bool m_extLinksCrawl
 
bool m_socialLinks
 
bool m_urlIsSocial
 
int m_delayBetween
 
QStringList m_urlPatternsIncluded
 
QString urlPattern
 
QStringList m_urlPatternsExcluded
 
QStringList m_linkClasses
 
QStringList m_socialLinksExcluded
 
QStringList::const_iterator constIterator
 
bool m_urlPatternAllowed
 
bool m_urlPatternNotAllowed
 
bool m_linkClassAllowed
 

Detailed Description

The WebCrawler class parses the HTML code it receives, locates URLs inside it, and puts them into a URL queue (passed from the parent), while emitting signals to the parent to create new nodes and edges between them.

Constructor & Destructor Documentation

◆ WebCrawler()

WebCrawler::WebCrawler ( QQueue< QUrl > *  urlQueue,
const QUrl &  startUrl,
const QStringList &  urlPatternsIncluded,
const QStringList &  urlPatternsExcluded,
const QStringList &  linkClasses,
const int &  maxN,
const int &  maxLinksPerPage,
const bool &  intLinks = true,
const bool &  childLinks = true,
const bool &  parentLinks = false,
const bool &  selfLinks = false,
const bool &  extLinksIncluded = false,
const bool &  extLinksCrawl = false,
const bool &  socialLinks = false,
const int &  delayBetween = 0 
)

Constructor, called from the parent Graph thread. Initializes the member variables.

Parameters
startUrl
maxN
maxLinksPerPage
extLinks
intLinks

◆ ~WebCrawler()

WebCrawler::~WebCrawler ( )

Member Function Documentation

◆ finished

void WebCrawler::finished ( QString  )
signal

◆ newLink

void WebCrawler::newLink ( int  s,
QUrl  target,
bool  enqueue_to_frontier 
)
slot

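Undocumented in the source (the original description is "??"). Judging from the signature only, this slot handles a link from node s to the URL target, with enqueue_to_frontier presumably controlling whether target is also queued for crawling; confirm against the implementation.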

Parameters
s
target
enqueue_to_frontier

◆ parse

void WebCrawler::parse ( QNetworkReply *  reply)
slot

Called from Graph when the network reply for a new page download has finished, to do the actual parsing of that page's HTML source from the reply byte array. First, the whole HTTP reply is read into a QString called page. Then the page string is parsed, searching for URL substrings.

Parameters
reply

◆ signalCreateEdge

void WebCrawler::signalCreateEdge ( const int &  source,
const int &  target 
)
signal

◆ signalCreateNode

void WebCrawler::signalCreateNode ( const int &  no,
const QString &  url,
const bool &  signalMW = false 
)
signal

◆ signalStartSpider

void WebCrawler::signalStartSpider ( )
signal

Member Data Documentation

◆ constIterator

QStringList::const_iterator WebCrawler::constIterator
private

◆ knownUrls

QMap<QUrl, int> WebCrawler::knownUrls
private

◆ m_childLinks

bool WebCrawler::m_childLinks
private

◆ m_delayBetween

int WebCrawler::m_delayBetween
private

◆ m_discoveredNodes

int WebCrawler::m_discoveredNodes
private

◆ m_extLinksCrawl

bool WebCrawler::m_extLinksCrawl
private

◆ m_extLinksIncluded

bool WebCrawler::m_extLinksIncluded
private

◆ m_initialUrl

QUrl WebCrawler::m_initialUrl
private

◆ m_intLinks

bool WebCrawler::m_intLinks
private

◆ m_linkClassAllowed

bool WebCrawler::m_linkClassAllowed
private

◆ m_linkClasses

QStringList WebCrawler::m_linkClasses
private

◆ m_maxLinksPerPage

int WebCrawler::m_maxLinksPerPage
private

◆ m_maxUrls

int WebCrawler::m_maxUrls
private

◆ m_parentLinks

bool WebCrawler::m_parentLinks
private

◆ m_selfLinks

bool WebCrawler::m_selfLinks
private

◆ m_socialLinks

bool WebCrawler::m_socialLinks
private

◆ m_socialLinksExcluded

QStringList WebCrawler::m_socialLinksExcluded
private

◆ m_urlIsSocial

bool WebCrawler::m_urlIsSocial
private

◆ m_urlPatternAllowed

bool WebCrawler::m_urlPatternAllowed
private

◆ m_urlPatternNotAllowed

bool WebCrawler::m_urlPatternNotAllowed
private

◆ m_urlPatternsExcluded

QStringList WebCrawler::m_urlPatternsExcluded
private

◆ m_urlPatternsIncluded

QStringList WebCrawler::m_urlPatternsIncluded
private

◆ m_urlQueue

QQueue<QUrl>* WebCrawler::m_urlQueue
private

◆ urlPattern

QString WebCrawler::urlPattern
private

The documentation for this class was generated from the following files: