#include <webcrawler.h>

+ Inheritance diagram for WebCrawler_Parser:
+ Collaboration diagram for WebCrawler_Parser:

Public Slots

void parse (QNetworkReply *reply)
 Called when NetworkManager has finished. This method does the actual parsing of each page's HTML source from the reply byte array. First, we read the entire HTTP reply into a QString called page. Then we parse the page string, searching for URL substrings. More...
 
void newLink (int s, QUrl target, bool enqueue_to_frontier)
 signals node creation - Called from wc_parser::load() More...
 

Signals

void signalCreateNode (const int &no, const QString &url, const bool &signalMW=false)
 
void signalCreateEdge (const int &source, const int &target)
 
void startSpider ()
 
void finished (QString)
 

Public Member Functions

 WebCrawler_Parser ()
 constructor called from parent thread - does nothing More...
 
 ~WebCrawler_Parser ()
 
void load (const QString &seed, const QStringList &urlPatternsIncluded, const QStringList &urlPatternsExcluded, const QStringList &linkClasses, const int &maxNodes, const int &maxLinksPerPage, const bool &intLinks, const bool &childLinks, const bool &parentLinks, const bool &selfLinks, const bool &extLinksIncluded, const bool &extLinksCrawl, const bool &socialLinks)
 Called from parent Graph thread. Initializes variables. More...
 

Private Attributes

QByteArray ba
 
QMap< QUrl, int > knownUrls
 
QUrl m_seed
 
int m_maxNodes
 
int m_discoveredNodes
 
int m_maxLinksPerPage
 
bool m_intLinks
 
bool m_childLinks
 
bool m_parentLinks
 
bool m_selfLinks
 
bool m_extLinksIncluded
 
bool m_extLinksCrawl
 
bool m_socialLinks
 
bool m_urlIsSocial
 
QStringList m_urlPatternsIncluded
 
QString urlPattern
 
QStringList m_urlPatternsExcluded
 
QStringList m_linkClasses
 
QStringList m_socialLinksExcluded
 
QStringList::const_iterator constIterator
 
bool m_urlPatternAllowed
 
bool m_urlPatternNotAllowed
 
bool m_linkClassAllowed
 

Constructor & Destructor Documentation

◆ WebCrawler_Parser()

WebCrawler_Parser::WebCrawler_Parser ( )

constructor called from parent thread - does nothing

◆ ~WebCrawler_Parser()

WebCrawler_Parser::~WebCrawler_Parser ( )

Member Function Documentation

◆ finished

void WebCrawler_Parser::finished ( QString  )
signal

◆ load()

void WebCrawler_Parser::load ( const QString &  url,
const QStringList &  urlPatternsIncluded,
const QStringList &  urlPatternsExcluded,
const QStringList &  linkClasses,
const int &  maxN,
const int &  maxLinksPerPage,
const bool &  intLinks,
const bool &  childLinks,
const bool &  parentLinks,
const bool &  selfLinks,
const bool &  extLinksIncluded,
const bool &  extLinksCrawl,
const bool &  socialLinks 
)

Called from parent Graph thread. Initializes variables.

Parameters
url
maxN
maxLinksPerPage
extLinksIncluded
extLinksCrawl
intLinks

◆ newLink

void WebCrawler_Parser::newLink ( int  s,
QUrl  target,
bool  enqueue_to_frontier 
)
slot

signals node creation - Called from wc_parser::load()

Parameters
s
target
enqueue_to_frontier

◆ parse

void WebCrawler_Parser::parse ( QNetworkReply *  reply)
slot

Called when NetworkManager has finished. This method does the actual parsing of each page's HTML source from the reply byte array. First, we read the entire HTTP reply into a QString called page. Then we parse the page string, searching for URL substrings.

Parameters
reply

◆ signalCreateEdge

void WebCrawler_Parser::signalCreateEdge ( const int &  source,
const int &  target 
)
signal

◆ signalCreateNode

void WebCrawler_Parser::signalCreateNode ( const int &  no,
const QString &  url,
const bool &  signalMW = false 
)
signal

◆ startSpider

void WebCrawler_Parser::startSpider ( )
signal

Member Data Documentation

◆ ba

QByteArray WebCrawler_Parser::ba
private

◆ constIterator

QStringList::const_iterator WebCrawler_Parser::constIterator
private

◆ knownUrls

QMap<QUrl, int> WebCrawler_Parser::knownUrls
private

◆ m_childLinks

bool WebCrawler_Parser::m_childLinks
private

◆ m_discoveredNodes

int WebCrawler_Parser::m_discoveredNodes
private

◆ m_extLinksCrawl

bool WebCrawler_Parser::m_extLinksCrawl
private

◆ m_extLinksIncluded

bool WebCrawler_Parser::m_extLinksIncluded
private

◆ m_intLinks

bool WebCrawler_Parser::m_intLinks
private

◆ m_linkClassAllowed

bool WebCrawler_Parser::m_linkClassAllowed
private

◆ m_linkClasses

QStringList WebCrawler_Parser::m_linkClasses
private

◆ m_maxLinksPerPage

int WebCrawler_Parser::m_maxLinksPerPage
private

◆ m_maxNodes

int WebCrawler_Parser::m_maxNodes
private

◆ m_parentLinks

bool WebCrawler_Parser::m_parentLinks
private

◆ m_seed

QUrl WebCrawler_Parser::m_seed
private

◆ m_selfLinks

bool WebCrawler_Parser::m_selfLinks
private

◆ m_socialLinks

bool WebCrawler_Parser::m_socialLinks
private

◆ m_socialLinksExcluded

QStringList WebCrawler_Parser::m_socialLinksExcluded
private

◆ m_urlIsSocial

bool WebCrawler_Parser::m_urlIsSocial
private

◆ m_urlPatternAllowed

bool WebCrawler_Parser::m_urlPatternAllowed
private

◆ m_urlPatternNotAllowed

bool WebCrawler_Parser::m_urlPatternNotAllowed
private

◆ m_urlPatternsExcluded

QStringList WebCrawler_Parser::m_urlPatternsExcluded
private

◆ m_urlPatternsIncluded

QStringList WebCrawler_Parser::m_urlPatternsIncluded
private

◆ urlPattern

QString WebCrawler_Parser::urlPattern
private

The documentation for this class was generated from the following files: