#include <webcrawler.h>

+ Inheritance diagram for WebCrawler_Parser:
+ Collaboration diagram for WebCrawler_Parser:

Public Slots

void parse (QNetworkReply *reply)
 Called from the WebCrawler_Spider::parse() signal when the HTTP request has finished. This method does the actual parsing of each page's HTML source from the reply byte array. First, the whole HTTP reply is read into a QString called page. Then the page string is parsed, searching for URL substrings. More...
 
void newLink (int s, QUrl target, bool enqueue_to_frontier)
 Signals node creation. Called from wc_parser::load(). More...
 

Signals

void signalCreateNode (const int &no, const QString &url, const bool &signalMW=false)
 
void signalCreateEdge (const int &source, const int &target)
 
void startSpider ()
 
void finished (QString)
 

Public Member Functions

 WebCrawler_Parser ()
 Constructor called from the parent thread; does nothing. More...
 
 ~WebCrawler_Parser ()
 
void load (const QString &seed, const QStringList &urlPatternsIncluded, const QStringList &urlPatternsExcluded, const QStringList &linkClasses, const int &maxNodes, const int &maxLinksPerPage, const bool &extLinks, const bool &intLinks, const bool &selfLinks)
 Called from the parent Graph thread. Initializes variables. More...
 

Private Attributes

QByteArray ba
 
QMap< QUrl, int > knownUrls
 
QUrl m_seed
 
int m_maxNodes
 
int m_discoveredNodes
 
int m_maxLinksPerPage
 
bool m_extLinks
 
bool m_intLinks
 
bool m_selfLinks
 
QStringList m_urlPatternsIncluded
 
QString urlPattern
 
QStringList m_urlPatternsExcluded
 
QStringList m_linkClasses
 
QStringList::const_iterator constIterator
 
bool m_urlPatternAllowed
 
bool m_urlPatternNotAllowed
 
bool m_linkClassAllowed
 

Constructor & Destructor Documentation

WebCrawler_Parser::WebCrawler_Parser ( )

Constructor called from the parent thread; does nothing.

WebCrawler_Parser::~WebCrawler_Parser ( )

Member Function Documentation

void WebCrawler_Parser::finished ( QString  )
signal
void WebCrawler_Parser::load ( const QString &  url,
const QStringList &  urlPatternsIncluded,
const QStringList &  urlPatternsExcluded,
const QStringList &  linkClasses,
const int &  maxN,
const int &  maxLinksPerPage,
const bool &  extLinks,
const bool &  intLinks,
const bool &  selfLinks 
)

Called from the parent Graph thread. Initializes variables.

Parameters
url
maxN
maxLinksPerPage
extLinks
intLinks
void WebCrawler_Parser::newLink ( int  s,
QUrl  target,
bool  enqueue_to_frontier 
)
slot

Signals node creation. Called from wc_parser::load().

Parameters
s
target
enqueue_to_frontier
void WebCrawler_Parser::parse ( QNetworkReply *  reply)
slot

Called from the WebCrawler_Spider::parse() signal when the HTTP request has finished. This method does the actual parsing of each page's HTML source from the reply byte array. First, the whole HTTP reply is read into a QString called page. Then the page string is parsed, searching for URL substrings.

Parameters
reply
void WebCrawler_Parser::signalCreateEdge ( const int &  source,
const int &  target 
)
signal
void WebCrawler_Parser::signalCreateNode ( const int &  no,
const QString &  url,
const bool &  signalMW = false 
)
signal
void WebCrawler_Parser::startSpider ( )
signal

Member Data Documentation

QByteArray WebCrawler_Parser::ba
private
QStringList::const_iterator WebCrawler_Parser::constIterator
private
QMap<QUrl, int> WebCrawler_Parser::knownUrls
private
int WebCrawler_Parser::m_discoveredNodes
private
bool WebCrawler_Parser::m_extLinks
private
bool WebCrawler_Parser::m_intLinks
private
bool WebCrawler_Parser::m_linkClassAllowed
private
QStringList WebCrawler_Parser::m_linkClasses
private
int WebCrawler_Parser::m_maxLinksPerPage
private
int WebCrawler_Parser::m_maxNodes
private
QUrl WebCrawler_Parser::m_seed
private
bool WebCrawler_Parser::m_selfLinks
private
bool WebCrawler_Parser::m_urlPatternAllowed
private
bool WebCrawler_Parser::m_urlPatternNotAllowed
private
QStringList WebCrawler_Parser::m_urlPatternsExcluded
private
QStringList WebCrawler_Parser::m_urlPatternsIncluded
private
QString WebCrawler_Parser::urlPattern
private

The documentation for this class was generated from the following files: