webcrawler.h
Go to the documentation of this file.
1 /***************************************************************************
2  SocNetV: Social Network Visualizer
3  version: 2.5
4  Written in Qt
5 
6  webcrawler.h - description
7  -------------------
8  copyright : (C) 2005-2019 by Dimitris B. Kalamaras
9  project site : https://socnetv.org
10 
11  ***************************************************************************/
12 
13 /*******************************************************************************
14 * This program is free software: you can redistribute it and/or modify *
15 * it under the terms of the GNU General Public License as published by *
16 * the Free Software Foundation, either version 3 of the License, or *
17 * (at your option) any later version. *
18 * *
19 * This program is distributed in the hope that it will be useful, *
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
22 * GNU General Public License for more details. *
23 * *
24 * You should have received a copy of the GNU General Public License *
25 * along with this program. If not, see <http://www.gnu.org/licenses/>. *
26 ********************************************************************************/
27 
28 #ifndef WEBCRAWLER_H
29 #define WEBCRAWLER_H
30 
31 #include <QNetworkReply>
32 #include <QUrl>
33 
// Forward declarations. In the lines visible here QNetworkRequest is used only
// by const reference (signal getUrl) and QNetworkAccessManager appears only in
// commented-out code, so no full definitions are needed at this point.
34 class QNetworkAccessManager;
35 class QNetworkRequest;
36 
// NOTE(review): a using-directive at header scope applies to every file that
// includes this header. No unqualified std name is used in the lines visible
// here, so it may be removable -- check the including sources before removing.
37 using namespace std;
38 
39 
// WebCrawler_Parser: QObject-derived class that declares a configuration entry
// point (load), slots receiving a network reply and a discovered link, and
// signals announcing nodes, edges, a spider start request and completion.
// NOTE(review): the numbering embedded in this listing skips 43-44, 73-75,
// 78-79, 81-84 and 92-94, so some declarations of this class are not visible
// here; the index at the end of the page names members at several of those lines.
40 class WebCrawler_Parser : public QObject {
41  Q_OBJECT
42 public:
    // Takes the crawl settings: the seed string, the URL pattern lists to
    // include and to exclude, the link classes, two integer limits (nodes and
    // links per page) and seven boolean switches for link kinds.
    // Presumably these are copied into the m_* members below -- the definition
    // is not part of this header; TODO confirm against the implementation file.
    // NOTE(review): the int/bool parameters are passed by const reference;
    // by-value is the usual choice, but changing it would alter the signature.
45  void load (const QString &seed,
46  const QStringList &urlPatternsIncluded,
47  const QStringList &urlPatternsExcluded,
48  const QStringList &linkClasses,
49  const int &maxNodes,
50  const int &maxLinksPerPage,
51  const bool &intLinks,
52  const bool &childLinks,
53  const bool &parentLinks,
54  const bool &selfLinks,
55  const bool &extLinksIncluded,
56  const bool &extLinksCrawl,
57  const bool &socialLinks);
58 
59 public slots:
    // Slot taking a QNetworkReply pointer. Its signature matches the signal
    // WebCrawler_Spider::parse declared in this file; presumably the two are
    // connected elsewhere -- TODO confirm.
60  void parse(QNetworkReply *reply);
    // Slot taking an int, a target QUrl and a bool. Presumably s is the number
    // of the source node and enqueue_to_frontier selects whether target is
    // queued for a later visit -- inferred from the names only; TODO confirm.
61  void newLink(int s, QUrl target, bool enqueue_to_frontier);
62 signals:
    // Carries a node number and its url; signalMW defaults to false.
63  void signalCreateNode(const int &no,
64  const QString &url,
65  const bool &signalMW=false);
    // Carries a pair of ints named source and target.
66  void signalCreateEdge (const int &source, const int &target);
    // Parameterless signal; presumably asks the spider to start -- TODO confirm.
67  void startSpider();
    // Carries a single QString; its content is not established by this header.
68  void finished (QString);
69 private:
    // Byte buffer; presumably holds reply data being parsed -- TODO confirm.
70  QByteArray ba;
    // Maps a QUrl to an int; presumably the node number given to each URL
    // already seen -- TODO confirm.
71  QMap <QUrl, int> knownUrls;
    // Seed kept as a QUrl here (WebCrawler_Spider keeps its m_seed as QString).
72  QUrl m_seed;
76 
    // Boolean members named like the corresponding load() parameters.
77  bool m_intLinks;
80  bool m_selfLinks ;
85 
    // String lists named like the corresponding load() parameters, plus
    // m_socialLinksExcluded, which has no counterpart among those parameters.
86  QStringList m_urlPatternsIncluded;
    // Single pattern string; presumably scratch state while iterating the
    // pattern lists -- TODO confirm.
87  QString urlPattern;
88  QStringList m_urlPatternsExcluded;
89  QStringList m_linkClasses;
90  QStringList m_socialLinksExcluded;
    // Iterator over a QStringList, stored as a member rather than a local.
91  QStringList::const_iterator constIterator;
95 };
96 
97 
// WebCrawler_Spider: QObject-derived class that declares a configuration entry
// point (load), a visitUrls slot, and signals carrying a network request, a
// network reply and a completion string.
// NOTE(review): the numbering embedded in this listing skips 101-102 and
// 123-126, so some declarations are not visible here; the index at the end of
// the page lists m_maxNodes, m_visitedNodes, m_wait_msecs and m_delayedRequests
// at lines 123-126.
98 class WebCrawler_Spider : public QObject {
99  Q_OBJECT
100 public:
    // Takes a pointer to a WebCrawler_Parser, the seed string, an integer node
    // limit and a delayedRequests switch. How they are used is not visible in
    // this header. The network-manager parameter is commented out.
    // NOTE(review): whether wc_parser may be null and who owns it cannot be
    // determined from this declaration -- TODO confirm.
103  void load (
104  //QNetworkAccessManager *NetworkManager,
105  WebCrawler_Parser *wc_parser,
106  const QString &seed,
107  const int &maxNodes,
108  const bool &delayedRequests);
109 
110 public slots:
    // Parameterless slot; presumably walks the queued URLs and issues
    // requests for them -- inferred from the name only; TODO confirm.
111  void visitUrls();
112 
113 signals:
    // Carries a QNetworkRequest by const reference.
114  void getUrl(const QNetworkRequest &request);
    // Carries a QNetworkReply pointer; same signature as the slot
    // WebCrawler_Parser::parse declared in this file.
115  void parse(QNetworkReply *reply);
    // Carries a single QString; its content is not established by this header.
116  void finished (QString);
117 private:
118  //QNetworkAccessManager *manager;
119 
    // Raw pointer to a reply; ownership is not established by this header.
120  QNetworkReply *reply;
121  QUrl currentUrl ;
    // Seed kept as a QString here (WebCrawler_Parser keeps its m_seed as QUrl).
122  QString m_seed;
127 
128 };
129 
130 #endif
Definition: webcrawler.h:98
bool m_urlPatternAllowed
Definition: webcrawler.h:92
QString urlPattern
Definition: webcrawler.h:87
bool m_extLinksIncluded
Definition: webcrawler.h:81
int m_maxNodes
Definition: webcrawler.h:123
QNetworkReply * reply
Definition: webcrawler.h:120
bool m_parentLinks
Definition: webcrawler.h:79
QUrl currentUrl
Definition: webcrawler.h:121
bool m_linkClassAllowed
Definition: webcrawler.h:94
int m_wait_msecs
Definition: webcrawler.h:125
QStringList m_urlPatternsExcluded
Definition: webcrawler.h:88
bool m_extLinksCrawl
Definition: webcrawler.h:82
bool m_intLinks
Definition: webcrawler.h:77
Definition: webcrawler.h:40
int m_discoveredNodes
Definition: webcrawler.h:74
int m_maxNodes
Definition: webcrawler.h:73
QStringList m_socialLinksExcluded
Definition: webcrawler.h:90
QStringList m_urlPatternsIncluded
Definition: webcrawler.h:86
QStringList m_linkClasses
Definition: webcrawler.h:89
QUrl m_seed
Definition: webcrawler.h:72
bool m_urlIsSocial
Definition: webcrawler.h:84
QByteArray ba
Definition: webcrawler.h:70
bool m_socialLinks
Definition: webcrawler.h:83
QString m_seed
Definition: webcrawler.h:122
QMap< QUrl, int > knownUrls
Definition: webcrawler.h:71
bool m_delayedRequests
Definition: webcrawler.h:126
int m_maxLinksPerPage
Definition: webcrawler.h:75
bool m_urlPatternNotAllowed
Definition: webcrawler.h:93
int m_visitedNodes
Definition: webcrawler.h:124
bool m_selfLinks
Definition: webcrawler.h:80
QStringList::const_iterator constIterator
Definition: webcrawler.h:91
bool m_childLinks
Definition: webcrawler.h:78