webcrawler.h
Go to the documentation of this file.
1 /***************************************************************************
2  SocNetV: Social Network Visualizer
3  version: 2.5
4  Written in Qt
5 
6  webcrawler.h - description
7  -------------------
8  copyright : (C) 2005-2018 by Dimitris B. Kalamaras
9  project site : http://socnetv.org
10 
11  ***************************************************************************/
12 
13 /*******************************************************************************
14 * This program is free software: you can redistribute it and/or modify *
15 * it under the terms of the GNU General Public License as published by *
16 * the Free Software Foundation, either version 3 of the License, or *
17 * (at your option) any later version. *
18 * *
19 * This program is distributed in the hope that it will be useful, *
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
22 * GNU General Public License for more details. *
23 * *
24 * You should have received a copy of the GNU General Public License *
25 * along with this program. If not, see <http://www.gnu.org/licenses/>. *
26 ********************************************************************************/
27 
28 #ifndef WEBCRAWLER_H
29 #define WEBCRAWLER_H
30 
31 #include <QNetworkReply>
32 #include <QUrl>
33 
// Forward declarations of the Qt network classes named by the crawler classes below.
34 class QNetworkAccessManager;
35 class QNetworkRequest;
36 
// NOTE(review): a using-directive at header scope is injected into every file that
// includes this header. No std name is used in the visible part of this header, so it
// looks removable -- confirm that no includer relies on it before dropping it.
37 using namespace std;
38 
39 
// WebCrawler_Parser: QObject-based half of the web crawler that receives network
// replies (parse slot) and announces nodes and edges through signals.
// Only the declarations are visible here; the behavior lives in the implementation file.
// NOTE(review): this listing skips file lines 43-44, 69-71 and 78-80. The symbol index
// after the listing places m_maxNodes, m_discoveredNodes, m_maxLinksPerPage (69-71) and
// m_urlPatternAllowed, m_urlPatternNotAllowed, m_linkClassAllowed (78-80) on those lines.
40 class WebCrawler_Parser : public QObject {
41  Q_OBJECT
42 public:
    // Configures the parser before crawling.
    //   seed                 - starting URL, passed as a string (stored type is QUrl m_seed)
    //   urlPatternsIncluded  - URL patterns to include
    //   urlPatternsExcluded  - URL patterns to exclude
    //   linkClasses          - link classes to consider
    //   maxNodes             - node limit; maxLinksPerPage - per-page link limit
    //   extLinks / intLinks / selfLinks - flags for external, internal and self links
    // Parameter meanings are inferred from their names -- confirm in the implementation.
    // NOTE(review): int/bool parameters are taken by const reference; by value would be
    // the conventional form for scalars.
45  void load (const QString &seed,
46  const QStringList &urlPatternsIncluded,
47  const QStringList &urlPatternsExcluded,
48  const QStringList &linkClasses,
49  const int &maxNodes,
50  const int &maxLinksPerPage,
51  const bool &extLinks,
52  const bool &intLinks,
53  const bool &selfLinks);
54 
55 public slots:
    // Receives a finished network reply. Same signature as the WebCrawler_Spider::parse
    // signal, so presumably connected to it -- the connection is not visible here.
56  void parse(QNetworkReply *reply);
    // Handles a link from node number s to the URL target; enqueue_to_frontier
    // presumably controls whether target is queued for crawling -- TODO confirm.
57  void newLink(int s, QUrl target, bool enqueue_to_frontier);
58 signals:
    // Requests creation of node number `no` for `url`; signalMW defaults to false.
59  void signalCreateNode(const int &no,
60  const QString &url,
61  const bool &signalMW=false);
    // Requests creation of an edge between two node numbers.
62  void signalCreateEdge (const int &source, const int &target);
    // Takes no arguments; by its name, asks the spider to start fetching.
63  void startSpider();
    // Carries a QString payload; its content is defined by the emitter (not visible here).
64  void finished (QString);
65 private:
    // Byte buffer; presumably holds the reply body being parsed -- TODO confirm.
66  QByteArray ba;
    // URL -> int map; presumably the node number assigned to each discovered URL.
67  QMap <QUrl, int> knownUrls;
68  QUrl m_seed;
    // Stored counterparts of the load() arguments with matching names.
72  bool m_extLinks, m_intLinks, m_selfLinks ;
73  QStringList m_urlPatternsIncluded;
74  QString urlPattern;
75  QStringList m_urlPatternsExcluded;
76  QStringList m_linkClasses;
    // NOTE(review): an iterator kept as a member looks like loop scratch state;
    // a local variable in the implementation may be sufficient -- confirm usage.
77  QStringList::const_iterator constIterator;
81 };
82 
83 
// WebCrawler_Spider: QObject-based half of the web crawler that issues HTTP requests
// (get slot) and hands finished replies on through the parse signal.
// Only the declarations are visible here; the behavior lives in the implementation file.
// NOTE(review): this listing skips file lines 87-88 and 106-109. The symbol index after
// the listing places m_maxNodes, m_visitedNodes, m_wait_msecs and m_delayedRequests on
// lines 106-109.
84 class WebCrawler_Spider : public QObject {
85  Q_OBJECT
86 public:
    // Configures the spider before crawling.
    //   seed            - starting URL as a string
    //   maxNodes        - node limit
    //   delayedRequests - flag; presumably enables a wait between requests
    //                     (see m_wait_msecs in the symbol index) -- TODO confirm.
    // NOTE(review): int/bool parameters are taken by const reference; by value would be
    // the conventional form for scalars.
89  void load (const QString &seed,
90  const int &maxNodes,
91  const bool &delayedRequests);
92 
93 public slots:
    // Takes no arguments; presumably fetches the next URL -- TODO confirm.
94  void get();
    // Receives the reply of a completed request.
95  void httpFinished(QNetworkReply *reply);
96 
97 signals:
    // Forwards a reply for parsing. Same signature as the WebCrawler_Parser::parse slot.
98  void parse(QNetworkReply *reply);
    // Carries a QString payload; its content is defined by the emitter (not visible here).
99  void finished (QString);
100 private:
    // Raw pointers: ownership and lifetime are not visible in this header -- confirm in
    // the implementation who creates and deletes these objects.
101  QNetworkAccessManager *http;
102  QNetworkRequest request;
103  QNetworkReply *reply;
104  QUrl currentUrl ;
    // Seed is kept as a QString here, whereas WebCrawler_Parser stores it as a QUrl.
105  QString m_seed;
110 
111 };
112 
113 #endif
QNetworkAccessManager * http
Definition: webcrawler.h:101
WebCrawler_Spider
Definition: webcrawler.h:84
bool m_urlPatternAllowed
Definition: webcrawler.h:78
QString urlPattern
Definition: webcrawler.h:74
QNetworkRequest request
Definition: webcrawler.h:102
int m_maxNodes
Definition: webcrawler.h:106
QNetworkReply * reply
Definition: webcrawler.h:103
QUrl currentUrl
Definition: webcrawler.h:104
bool m_linkClassAllowed
Definition: webcrawler.h:80
int m_wait_msecs
Definition: webcrawler.h:108
QStringList m_urlPatternsExcluded
Definition: webcrawler.h:75
WebCrawler_Parser
Definition: webcrawler.h:40
int m_discoveredNodes
Definition: webcrawler.h:70
int m_maxNodes
Definition: webcrawler.h:69
QStringList m_urlPatternsIncluded
Definition: webcrawler.h:73
QStringList m_linkClasses
Definition: webcrawler.h:76
QUrl m_seed
Definition: webcrawler.h:68
QByteArray ba
Definition: webcrawler.h:66
QString m_seed
Definition: webcrawler.h:105
QMap< QUrl, int > knownUrls
Definition: webcrawler.h:67
bool m_delayedRequests
Definition: webcrawler.h:109
int m_maxLinksPerPage
Definition: webcrawler.h:71
bool m_urlPatternNotAllowed
Definition: webcrawler.h:79
int m_visitedNodes
Definition: webcrawler.h:107
bool m_selfLinks
Definition: webcrawler.h:72
QStringList::const_iterator constIterator
Definition: webcrawler.h:77