webcrawler.h
Go to the documentation of this file.
1 /***************************************************************************
2  SocNetV: Social Network Visualizer
3  version: 3.1
4  Written in Qt
5 
6  webcrawler.h - description
7  -------------------
8  copyright : (C) 2005-2023 by Dimitris B. Kalamaras
9  project site : https://socnetv.org
10 
11  ***************************************************************************/
12 
13 /*******************************************************************************
14 * This program is free software: you can redistribute it and/or modify *
15 * it under the terms of the GNU General Public License as published by *
16 * the Free Software Foundation, either version 3 of the License, or *
17 * (at your option) any later version. *
18 * *
19 * This program is distributed in the hope that it will be useful, *
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
22 * GNU General Public License for more details. *
23 * *
24 * You should have received a copy of the GNU General Public License *
25 * along with this program. If not, see <http://www.gnu.org/licenses/>. *
26 ********************************************************************************/
27 
28 #ifndef WEBCRAWLER_H
29 #define WEBCRAWLER_H
30 
31 #include <QNetworkReply>
32 #include <QQueue>
33 
34 QT_BEGIN_NAMESPACE
35 class QUrl;
36 QT_END_NAMESPACE
37 
38 using namespace std;
39 
45 class WebCrawler : public QObject {
46  Q_OBJECT
47  // QThread wc_spiderThread;
48 public:
49 
50  WebCrawler (
51  QQueue<QUrl> *urlQueue,
52  const QUrl &startUrl,
53  const QStringList &urlPatternsIncluded,
54  const QStringList &urlPatternsExcluded,
55  const QStringList &linkClasses,
56  const int &maxNodes,
57  const int &maxLinksPerPage,
58  const bool &intLinks = true,
59  const bool &childLinks = true,
60  const bool &parentLinks = false,
61  const bool &selfLinks = false,
62  const bool &extLinksIncluded = false,
63  const bool &extLinksCrawl = false,
64  const bool &socialLinks = false,
65  const int &delayBetween = 0
66  );
67 
68  ~WebCrawler();
69 
70 public slots:
71  void parse(QNetworkReply *reply);
72  void newLink(int s, QUrl target, bool enqueue_to_frontier);
73 
74 signals:
75  void signalCreateNode(const int &no,
76  const QString &url,
77  const bool &signalMW=false);
78  void signalCreateEdge (const int &source, const int &target);
80  void finished (QString);
81 
82 private:
83  QQueue<QUrl> *m_urlQueue;
84  QMap <QUrl, int> knownUrls;
86  int m_maxUrls;
89 
90  bool m_intLinks;
93  bool m_selfLinks ;
98 
100 
102  QString urlPattern;
104  QStringList m_linkClasses;
106  QStringList::const_iterator constIterator;
110 };
111 
112 
113 
114 #endif
The WebCrawler class parses the HTML code it receives, locates URLs inside it and puts them into a URL queue...
Definition: webcrawler.h:45
QStringList m_urlPatternsIncluded
Definition: webcrawler.h:101
bool m_urlPatternNotAllowed
Definition: webcrawler.h:108
QStringList m_socialLinksExcluded
Definition: webcrawler.h:105
QString urlPattern
Definition: webcrawler.h:102
bool m_childLinks
Definition: webcrawler.h:91
void finished(QString)
int m_maxUrls
Definition: webcrawler.h:86
bool m_extLinksCrawl
Definition: webcrawler.h:95
QUrl m_initialUrl
Definition: webcrawler.h:85
int m_maxLinksPerPage
Definition: webcrawler.h:88
bool m_parentLinks
Definition: webcrawler.h:92
int m_discoveredNodes
Definition: webcrawler.h:87
bool m_urlPatternAllowed
Definition: webcrawler.h:107
int m_delayBetween
Definition: webcrawler.h:99
bool m_extLinksIncluded
Definition: webcrawler.h:94
QStringList m_linkClasses
Definition: webcrawler.h:104
bool m_urlIsSocial
Definition: webcrawler.h:97
QQueue< QUrl > * m_urlQueue
Definition: webcrawler.h:83
bool m_intLinks
Definition: webcrawler.h:90
void signalStartSpider()
bool m_socialLinks
Definition: webcrawler.h:96
void signalCreateNode(const int &no, const QString &url, const bool &signalMW=false)
QStringList m_urlPatternsExcluded
Definition: webcrawler.h:103
QStringList::const_iterator constIterator
Definition: webcrawler.h:106
void signalCreateEdge(const int &source, const int &target)
bool m_selfLinks
Definition: webcrawler.h:93
QMap< QUrl, int > knownUrls
Definition: webcrawler.h:84
bool m_linkClassAllowed
Definition: webcrawler.h:109