-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspider.cpp
More file actions
120 lines (106 loc) · 4.14 KB
/
spider.cpp
File metadata and controls
120 lines (106 loc) · 4.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/*
spider.cpp
This program searches an html file for all references and follows them.
(c) 2006-07-02 by Thorsten Staerk
(c) 2006-2010
build and run it like this:
cmake . && make -j8 && ./spider
Progress:
2009-12-13: 2 hours: Tidy html to xhtml first.
2010-07-22: 1 hour: react to <a href>, but not to <a name>
*/
#include <qstring.h>
#include <QXmlInputSource>
#include <QXmlLocator>
#include <qfile.h>
#include "parser.h"
#include "myqxmlerrorhandler.h"
#include <kdebug.h>
#include <kurl.h>
#include <tidy.h>
#include <buffio.h>
#include <stdio.h>
#include <errno.h>
#include <iostream>
/**
 * Parse a piece of XHTML text and descend into every reference found in it.
 *
 * The text is fed to a QXmlSimpleReader whose content handler (Parsert)
 * collects the hrefs; afterwards the function calls itself once per collected
 * href.
 *
 * @param content  text handed to the XML reader
 * @param depth    recursion level of this call; it is not evaluated here, only
 *                 passed on (incremented by one) to the recursive calls
 */
void parsecontent(QString content, int depth)
{
    kDebug() << "Entering function";

    // The handlers and the input source have automatic storage so they are
    // released on every return path (they used to be leaked on each call).
    // They are declared before the reader, which only borrows pointers to
    // them, so they outlive it.
    Parsert handler;
    myqxmlerrorhandler errorHandler;
    QXmlInputSource source;
    source.setData(content);

    QXmlSimpleReader reader;
    reader.setContentHandler( &handler );
    reader.setErrorHandler( &errorHandler );
    // The return value is deliberately not checked: whatever hrefs the handler
    // collected before a parse error are still inspected below.
    reader.parse( &source );

    for (int i=0; i<handler.hrefcount(); i++)
    {
        kDebug() << "inspecting next trunk";
        QString descent=handler.href(i);
        kDebug() << descent << endl;
        // NOTE(review): the href string itself is parsed as content here; the
        // referenced document is not loaded - confirm this is intended.
        parsecontent(descent, depth+1);
    }
}
/**
 * Take html code and return it converted to xhtml code.
 *
 * @param input  NUL-terminated html text; a null pointer is treated like an
 *               empty string
 * @return the xhtml text produced by tidy, or a null QString if tidy reported
 *         a severe error (negative return code)
 */
QString tidy(char* input)
{
    kDebug() << "Entering function";
    // the following code is (c) Charles Reitzel and Dave Raggett, see the package tidy
    TidyBuffer output = {0};
    TidyBuffer errbuf = {0};
    QString result;
    int rc = -1;
    Bool ok;
    const char* html = input ? input : "";               // never hand a null pointer to tidy

    TidyDoc tdoc = tidyCreate();                         // Initialize "document"
    kDebug() << "Tidying:\t" << html;

    ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes );      // Convert to XHTML
    if ( ok ) rc = tidySetErrorBuffer( tdoc, &errbuf );  // Capture diagnostics
    if ( rc >= 0 ) rc = tidyParseString( tdoc, html );   // Parse the input
    if ( rc >= 0 ) rc = tidyCleanAndRepair( tdoc );      // Tidy it up!
    if ( rc >= 0 ) rc = tidyRunDiagnostics( tdoc );      // Kvetch
    if ( rc > 1 )                                        // If error, force output.
        rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 );
    if ( rc >= 0 ) rc = tidySaveBuffer( tdoc, &output ); // Pretty Print

    if ( rc >= 0 )
    {
        // The buffers hold unsigned bytes; cast to const char* so the debug
        // stream prints the text rather than the pointer value.
        if ( rc > 0 && errbuf.bp )
            kDebug() << "\nDiagnostics:\n\n" << reinterpret_cast<const char*>(errbuf.bp);

        // Convert straight from tidy's NUL-terminated output buffer. The former
        // malloc/snprintf copy lost the last character, put the terminating NUL
        // into the result and was never freed. fromLocal8Bit() returns a null
        // QString for a null pointer, so an empty output buffer is safe.
        result = QString::fromLocal8Bit( reinterpret_cast<const char*>(output.bp) );
    }
    else
        printf( "A severe error (%d) occurred.\n", rc );

    tidyBufFree( &output );
    tidyBufFree( &errbuf );
    tidyRelease( tdoc );
    return result;
}
/**
 * Entry point: read the html file named by the first command line argument,
 * convert it to xhtml with tidy() and hand the result to parsecontent().
 *
 * @return 0 on success, 1 if the argument is missing or the file cannot be
 *         opened
 */
int main(int argc, char *argv[])
{
    if ( argc<2 )
    {
        std::cout << "You did not call spider with the correct syntax.\n";
        std::cout << "Syntax: spider <html file>" << std::endl;
        std::cout << "Example: spider index.htm" << std::endl;
        return 1;
    }

    QFile inputfile(argv[1]);
    // Stop here if the file is unreadable instead of silently processing
    // empty content.
    if ( !inputfile.open(QIODevice::ReadOnly) )
    {
        std::cerr << "Could not open " << argv[1] << " for reading." << std::endl;
        return 1;
    }

    QByteArray inputfilecontent = inputfile.readAll();
    // QByteArray::data() is NUL-terminated, which is what tidy() expects.
    QString tidycontent = tidy(inputfilecontent.data());
    parsecontent(tidycontent,0);
    return 0;
}