1. #!/usr/bin/perl -w
2. # archipol.cgi
3. # 20010716
4. # H.C.G. Druiven
5. # h.c.g.druiven@ub.rug.nl
6. use LWP::UserAgent;
7. use HTML::LinkExtor;
8. use URI;
9. use File::Basename;
10. use File::Path;
11. use strict;
12. no strict( 'refs'); # In de hashes kunnen ook referenties met alleen cijfers voorkomen
13. ###################### globals
14. my $g_debug; # Debug mode
15. my $g_html; # HTML mode, genereer output als HTML-pagina
16. my $g_nl; # in HTML-mode $g_nl= "
\n", anders $g_nl="\n";
17. my $g_search_engine_name; # De naam van deze zoekmachine
18. my $g_email; # Het e-mail-adres van de eigenaar van deze zoekmachine
19. my %g_deadlinks; # $g_deadlinks{ $link}{ $message}
20. # Hash met alle dode links met hun fout-melding
21. my $g_levels_deep; # Hoe diep mag in de directorie-structuur worden gzocht
22. my %g_inival; # Hierin alle variabelen uit 'archipol.ini'
23. my %g_search_tags; # Alle tags met namen van referenties waarnaar verder gezocht
24. # moet worden
25. my %g_do_not_search_extensions; # Files met deze extenties worden niet op links onderzocht
26. my %g_not_searched_extensions; # Na de zoekopdracht bevat deze hash alle extensies van files
27. # die niet op links zijn onderzocht
28. my %g_do_not_download_extensions; # Files met deze extensies worden niet ge'download'
29. my %g_do_not_process_files; # Files die niet op links moeten worden onderzocht
30. # Deze namen worden gelezen uit 'do_not_process.txt'
31. my %g_allowed_tags; # Tags die voor deze zoekmachine zijn toegestaan
32. my %g_not_searched_tags; # Tags die na de zoekopdracht niet op links zijn onderzocht,
33. # deze worden na de zoekopdracht in een file opgeslagen
34. my %g_extensions; # Na de zoekopdracht bevat deze hash alle file-extensies plus
35. # de paginanummers, deze worden na de zoekopdracht
36. # in een file opgeslagen
37. my %g_not_allowed_tags; # Tags die niet op links worden onderzocht
38. my $g_save_not_searched_extensions_in; # File-naam
39. my $g_save_not_searched_tags_in; # File-naam
40. my $g_save_extensions_in; # File-naam
41. my $g_save_links_in; # File-naam
42. my $g_save_local_links_in; # File-naam
43. my $g_save_deadlinks_in; # File-naam
44. my $g_save_files_in; # Directorie-naam
45. my $g_save_new_hrefs_in; # File-naam
46. my @g_first_files; # Array van file-namen die als eerste moeten
47. # worden geïndexceerd. Indien 'sitemap.txt' bestaat worden
48. # ook daaruit file-namen gelezen.
49. my $g_filename_for_no_filename; # Als in de URL geen file-naam voorkomt dan
50. # wordt dit de naam
51. my $g_root; # Start URL, boven deze link worden geen files gezocht
52. my @g_hrefs; # Array met alle gevonden URL's
53. my %g_hrefs; # Hash met alle gevonden URL's plus de pagina-nummers waarin
54. # deze URL wordt genoemd; $g_href{ "link"}{ "nummer"}
55. my %g_downloads; # Alle ge'download'e files
56. my @g_wfiles; # Array met alle files waarin gegevens worden geschreven
57. my $g_max_file_size; # Maximale file-grootte van een te 'download'en file
58. my $g_max_filename_size; # Maximale grootte van de filenaam
59. my $g_save_to_long_filenames_in; # File-naam
60. my $g_save_newhrefs_in; # Kan tijdens de debug-fase worden gebruikt.
61. ##################### end globals
62. ########### print_array( \@array)
63. sub print_array
64. {
65. foreach ( @{ shift @_})
66. {
67. print " : ". $_. $g_nl;
68. }
69. print $g_nl;
70. }
71. ########### print_hash( \%hash)
72. sub print_hash
73. {
74. my ( $key, $val);
75. my %h= %{ shift( @_)};
76. while( ( $key, $val)= each( %h))
77. {
78. print "$key => $val$g_nl";
79. }
80. }
81. ########### print_links( \%hrefs)
82. ########### makes use of @g_hrefs
83. sub print_links
84. {
85. my ( $href, $key, $val, $i);
86. my %hrefs= %{ shift( @_)};
87. for( $i= 0; $i<= $#g_hrefs; $i++)
88. {
89. $href= $g_hrefs[ $i];
90. print "\nHREF = $href$g_nl";
91. print "IN:$g_nl";
92. while( ( $key, $val)= each( %{ $hrefs{ $href}}))
93. {
94. print $g_hrefs[ $key]. $g_nl;
95. }
96. }
97. }
98. ################# print_header
99. sub print_header
100. {
101. print <<"END";
102.
103.
104.
105.
106. Download a site
107.
108.
109. END
110. }
111. ################ print_tail
112. sub print_tail
113. {
114. print <<"END";
115.
116.
117. END
118. }
119. ################ read_inifile()
120. sub read_inifile
121. {
122. my $fn= shift;
123.
124. open( INI, "<$fn") or die "$fn :$!";
125. while( )
126. {
127. my @parts;
128. my @klad;
129.
130. chomp;
131.
132. ( $_)= split '#';
133. if( $_)
134. {
135. s/\s+//g;
136. if( $_)
137. {
138. @parts= split( '=');
139. if( $parts[ 1])
140. {
141. if( @klad= split( ';', $parts[ 1]))
142. {
143. push( @{ $g_inival{ $parts[ 0]}}, @klad);
144. }
145. else
146. {
147. push( @{ $g_inival{ $parts[ 0]}}, $parts[ 1])
148. }
149. }
150. else
151. {
152. push( @{ $g_inival{ $parts[ 0]}}, '');
153. }
154. }
155. }
156. }
157. close( INI);
158. }
159. ################
160. ################ ini_globals()
161. sub ini_globals
162. {
163. my @klad;
164. ########### put some var's from archipol.ini in globals
165.
166. $g_debug= $g_inival{ 'debug'}->[0];
167.
168. $g_html= $g_inival{ 'html'}->[0];
169. if( $g_html)
170. {
171. use CGI::Carp qw(fatalsToBrowser);
172. print "Content-type: text/HTML\n\n";
173. $g_nl= "
\n";
174. } else{ $g_nl= "\n";}
175. $g_search_engine_name=$g_inival{ 'search_engine_name'}->[0];
176. $g_email= $g_inival{ 'email'}->[0];
177. $g_root= $g_inival{ 'root'}->[ 0];
178. $g_levels_deep= $g_inival{ 'how_deep_will_we_go'}->[0];
179. $g_save_files_in= $g_inival{ 'save_files_in'}->[0];
180. $g_filename_for_no_filename=
181. $g_inival{ 'filename_for_no_filename'}->[0];
182. @g_first_files= @{ $g_inival{ 'first_files'}};
183. $g_max_file_size= $g_inival{ 'max_file_size'}->[0];
184. $g_max_file_size=~ s/[\.\,\s]//g;
185. $g_max_file_size=~ s/k/000/;
186. $g_max_file_size=~ s/M/000000/;
187. $g_max_filename_size= $g_inival{ 'max_filename_size'}->[0];
188. $g_save_to_long_filenames_in=
189. $g_save_files_in. $g_inival{ 'save_to_long_filenames_in'}->[0];
190. $g_save_not_searched_extensions_in=
191. $g_save_files_in. $g_inival{ 'save_not_searched_extensions_in'}->[0];
192. $g_save_not_searched_tags_in=
193. $g_save_files_in. $g_inival{ 'save_not_searched_tags_in'}->[0];
194. $g_save_extensions_in=$g_save_files_in. $g_inival{ 'save_extensions_in'}->[0];
195. $g_save_links_in= $g_save_files_in. $g_inival{ 'save_links_in'}->[0];
196. $g_save_local_links_in=
197. $g_save_files_in. $g_inival{ 'save_local_links_in'}->[0];
198. $g_save_deadlinks_in= $g_save_files_in. $g_inival{ 'save_deadlinks_in'}->[0];
199. $g_save_new_hrefs_in= $g_save_files_in. "NEWHREFS";
200.
201. @g_wfiles=
202. ( $g_save_to_long_filenames_in
203. , $g_save_extensions_in
204. , $g_save_new_hrefs_in
205. , $g_save_deadlinks_in
206. , $g_save_not_searched_extensions_in
207. , $g_save_not_searched_tags_in
208. , $g_save_extensions_in
209. , $g_save_links_in
210. , $g_save_local_links_in
211. );
212. map {
213. @klad= split( ',');
214. $g_search_tags{ $klad[0]}= $klad[1];
215. } @{ $g_inival{ 'search_tags'}};
216.
217. map { $g_do_not_search_extensions{ $_}++;} @{ $g_inival{ 'do_not_search_extensions'}};
218. map { $g_do_not_download_extensions{ $_}++;} @{ $g_inival{ 'do_not_download_extensions'}};
219. map { $g_not_allowed_tags{ $_}++;} @{ $g_inival{ 'not_allowed_tags'}};
220. }
221. ###############
222. ############### create_paths()
223. sub create_paths
224. {
225. map { ###print "filename= '$_'$g_nl";
226. mkpath( dirname( $_));
227. } @g_wfiles;
228. }
229. ###############
230. ############### open_and_close_all_files_for_writing()
231. # TRY TO OPEN ALL THE FILES FOR WRITING
232. sub open_and_close_all_files_for_writing
233. {
234. map { print "Open $_ and delete.$g_nl" if $g_debug;
235. open( OUT, ">$_") or die "$_: $!";
236. close OUT;
237. unlink $_;
238. } @g_wfiles;
239. }
240. ###############
241. ############### ini_hrefs();
242. sub ini_hrefs
243. {
244. my $href;
245. foreach $href ( @g_first_files)
246. {
247. push( @g_hrefs, $href);
248. $g_hrefs{ $href}++;
249. }
250.
251. if( open( SITEMAP, ")
254. {
255. chomp;
256. ( $_)= split '#';
257. s/\s+//g;
258. if( $_)
259. {
260. push( @g_hrefs, $_);
261. $g_hrefs{ $_}++;
262. }
263. }
264. close SITEMAP;
265. }
266. }
267. ###############
268. ############### read_do_not_process();
269. sub read_do_not_process
270. {
271. if( open( DONT, "do_not_process.txt"))
272. {
273. while( )
274. {
275. chomp;
276. ( $_)= split '#';
277. s/\s+//g;
278. if( $_)
279. {
280. $g_do_not_process_files{ $_}++;
281. }
282. }
283. close DONT;
284. }
285. }
286. ###############
287. ###############
288. ##################################################################################
289. ############### MAIN # MAIN # MAIN # MAIN # MAIN # MAIN # MAIN # MAIN # MAIN # MAIN
290. read_inifile( 'archipol.ini');
291. ini_globals();
292. create_paths();
293. print_header() if $g_html;
294. open_and_close_all_files_for_writing(); # Om zeker te zijn dat hier geen problemen optreden
295. # en dat niet pas aan het einde van de download wordt ontdekt.
296. # Het programma maakt gebruik van een array '@g_hrefs' en van een hash '%g_hrefs'
297. # @g_hrefs wordt als stapel gebruikt en %g_hrefs om snel te kunnen detecteren dat een
298. # bepaalde link al is opgeslagen.
299. # Beide worden gevuld vóór de eerste download met '@g_first_files'
300. # en met de links's uit de file 'sitemap.txt' (indien die file bestaat)
301. ini_hrefs();
302. # Sommige files moeten niet worden ge'processed'.
303. read_do_not_process();
304. ########### DO THE SEARCHE AND DOWNLOAD FILES
305. {
306. my $number= 0; # Paginanummer
307. my $ua = new LWP::UserAgent;
308. $ua->agent( $g_search_engine_name. " e-mail:". $g_email);
309. $ua->from( $g_email);
310. $ua->max_size( $g_max_file_size);
311. # Make the parser.
312. my $parser= HTML::LinkExtor->new( );
313. my $i;
314. my $href;
315. my @parts;
316. my $url;
317. my $ext='';
318. my $res;
319. my $base= '';
320. my $klad;
321. my @klad;
322. my $filename;
323. my $full_filename;
324. my $dir;
325. my @links;
326. my $link;
327. my $n_can_not_connect;
328. print "Open $g_save_links_in.$g_nl" if $g_debug;
329. open( LINKS, ">$g_save_links_in") or die "$g_save_links_in: $!";
330. select( ( select( LINKS), $|= 1)[0]);
331. print "Open $g_save_local_links_in.$g_nl" if $g_debug;
332. open( LOCALLINKS, ">$g_save_local_links_in") or die "$g_save_local_links_in: $!";
333. select( ( select( LOCALLINKS), $|= 1)[0]);
334. print "Open $g_save_to_long_filenames_in.$g_nl" if $g_debug;
335. open( TOLONGFILENAMES, ">$g_save_to_long_filenames_in") or die "$g_save_to_long_filenames_in: $!";
336. select( ( select( TOLONGFILENAMES), $|= 1)[0]);
337. if( $g_debug) # Voor debug-doeleinden
338. {
339. open( NEWHREFS, ">$g_save_new_hrefs_in") or die "$g_save_new_hrefs_in: $!";
340. select( ( select( NEWHREFS), $|= 1)[0]);
341. }
342.
343. $|= 1; # Autoflush output
344. while( $number <= $#g_hrefs) # @g_hrefs grows while searching
345. {
346. $href= $g_hrefs[ $number];
347. print LINKS $href. "\n";
348. $url= $g_root. $href;
349. print "PROCESS : $number URL = $url$g_nl";
350. if( $g_do_not_process_files{ $href})
351. {
352. print "DON'T PROCESS$g_nl";
353. $number++;
354. ;;;;next;
355. }
356. ( $filename, $dir)= fileparse( $href);
357. $filename= $filename || $g_filename_for_no_filename;
358. $filename=~ s/([^a-zA-Z0-9\-\_\.])/sprintf( '%%%02X', ord($1))/eg;
359. # Verander niet toegestane tekens in hun hexadeximale waarde voorafgegaan door een '%'-teken
360. $dir=~ s/^\.[\\\/]//;
361. $dir=~ s/([^a-zA-Z0-9\-\_\.\/])/sprintf( '%%%02X', ord($1))/eg;
362. # Verander niet toegestane tekens in hun hexadeximale waarde voorafgegaan door een '%'-teken
363. print LOCALLINKS $dir. $filename. "\n"; # Locale file-naam in de file
364. # Request document and parse it as it arrives
365. my @newhrefs; # clear array
366. my @newtags; # clear array
367. # Set up a callback that collect page links
368. $res = $ua->request(HTTP::Request->new(GET => $url));
369. # Get the document, $res is a pointer to class
370. ##print "RES:$g_nl" if $g_debug;
371. ##print_hash( $res) if $g_debug;
372. print "res{ _msg}= |". $res->{ "_msg"}. "|$g_nl" if $g_debug;
373. if( ( $res->{ "_msg"} eq "OK") or ( lc( $res->{ "_msg"}) eq "partial content"))
374. {
375. $parser->parse( $res->content());
376. my $page= $res->content();
377. #### Save the file
378. print "href= $href$g_nl" if $g_debug;
379. print "FILENAME: $dir$filename\n" if $g_debug;
380.
381. # Probeer het path te creëren
382. # Zo ja, schrijf de file
383. eval
384. {
385. mkpath( $g_save_files_in.$dir);
386. };
387. if( $@)
388. {
389. warn $@;
390. }
391. else
392. {
393. $full_filename= $g_save_files_in. $dir. $filename;
394.
395. if( open( OUT, ">$full_filename"))
396. {
397. print OUT $page;
398. close OUT;
399. }
400. else{ warn "$full_filename: $!";}
401. }
402. {
403. my( $tag, %attr);
404. my( $key, $value);
405. $base = $res->base; # Expand all page URLs to absolute ones
406.
407. foreach $link ( $parser->links)
408. {
409. ($tag, %attr) = @$link;
410. if( $g_search_tags{ $tag})
411. {
412. while ( ( $key, $value)= each %attr)
413. {
414. $value= URI->new_abs( $value, $base);
415. print "tags= $tag - $key - $value$g_nl" if $g_debug;
416. if( $key eq $g_search_tags{ $tag})
417. {
418. ##print "newhrefs=$g_nl" if $g_debug;
419. ##print_array( \@newhrefs) if $g_debug;
420. push( @newhrefs, $value);
421. }
422. else
423. {
424. push( @newtags, "$tag\t$key\t$value");
425. }
426. }
427. }
428. else
429. {
430. while ( ( $key, $value)= each %attr)
431. {
432. $value= URI->new_abs( $value, $base);
433. ##print "tags= $tag - $key - $value$g_nl" if $g_debug;
434. $g_not_searched_tags{ $tag}{ $key}{ $number}++;
435. push( @newtags, "$tag\t$key\t$value");
436. }
437. }
438. }
439. }
440. ### Print them out
441. ##print join($g_nl, @newhrefs) if $g_debug;
442.
443. ##### search for new ref's
444. my @downloads;
445. for( $i= 0; $i<= $#newhrefs; $i++)
446. {
447. my $href= $newhrefs[ $i];
448. ## if it does't start with $g_root then it's from outside
449. if( $href=~ s/^$g_root//i)
450. {
451. if( !( $href=~ /^mailto:/i))
452. {
453. my $last_part;
454. $href=~ s/\/{2,}/\//; # remove double slashes
455. $href=~ s/#(.*)//; # skip everything after '#' including '#'
456. print "href = $href$g_nl" if $g_debug;
457.
458. @parts= split( "/", $href);
459. $ext= '';
460.
461. # Look for the extension of the file; html htm asp
462. # Extensions can be followed by var's; index.html&a=4 index.html#second_part
463. if( $#parts>= 0 and $#parts<= $g_levels_deep)
464. {
465. ##print( "#parts= $#parts$g_nl") if $g_debug;
466. ##print_array( \@parts) if $g_debug;
467. $last_part= $parts[ $#parts];
468. $ext= $last_part;
469. ##print "ext = $ext$g_nl" if $g_debug;
470. $ext=~ s/\?(.*)$//;
471. ##print "ext = $ext$g_nl" if $g_debug;
472. @klad= split( /\./, $ext);
473. ##print "\#klad = $#klad$g_nl" if $g_debug;
474. $href= join( '/', @parts);
475. if( $#klad> 0)
476. {
477. $ext= $klad[ $#klad];
478. ##print "ext = $ext$g_nl" if $g_debug;
479. }
480. else
481. {
482. $ext= '';
483. # $href is een map (directory) tenzij het laatste deel van de url begint met een '?' of '#'
484. # bijv. /dit/is/een/path/?key=val&key2=val2
485. # equivalent met bijv /dit/is/een/path/index.html?key=val&key2=val2
486. if( !( $last_part=~ m/^[\?\#]/))
487. {
488. $href.= '/';
489. }
490. }
491. $ext= lc( $ext);
492. ##if ( $ext) { print "ext = $ext$g_nl" if $g_debug;}
493. ##print "href = $href$g_nl" if $g_debug;
494. }
495. print "ext= $ext$g_nl" if $g_debug;
496. print "href = $href, $number$g_nl" if $g_debug;
497. ## if extension is OK put $href in @g_hrefs en %g_hrefs
498. if( ( !$g_do_not_search_extensions{ $ext} or !$ext)
499. and $href
500. )
501. {
502. if( length( $last_part)<= $g_max_filename_size)
503. {
504. if( !$g_hrefs{ $href})
505. {
506. print NEWHREFS $href. " number=". $number. "\n" if $g_debug;
507. push( @g_hrefs, $href);
508. }
509. print "newref = $href, $number$g_nl" if $g_debug;
510. $g_hrefs{ $href}{ $number}++;
511. }
512. else
513. {
514. print "Size filename to long!!$g_nl";
515. print "::::$last_part$g_nl";
516. print TOLONGFILENAMES "$last_part\n";
517. }
518. }
519. else
520. {
521. $g_not_searched_extensions{ $ext}{ $number}++;
522. push( @downloads, $href);
523. print "NOT ALLOWED EXT for search = $ext$g_nl" if $g_debug;
524. print "IN : $url$g_nl" if $g_debug;
525. print " $g_root$href$g_nl" if $g_debug;
526. }
527. }
528. }
529. }
530.
531. ##### do the same for other new tag's
532. for( $i= 0; $i<= $#newtags; $i++)
533. {
534. my ( $tag, $kind, $href)= split( "\t", $newtags[ $i]);
535. print "$tag $kind $href$g_nl" if $g_debug;
536. if( $href=~ s/^$g_root//i)
537. {
538. ##print "$tag $kind $href$g_nl" if $g_debug;
539. if( !$g_not_allowed_tags{ $tag})
540. {
541. push( @downloads, $href);
542. }
543. }
544. }
545. # search for extra download
546. # zoekt in CCS-tags
547. # bijv.
548. # background: url('/img/flare-middle.jpg');
549. {
550. # Indien in de pagina tekst tussen '{' en '}', dan
551. while( $page=~ m/{(.*?)}/gs)
552. {
553. my $klad= $1;
554. #print $klad;
555. #
556. # Als die tekst bestaat uit 'url'
557. # gevolgd door nul of meer spaties
558. # dan '('
559. # dan nul of meer spaties
560. # dan nul of een keer "'"
561. # dan enige tekst bestaande uit letters [a-zA-Z0-9]
562. # en/of '-' en/of '_' en/of '.' en/of '%' en/of '/'
563. while( $klad=~ m/url *?\( *?\'?([\w\-_.\%\/]*?)\'? *?\)/gsi)
564. {
565. $href= URI->new( $1, $base);
566. if( $href=~ s/^$g_root//i)
567. {
568. ##print "href= $href$g_nl" if $g_debug;
569. push( @downloads, $href);
570. }
571. }
572. }
573. }
574. $i= 0;
575. while( $i<= $#downloads)
576. {
577. my $last_part;
578. $href= $downloads[ $i];
579. if( $g_hrefs{ $href})
580. {
581. $g_hrefs{ $href}{ $number}++;
582. $i++;
583. ;;;;next;
584. }
585. else
586. {
587. $g_hrefs{ $href}{ $number}++;
588. }
589. if( $g_do_not_process_files{ $href})
590. {
591. print "DON'T DOWNLOAD: $href$g_nl";
592. $i++;
593. ;;;;next;
594. }
595. ##print "href = $href$g_nl" if $g_debug;
596. @parts= split( "/", $href);
597. $ext= '';
598. if( $#parts<= $g_levels_deep)
599. {
600. if( $#parts>= 0)
601. {
602. $parts[ $#parts]=~ s/#(.*)//;
603. $last_part= $parts[ $#parts];
604. $ext= $last_part;
605. $ext=~ s/\?(.*)$//;
606. ##print "ext = $ext$g_nl" if $g_debug;
607. @klad= split( /\./, $ext);
608. ##print "\#klad = $#klad$g_nl" if $g_debug;
609. if( $#klad>= 0)
610. {
611. $ext= $klad[ $#klad];
612. ##print "EXT = $ext$g_nl" if $g_debug;
613. }
614. else
615. {
616. $ext= '';
617. }
618. $ext= lc( $ext);
619. ##if ( $ext) { print "ext = $ext$g_nl" if $g_debug;}
620. $href= join( '/', @parts);
621. }
622. print "href = $href, $number$g_nl" if $g_debug;
623. if( !$g_do_not_download_extensions{ $ext}
624. and !$g_do_not_process_files{ $href}
625. )
626. {
627. if( length( $last_part)<= $g_max_filename_size)
628. {
629. if( !$g_downloads{ $href})
630. {
631. my $url= $g_root. $href;
632. ##print "href= $href$g_nl" if $g_debug;
633. print "DOWNLOAD : $url$g_nl";
634. # Get the document, $res is a pointer to class
635. $res = $ua->request(HTTP::Request->new(GET => $url));
636. ##print "res:$g_nl" if $g_debug;
637. ##print_hash( $res) if $g_debug;
638. if( ( $res->{ "_msg"} eq "OK") or ( lc($res->{ "_msg"}) eq "partial content"))
639. {
640. ( $filename, $dir)= fileparse( $href);
641. $filename= $filename || $g_filename_for_no_filename;
642. # Verander niet toegestane tekens in hun hexadeximale waarde voorafgegaan
643. # door een '%'-teken
644. $filename=~ s/([^a-zA-Z0-9\-\_\.])/sprintf( '%%%02X', ord($1))/eg;
645. $dir=~ s/^\.[\\\/]//;
646. # Verander niet toegestane tekens in hun hexadeximale waarde voorafgegaan
647. # door een '%'-teken
648. $dir=~ s/([^a-zA-Z0-9\-\_\.\/])/sprintf( '%%%02X', ord($1))/eg;
649. $full_filename= $g_save_files_in. $dir. $filename;
650. print "href= $href$g_nl" if $g_debug;
651. print "full filename= $full_filename$g_nl" if $g_debug;
652. eval
653. {
654. mkpath( $g_save_files_in.$dir);
655. };
656. if( $@)
657. {
658. warn $@;
659. }
660. else
661. {
662. if( open( OUT, ">$full_filename"))
663. {
664. binmode OUT;
665. print OUT $res->content();
666. close OUT;
667. }
668. else{ warn "$full_filename: $!";}
669. }
670. }
671. else
672. {
673. # Na 60x "Can't connect" de volgende.
674. if( ( $res->{ "_msg"}=~ m/^Can\'t connect to/i) and ( $n_can_not_connect< 60))
675. {
676. $n_can_not_connect++;
677. print $res->{ "_msg"}. $g_nl;
678. sleep 10;
679. ;;;;;;;;;;;;;;next;
680. }
681. else
682. {
683. $n_can_not_connect= 0;
684. print "Link not OK: ". $res->{ "_msg"}. $g_nl;
685. $g_deadlinks{ $href}{ $res->{ "_msg"}}++;
686. }
687. }
688. }
689. $g_downloads{ $href}++;
690. $g_extensions{ $ext}{ $number}++;
691. }
692. else
693. {
694. print "Size filename to long!!$g_nl";
695. print "::::$last_part$g_nl";
696. print TOLONGFILENAMES "$last_part\n";
697. }
698. }
699. }
700. $i++;
701. }
702. }
703. else ## if $res->{ "_msg"} ne "OK"
704. {
705. # Na 60x "Can't connect" de volgende.
706. if( $res->{ "_msg"}=~ m/^Can\'t connect to/i and ( $n_can_not_connect< 60))
707. {
708. $n_can_not_connect++;
709. print $res->{ "_msg"}. $g_nl;
710. sleep 10;
711. ;;;;;;next;
712. }
713. else
714. {
715. $n_can_not_connect= 0;
716. print "Link not OK: ". $res->{ "_msg"}. $g_nl;
717. $g_deadlinks{ $href}{ $res->{ "_msg"}}++;
718. }
719. }
720. $number++;
721. }
722. close( LINKS);
723. close( LOCALLINKS);
724. close( TOLONGFILENAMES);
725. close( NEWHREFS);
726. }
727. if( %g_deadlinks)
728. {
729. print "Write dead links$g_nl";
730. open( DEADLINKS, ">$g_save_deadlinks_in") or die "$g_save_deadlinks_in: $!";
731. {
732. my $i;
733. my $number;
734. my $deadlink;
735. my %msg;
736. my $msg;
737. my %value;
738. my $value;
739.
740. while ( ( $deadlink, $msg)= each %g_deadlinks)
741. {
742. print DEADLINKS "DEAD LINK $g_root$deadlink\n";
743. %msg= %$msg;
744. while( ( $msg, $value)= each %msg)
745. {
746. print DEADLINKS "MESSAGE = $msg\n";
747. }
748. print DEADLINKS "IN\t";
749. foreach $number ( keys %{ $g_hrefs{ $deadlink}})
750. {
751. print DEADLINKS $g_root. $g_hrefs[ $number]. "\n\t";
752. }
753. print DEADLINKS "\n";
754. }
755. }
756. close( DEADLINKS);
757. }
758. if( %g_not_searched_tags)
759. {
760. my $tag;
761. my $p_hash;
762. my $p_hash2;
763. my $number;
764. my $key;
765. my $n;
766.
767. print "Write not searched tags$g_nl";
768. open( NOTSEARCHEDTAGS, ">$g_save_not_searched_tags_in") or die "$g_save_not_searched_tags_in: $!";
769.
770. while( ( $tag, $p_hash)= each( %g_not_searched_tags))
771. {
772. while( ( $key, $p_hash2)= each( %$p_hash))
773. {
774. print NOTSEARCHEDTAGS $tag. "\t". $key. "\n";
775. while( ( $number, $n)= each( %$p_hash2))
776. {
777. print NOTSEARCHEDTAGS "\t: $g_hrefs[ $number]\n";
778. }
779. }
780. }
781. close NOTSEARCHEDTAGS;
782. }
783. if( %g_not_searched_extensions)
784. {
785. print "Write not searched links$g_nl";
786. open( NOTSEARCHED, ">$g_save_not_searched_extensions_in") or die "$g_save_not_searched_extensions_in: $!";
787. {
788. my $ext;
789. my $p_hash;
790. my $number;
791. my $n;
792.
793. while ( ( $ext, $p_hash)= each( %g_not_searched_extensions))
794. {
795. print NOTSEARCHED $ext. "\n";
796. while( ( $number, $n)= each( %$p_hash))
797. {
798. print NOTSEARCHED "\t: $g_hrefs[ $number]\n";
799. }
800. }
801. }
802. close NOTSEARCHED;
803. }
804. if( %g_extensions)
805. {
806. print "Write all file extensions$g_nl";
807. open( DOWNLOADED, ">$g_save_extensions_in") or die "$g_save_extensions_in: $!";
808. {
809. my $ext;
810. my $p_hash;
811. my $number;
812. my $n;
813.
814. while ( ( $ext, $p_hash)= each( %g_extensions))
815. {
816. print DOWNLOADED $ext. "\n";
817. while( ( $number, $n)= each( %$p_hash))
818. {
819. print DOWNLOADED "\t: $g_hrefs[ $number]\n";
820. }
821. }
822. }
823. close DOWNLOADED;
824. }
825. print_tail() if $g_html;