1. #!/usr/bin/perl -w 2. # archipol.cgi 3. # 20010716 4. # H.C.G. Druiven 5. # h.c.g.druiven@ub.rug.nl 6. use LWP::UserAgent; 7. use HTML::LinkExtor; 8. use URI; 9. use File::Basename; 10. use File::Path; 11. use strict; 12. no strict( 'refs'); # In de hashes kunnen ook referenties met alleen cijfers voorkomen 13. ###################### globals 14. my $g_debug; # Debug mode 15. my $g_html; # HTML mode, genereer output als HTML-pagina 16. my $g_nl; # in HTML-mode $g_nl= "
\n", anders $g_nl="\n"; 17. my $g_search_engine_name; # De naam van deze zoekmachine 18. my $g_email; # Het e-mail-adres van de eigenaar van deze zoekmachine 19. my %g_deadlinks; # $g_deadlinks{ $link}{ $message} 20. # Hash met alle dode links met hun fout-melding 21. my $g_levels_deep; # Hoe diep mag in de directorie-structuur worden gzocht 22. my %g_inival; # Hierin alle variabelen uit 'archipol.ini' 23. my %g_search_tags; # Alle tags met namen van referenties waarnaar verder gezocht 24. # moet worden 25. my %g_do_not_search_extensions; # Files met deze extenties worden niet op links onderzocht 26. my %g_not_searched_extensions; # Na de zoekopdracht bevat deze hash alle extensies van files 27. # die niet op links zijn onderzocht 28. my %g_do_not_download_extensions; # Files met deze extensies worden niet ge'download' 29. my %g_do_not_process_files; # Files die niet op links moeten worden onderzocht 30. # Deze namen worden gelezen uit 'do_not_process.txt' 31. my %g_allowed_tags; # Tags die voor deze zoekmachine zijn toegestaan 32. my %g_not_searched_tags; # Tags die na de zoekopdracht niet op links zijn onderzocht, 33. # deze worden na de zoekopdracht in een file opgeslagen 34. my %g_extensions; # Na de zoekopdracht bevat deze hash alle file-extensies plus 35. # de paginanummers, deze worden na de zoekopdracht 36. # in een file opgeslagen 37. my %g_not_allowed_tags; # Tags die niet op links worden onderzocht 38. my $g_save_not_searched_extensions_in; # File-naam 39. my $g_save_not_searched_tags_in; # File-naam 40. my $g_save_extensions_in; # File-naam 41. my $g_save_links_in; # File-naam 42. my $g_save_local_links_in; # File-naam 43. my $g_save_deadlinks_in; # File-naam 44. my $g_save_files_in; # Directorie-naam 45. my $g_save_new_hrefs_in; # File-naam 46. my @g_first_files; # Array van file-namen die als eerste moeten 47. # worden geïndexceerd. Indien 'sitemap.txt' bestaat worden 48. # ook daaruit file-namen gelezen. 49. my $g_filename_for_no_filename; # Als in de URL geen file-naam voorkomt dan 50. # wordt dit de naam 51. my $g_root; # Start URL, boven deze link worden geen files gezocht 52. my @g_hrefs; # Array met alle gevonden URL's 53. my %g_hrefs; # Hash met alle gevonden URL's plus de pagina-nummers waarin 54. # deze URL wordt genoemd; $g_href{ "link"}{ "nummer"} 55. my %g_downloads; # Alle ge'download'e files 56. my @g_wfiles; # Array met alle files waarin gegevens worden geschreven 57. my $g_max_file_size; # Maximale file-grootte van een te 'download'en file 58. my $g_max_filename_size; # Maximale grootte van de filenaam 59. my $g_save_to_long_filenames_in; # File-naam 60. my $g_save_newhrefs_in; # Kan tijdens de debug-fase worden gebruikt. 61. ##################### end globals 62. ########### print_array( \@array) 63. sub print_array 64. { 65. foreach ( @{ shift @_}) 66. { 67. print " : ". $_. $g_nl; 68. } 69. print $g_nl; 70. } 71. ########### print_hash( \%hash) 72. sub print_hash 73. { 74. my ( $key, $val); 75. my %h= %{ shift( @_)}; 76. while( ( $key, $val)= each( %h)) 77. { 78. print "$key => $val$g_nl"; 79. } 80. } 81. ########### print_links( \%hrefs) 82. ########### makes use of @g_hrefs 83. sub print_links 84. { 85. my ( $href, $key, $val, $i); 86. my %hrefs= %{ shift( @_)}; 87. for( $i= 0; $i<= $#g_hrefs; $i++) 88. { 89. $href= $g_hrefs[ $i]; 90. print "\nHREF = $href$g_nl"; 91. print "IN:$g_nl"; 92. while( ( $key, $val)= each( %{ $hrefs{ $href}})) 93. { 94. print $g_hrefs[ $key]. $g_nl; 95. } 96. } 97. } 98. ################# print_header 99. sub print_header 100. { 101. print <<"END"; 102. 103. 104. 105. 106. Download a site 107. 108. 109. END 110. } 111. ################ print_tail 112. sub print_tail 113. { 114. print <<"END"; 115. 116. 117. END 118. } 119. ################ read_inifile() 120. sub read_inifile 121. { 122. my $fn= shift; 123. 124. open( INI, "<$fn") or die "$fn :$!"; 125. while( ) 126. { 127. my @parts; 128. my @klad; 129. 130. chomp; 131. 132. ( $_)= split '#'; 133. if( $_) 134. { 135. s/\s+//g; 136. if( $_) 137. { 138. @parts= split( '='); 139. if( $parts[ 1]) 140. { 141. if( @klad= split( ';', $parts[ 1])) 142. { 143. push( @{ $g_inival{ $parts[ 0]}}, @klad); 144. } 145. else 146. { 147. push( @{ $g_inival{ $parts[ 0]}}, $parts[ 1]) 148. } 149. } 150. else 151. { 152. push( @{ $g_inival{ $parts[ 0]}}, ''); 153. } 154. } 155. } 156. } 157. close( INI); 158. } 159. ################ 160. ################ ini_globals() 161. sub ini_globals 162. { 163. my @klad; 164. ########### put some var's from archipol.ini in globals 165. 166. $g_debug= $g_inival{ 'debug'}->[0]; 167. 168. $g_html= $g_inival{ 'html'}->[0]; 169. if( $g_html) 170. { 171. use CGI::Carp qw(fatalsToBrowser); 172. print "Content-type: text/HTML\n\n"; 173. $g_nl= "
\n"; 174. } else{ $g_nl= "\n";} 175. $g_search_engine_name=$g_inival{ 'search_engine_name'}->[0]; 176. $g_email= $g_inival{ 'email'}->[0]; 177. $g_root= $g_inival{ 'root'}->[ 0]; 178. $g_levels_deep= $g_inival{ 'how_deep_will_we_go'}->[0]; 179. $g_save_files_in= $g_inival{ 'save_files_in'}->[0]; 180. $g_filename_for_no_filename= 181. $g_inival{ 'filename_for_no_filename'}->[0]; 182. @g_first_files= @{ $g_inival{ 'first_files'}}; 183. $g_max_file_size= $g_inival{ 'max_file_size'}->[0]; 184. $g_max_file_size=~ s/[\.\,\s]//g; 185. $g_max_file_size=~ s/k/000/; 186. $g_max_file_size=~ s/M/000000/; 187. $g_max_filename_size= $g_inival{ 'max_filename_size'}->[0]; 188. $g_save_to_long_filenames_in= 189. $g_save_files_in. $g_inival{ 'save_to_long_filenames_in'}->[0]; 190. $g_save_not_searched_extensions_in= 191. $g_save_files_in. $g_inival{ 'save_not_searched_extensions_in'}->[0]; 192. $g_save_not_searched_tags_in= 193. $g_save_files_in. $g_inival{ 'save_not_searched_tags_in'}->[0]; 194. $g_save_extensions_in=$g_save_files_in. $g_inival{ 'save_extensions_in'}->[0]; 195. $g_save_links_in= $g_save_files_in. $g_inival{ 'save_links_in'}->[0]; 196. $g_save_local_links_in= 197. $g_save_files_in. $g_inival{ 'save_local_links_in'}->[0]; 198. $g_save_deadlinks_in= $g_save_files_in. $g_inival{ 'save_deadlinks_in'}->[0]; 199. $g_save_new_hrefs_in= $g_save_files_in. "NEWHREFS"; 200. 201. @g_wfiles= 202. ( $g_save_to_long_filenames_in 203. , $g_save_extensions_in 204. , $g_save_new_hrefs_in 205. , $g_save_deadlinks_in 206. , $g_save_not_searched_extensions_in 207. , $g_save_not_searched_tags_in 208. , $g_save_extensions_in 209. , $g_save_links_in 210. , $g_save_local_links_in 211. ); 212. map { 213. @klad= split( ','); 214. $g_search_tags{ $klad[0]}= $klad[1]; 215. } @{ $g_inival{ 'search_tags'}}; 216. 217. map { $g_do_not_search_extensions{ $_}++;} @{ $g_inival{ 'do_not_search_extensions'}}; 218. map { $g_do_not_download_extensions{ $_}++;} @{ $g_inival{ 'do_not_download_extensions'}}; 219. map { $g_not_allowed_tags{ $_}++;} @{ $g_inival{ 'not_allowed_tags'}}; 220. } 221. ############### 222. ############### create_paths() 223. sub create_paths 224. { 225. map { ###print "filename= '$_'$g_nl"; 226. mkpath( dirname( $_)); 227. } @g_wfiles; 228. } 229. ############### 230. ############### open_and_close_all_files_for_writing() 231. # TRY TO OPEN ALL THE FILES FOR WRITING 232. sub open_and_close_all_files_for_writing 233. { 234. map { print "Open $_ and delete.$g_nl" if $g_debug; 235. open( OUT, ">$_") or die "$_: $!"; 236. close OUT; 237. unlink $_; 238. } @g_wfiles; 239. } 240. ############### 241. ############### ini_hrefs(); 242. sub ini_hrefs 243. { 244. my $href; 245. foreach $href ( @g_first_files) 246. { 247. push( @g_hrefs, $href); 248. $g_hrefs{ $href}++; 249. } 250. 251. if( open( SITEMAP, ") 254. { 255. chomp; 256. ( $_)= split '#'; 257. s/\s+//g; 258. if( $_) 259. { 260. push( @g_hrefs, $_); 261. $g_hrefs{ $_}++; 262. } 263. } 264. close SITEMAP; 265. } 266. } 267. ############### 268. ############### read_do_not_process(); 269. sub read_do_not_process 270. { 271. if( open( DONT, "do_not_process.txt")) 272. { 273. while( ) 274. { 275. chomp; 276. ( $_)= split '#'; 277. s/\s+//g; 278. if( $_) 279. { 280. $g_do_not_process_files{ $_}++; 281. } 282. } 283. close DONT; 284. } 285. } 286. ############### 287. ############### 288. ################################################################################## 289. ############### MAIN # MAIN # MAIN # MAIN # MAIN # MAIN # MAIN # MAIN # MAIN # MAIN 290. read_inifile( 'archipol.ini'); 291. ini_globals(); 292. create_paths(); 293. print_header() if $g_html; 294. open_and_close_all_files_for_writing(); # Om zeker te zijn dat hier geen problemen optreden 295. # en dat niet pas aan het einde van de download wordt ontdekt. 296. # Het programma maakt gebruik van een array '@g_hrefs' en van een hash '%g_hrefs' 297. # @g_hrefs wordt als stapel gebruikt en %g_hrefs om snel te kunnen detecteren dat een 298. # bepaalde link al is opgeslagen. 299. # Beide worden gevuld vóór de eerste download met '@g_first_files' 300. # en met de links's uit de file 'sitemap.txt' (indien die file bestaat) 301. ini_hrefs(); 302. # Sommige files moeten niet worden ge'processed'. 303. read_do_not_process(); 304. ########### DO THE SEARCHE AND DOWNLOAD FILES 305. { 306. my $number= 0; # Paginanummer 307. my $ua = new LWP::UserAgent; 308. $ua->agent( $g_search_engine_name. " e-mail:". $g_email); 309. $ua->from( $g_email); 310. $ua->max_size( $g_max_file_size); 311. # Make the parser. 312. my $parser= HTML::LinkExtor->new( ); 313. my $i; 314. my $href; 315. my @parts; 316. my $url; 317. my $ext=''; 318. my $res; 319. my $base= ''; 320. my $klad; 321. my @klad; 322. my $filename; 323. my $full_filename; 324. my $dir; 325. my @links; 326. my $link; 327. my $n_can_not_connect; 328. print "Open $g_save_links_in.$g_nl" if $g_debug; 329. open( LINKS, ">$g_save_links_in") or die "$g_save_links_in: $!"; 330. select( ( select( LINKS), $|= 1)[0]); 331. print "Open $g_save_local_links_in.$g_nl" if $g_debug; 332. open( LOCALLINKS, ">$g_save_local_links_in") or die "$g_save_local_links_in: $!"; 333. select( ( select( LOCALLINKS), $|= 1)[0]); 334. print "Open $g_save_to_long_filenames_in.$g_nl" if $g_debug; 335. open( TOLONGFILENAMES, ">$g_save_to_long_filenames_in") or die "$g_save_to_long_filenames_in: $!"; 336. select( ( select( TOLONGFILENAMES), $|= 1)[0]); 337. if( $g_debug) # Voor debug-doeleinden 338. { 339. open( NEWHREFS, ">$g_save_new_hrefs_in") or die "$g_save_new_hrefs_in: $!"; 340. select( ( select( NEWHREFS), $|= 1)[0]); 341. } 342. 343. $|= 1; # Autoflush output 344. while( $number <= $#g_hrefs) # @g_hrefs grows while searching 345. { 346. $href= $g_hrefs[ $number]; 347. print LINKS $href. "\n"; 348. $url= $g_root. $href; 349. print "PROCESS : $number URL = $url$g_nl"; 350. if( $g_do_not_process_files{ $href}) 351. { 352. print "DON'T PROCESS$g_nl"; 353. $number++; 354. ;;;;next; 355. } 356. ( $filename, $dir)= fileparse( $href); 357. $filename= $filename || $g_filename_for_no_filename; 358. $filename=~ s/([^a-zA-Z0-9\-\_\.])/sprintf( '%%%02X', ord($1))/eg; 359. # Verander niet toegestane tekens in hun hexadeximale waarde voorafgegaan door een '%'-teken 360. $dir=~ s/^\.[\\\/]//; 361. $dir=~ s/([^a-zA-Z0-9\-\_\.\/])/sprintf( '%%%02X', ord($1))/eg; 362. # Verander niet toegestane tekens in hun hexadeximale waarde voorafgegaan door een '%'-teken 363. print LOCALLINKS $dir. $filename. "\n"; # Locale file-naam in de file 364. # Request document and parse it as it arrives 365. my @newhrefs; # clear array 366. my @newtags; # clear array 367. # Set up a callback that collect page links 368. $res = $ua->request(HTTP::Request->new(GET => $url)); 369. # Get the document, $res is a pointer to class 370. ##print "RES:$g_nl" if $g_debug; 371. ##print_hash( $res) if $g_debug; 372. print "res{ _msg}= |". $res->{ "_msg"}. "|$g_nl" if $g_debug; 373. if( ( $res->{ "_msg"} eq "OK") or ( lc( $res->{ "_msg"}) eq "partial content")) 374. { 375. $parser->parse( $res->content()); 376. my $page= $res->content(); 377. #### Save the file 378. print "href= $href$g_nl" if $g_debug; 379. print "FILENAME: $dir$filename\n" if $g_debug; 380. 381. # Probeer het path te creëren 382. # Zo ja, schrijf de file 383. eval 384. { 385. mkpath( $g_save_files_in.$dir); 386. }; 387. if( $@) 388. { 389. warn $@; 390. } 391. else 392. { 393. $full_filename= $g_save_files_in. $dir. $filename; 394. 395. if( open( OUT, ">$full_filename")) 396. { 397. print OUT $page; 398. close OUT; 399. } 400. else{ warn "$full_filename: $!";} 401. } 402. { 403. my( $tag, %attr); 404. my( $key, $value); 405. $base = $res->base; # Expand all page URLs to absolute ones 406. 407. foreach $link ( $parser->links) 408. { 409. ($tag, %attr) = @$link; 410. if( $g_search_tags{ $tag}) 411. { 412. while ( ( $key, $value)= each %attr) 413. { 414. $value= URI->new_abs( $value, $base); 415. print "tags= $tag - $key - $value$g_nl" if $g_debug; 416. if( $key eq $g_search_tags{ $tag}) 417. { 418. ##print "newhrefs=$g_nl" if $g_debug; 419. ##print_array( \@newhrefs) if $g_debug; 420. push( @newhrefs, $value); 421. } 422. else 423. { 424. push( @newtags, "$tag\t$key\t$value"); 425. } 426. } 427. } 428. else 429. { 430. while ( ( $key, $value)= each %attr) 431. { 432. $value= URI->new_abs( $value, $base); 433. ##print "tags= $tag - $key - $value$g_nl" if $g_debug; 434. $g_not_searched_tags{ $tag}{ $key}{ $number}++; 435. push( @newtags, "$tag\t$key\t$value"); 436. } 437. } 438. } 439. } 440. ### Print them out 441. ##print join($g_nl, @newhrefs) if $g_debug; 442. 443. ##### search for new ref's 444. my @downloads; 445. for( $i= 0; $i<= $#newhrefs; $i++) 446. { 447. my $href= $newhrefs[ $i]; 448. ## if it does't start with $g_root then it's from outside 449. if( $href=~ s/^$g_root//i) 450. { 451. if( !( $href=~ /^mailto:/i)) 452. { 453. my $last_part; 454. $href=~ s/\/{2,}/\//; # remove double slashes 455. $href=~ s/#(.*)//; # skip everything after '#' including '#' 456. print "href = $href$g_nl" if $g_debug; 457. 458. @parts= split( "/", $href); 459. $ext= ''; 460. 461. # Look for the extension of the file; html htm asp 462. # Extensions can be followed by var's; index.html&a=4 index.html#second_part 463. if( $#parts>= 0 and $#parts<= $g_levels_deep) 464. { 465. ##print( "#parts= $#parts$g_nl") if $g_debug; 466. ##print_array( \@parts) if $g_debug; 467. $last_part= $parts[ $#parts]; 468. $ext= $last_part; 469. ##print "ext = $ext$g_nl" if $g_debug; 470. $ext=~ s/\?(.*)$//; 471. ##print "ext = $ext$g_nl" if $g_debug; 472. @klad= split( /\./, $ext); 473. ##print "\#klad = $#klad$g_nl" if $g_debug; 474. $href= join( '/', @parts); 475. if( $#klad> 0) 476. { 477. $ext= $klad[ $#klad]; 478. ##print "ext = $ext$g_nl" if $g_debug; 479. } 480. else 481. { 482. $ext= ''; 483. # $href is een map (directory) tenzij het laatste deel van de url begint met een '?' of '#' 484. # bijv. /dit/is/een/path/?key=val&key2=val2 485. # equivalent met bijv /dit/is/een/path/index.html?key=val&key2=val2 486. if( !( $last_part=~ m/^[\?\#]/)) 487. { 488. $href.= '/'; 489. } 490. } 491. $ext= lc( $ext); 492. ##if ( $ext) { print "ext = $ext$g_nl" if $g_debug;} 493. ##print "href = $href$g_nl" if $g_debug; 494. } 495. print "ext= $ext$g_nl" if $g_debug; 496. print "href = $href, $number$g_nl" if $g_debug; 497. ## if extension is OK put $href in @g_hrefs en %g_hrefs 498. if( ( !$g_do_not_search_extensions{ $ext} or !$ext) 499. and $href 500. ) 501. { 502. if( length( $last_part)<= $g_max_filename_size) 503. { 504. if( !$g_hrefs{ $href}) 505. { 506. print NEWHREFS $href. " number=". $number. "\n" if $g_debug; 507. push( @g_hrefs, $href); 508. } 509. print "newref = $href, $number$g_nl" if $g_debug; 510. $g_hrefs{ $href}{ $number}++; 511. } 512. else 513. { 514. print "Size filename to long!!$g_nl"; 515. print "::::$last_part$g_nl"; 516. print TOLONGFILENAMES "$last_part\n"; 517. } 518. } 519. else 520. { 521. $g_not_searched_extensions{ $ext}{ $number}++; 522. push( @downloads, $href); 523. print "NOT ALLOWED EXT for search = $ext$g_nl" if $g_debug; 524. print "IN : $url$g_nl" if $g_debug; 525. print " $g_root$href$g_nl" if $g_debug; 526. } 527. } 528. } 529. } 530. 531. ##### do the same for other new tag's 532. for( $i= 0; $i<= $#newtags; $i++) 533. { 534. my ( $tag, $kind, $href)= split( "\t", $newtags[ $i]); 535. print "$tag $kind $href$g_nl" if $g_debug; 536. if( $href=~ s/^$g_root//i) 537. { 538. ##print "$tag $kind $href$g_nl" if $g_debug; 539. if( !$g_not_allowed_tags{ $tag}) 540. { 541. push( @downloads, $href); 542. } 543. } 544. } 545. # search for extra download 546. # zoekt in CCS-tags 547. # bijv. 548. # background: url('/img/flare-middle.jpg'); 549. { 550. # Indien in de pagina tekst tussen '{' en '}', dan 551. while( $page=~ m/{(.*?)}/gs) 552. { 553. my $klad= $1; 554. #print $klad; 555. # 556. # Als die tekst bestaat uit 'url' 557. # gevolgd door nul of meer spaties 558. # dan '(' 559. # dan nul of meer spaties 560. # dan nul of een keer "'" 561. # dan enige tekst bestaande uit letters [a-zA-Z0-9] 562. # en/of '-' en/of '_' en/of '.' en/of '%' en/of '/' 563. while( $klad=~ m/url *?\( *?\'?([\w\-_.\%\/]*?)\'? *?\)/gsi) 564. { 565. $href= URI->new( $1, $base); 566. if( $href=~ s/^$g_root//i) 567. { 568. ##print "href= $href$g_nl" if $g_debug; 569. push( @downloads, $href); 570. } 571. } 572. } 573. } 574. $i= 0; 575. while( $i<= $#downloads) 576. { 577. my $last_part; 578. $href= $downloads[ $i]; 579. if( $g_hrefs{ $href}) 580. { 581. $g_hrefs{ $href}{ $number}++; 582. $i++; 583. ;;;;next; 584. } 585. else 586. { 587. $g_hrefs{ $href}{ $number}++; 588. } 589. if( $g_do_not_process_files{ $href}) 590. { 591. print "DON'T DOWNLOAD: $href$g_nl"; 592. $i++; 593. ;;;;next; 594. } 595. ##print "href = $href$g_nl" if $g_debug; 596. @parts= split( "/", $href); 597. $ext= ''; 598. if( $#parts<= $g_levels_deep) 599. { 600. if( $#parts>= 0) 601. { 602. $parts[ $#parts]=~ s/#(.*)//; 603. $last_part= $parts[ $#parts]; 604. $ext= $last_part; 605. $ext=~ s/\?(.*)$//; 606. ##print "ext = $ext$g_nl" if $g_debug; 607. @klad= split( /\./, $ext); 608. ##print "\#klad = $#klad$g_nl" if $g_debug; 609. if( $#klad>= 0) 610. { 611. $ext= $klad[ $#klad]; 612. ##print "EXT = $ext$g_nl" if $g_debug; 613. } 614. else 615. { 616. $ext= ''; 617. } 618. $ext= lc( $ext); 619. ##if ( $ext) { print "ext = $ext$g_nl" if $g_debug;} 620. $href= join( '/', @parts); 621. } 622. print "href = $href, $number$g_nl" if $g_debug; 623. if( !$g_do_not_download_extensions{ $ext} 624. and !$g_do_not_process_files{ $href} 625. ) 626. { 627. if( length( $last_part)<= $g_max_filename_size) 628. { 629. if( !$g_downloads{ $href}) 630. { 631. my $url= $g_root. $href; 632. ##print "href= $href$g_nl" if $g_debug; 633. print "DOWNLOAD : $url$g_nl"; 634. # Get the document, $res is a pointer to class 635. $res = $ua->request(HTTP::Request->new(GET => $url)); 636. ##print "res:$g_nl" if $g_debug; 637. ##print_hash( $res) if $g_debug; 638. if( ( $res->{ "_msg"} eq "OK") or ( lc($res->{ "_msg"}) eq "partial content")) 639. { 640. ( $filename, $dir)= fileparse( $href); 641. $filename= $filename || $g_filename_for_no_filename; 642. # Verander niet toegestane tekens in hun hexadeximale waarde voorafgegaan 643. # door een '%'-teken 644. $filename=~ s/([^a-zA-Z0-9\-\_\.])/sprintf( '%%%02X', ord($1))/eg; 645. $dir=~ s/^\.[\\\/]//; 646. # Verander niet toegestane tekens in hun hexadeximale waarde voorafgegaan 647. # door een '%'-teken 648. $dir=~ s/([^a-zA-Z0-9\-\_\.\/])/sprintf( '%%%02X', ord($1))/eg; 649. $full_filename= $g_save_files_in. $dir. $filename; 650. print "href= $href$g_nl" if $g_debug; 651. print "full filename= $full_filename$g_nl" if $g_debug; 652. eval 653. { 654. mkpath( $g_save_files_in.$dir); 655. }; 656. if( $@) 657. { 658. warn $@; 659. } 660. else 661. { 662. if( open( OUT, ">$full_filename")) 663. { 664. binmode OUT; 665. print OUT $res->content(); 666. close OUT; 667. } 668. else{ warn "$full_filename: $!";} 669. } 670. } 671. else 672. { 673. # Na 60x "Can't connect" de volgende. 674. if( ( $res->{ "_msg"}=~ m/^Can\'t connect to/i) and ( $n_can_not_connect< 60)) 675. { 676. $n_can_not_connect++; 677. print $res->{ "_msg"}. $g_nl; 678. sleep 10; 679. ;;;;;;;;;;;;;;next; 680. } 681. else 682. { 683. $n_can_not_connect= 0; 684. print "Link not OK: ". $res->{ "_msg"}. $g_nl; 685. $g_deadlinks{ $href}{ $res->{ "_msg"}}++; 686. } 687. } 688. } 689. $g_downloads{ $href}++; 690. $g_extensions{ $ext}{ $number}++; 691. } 692. else 693. { 694. print "Size filename to long!!$g_nl"; 695. print "::::$last_part$g_nl"; 696. print TOLONGFILENAMES "$last_part\n"; 697. } 698. } 699. } 700. $i++; 701. } 702. } 703. else ## if $res->{ "_msg"} ne "OK" 704. { 705. # Na 60x "Can't connect" de volgende. 706. if( $res->{ "_msg"}=~ m/^Can\'t connect to/i and ( $n_can_not_connect< 60)) 707. { 708. $n_can_not_connect++; 709. print $res->{ "_msg"}. $g_nl; 710. sleep 10; 711. ;;;;;;next; 712. } 713. else 714. { 715. $n_can_not_connect= 0; 716. print "Link not OK: ". $res->{ "_msg"}. $g_nl; 717. $g_deadlinks{ $href}{ $res->{ "_msg"}}++; 718. } 719. } 720. $number++; 721. } 722. close( LINKS); 723. close( LOCALLINKS); 724. close( TOLONGFILENAMES); 725. close( NEWHREFS); 726. } 727. if( %g_deadlinks) 728. { 729. print "Write dead links$g_nl"; 730. open( DEADLINKS, ">$g_save_deadlinks_in") or die "$g_save_deadlinks_in: $!"; 731. { 732. my $i; 733. my $number; 734. my $deadlink; 735. my %msg; 736. my $msg; 737. my %value; 738. my $value; 739. 740. while ( ( $deadlink, $msg)= each %g_deadlinks) 741. { 742. print DEADLINKS "DEAD LINK $g_root$deadlink\n"; 743. %msg= %$msg; 744. while( ( $msg, $value)= each %msg) 745. { 746. print DEADLINKS "MESSAGE = $msg\n"; 747. } 748. print DEADLINKS "IN\t"; 749. foreach $number ( keys %{ $g_hrefs{ $deadlink}}) 750. { 751. print DEADLINKS $g_root. $g_hrefs[ $number]. "\n\t"; 752. } 753. print DEADLINKS "\n"; 754. } 755. } 756. close( DEADLINKS); 757. } 758. if( %g_not_searched_tags) 759. { 760. my $tag; 761. my $p_hash; 762. my $p_hash2; 763. my $number; 764. my $key; 765. my $n; 766. 767. print "Write not searched tags$g_nl"; 768. open( NOTSEARCHEDTAGS, ">$g_save_not_searched_tags_in") or die "$g_save_not_searched_tags_in: $!"; 769. 770. while( ( $tag, $p_hash)= each( %g_not_searched_tags)) 771. { 772. while( ( $key, $p_hash2)= each( %$p_hash)) 773. { 774. print NOTSEARCHEDTAGS $tag. "\t". $key. "\n"; 775. while( ( $number, $n)= each( %$p_hash2)) 776. { 777. print NOTSEARCHEDTAGS "\t: $g_hrefs[ $number]\n"; 778. } 779. } 780. } 781. close NOTSEARCHEDTAGS; 782. } 783. if( %g_not_searched_extensions) 784. { 785. print "Write not searched links$g_nl"; 786. open( NOTSEARCHED, ">$g_save_not_searched_extensions_in") or die "$g_save_not_searched_extensions_in: $!"; 787. { 788. my $ext; 789. my $p_hash; 790. my $number; 791. my $n; 792. 793. while ( ( $ext, $p_hash)= each( %g_not_searched_extensions)) 794. { 795. print NOTSEARCHED $ext. "\n"; 796. while( ( $number, $n)= each( %$p_hash)) 797. { 798. print NOTSEARCHED "\t: $g_hrefs[ $number]\n"; 799. } 800. } 801. } 802. close NOTSEARCHED; 803. } 804. if( %g_extensions) 805. { 806. print "Write all file extensions$g_nl"; 807. open( DOWNLOADED, ">$g_save_extensions_in") or die "$g_save_extensions_in: $!"; 808. { 809. my $ext; 810. my $p_hash; 811. my $number; 812. my $n; 813. 814. while ( ( $ext, $p_hash)= each( %g_extensions)) 815. { 816. print DOWNLOADED $ext. "\n"; 817. while( ( $number, $n)= each( %$p_hash)) 818. { 819. print DOWNLOADED "\t: $g_hrefs[ $number]\n"; 820. } 821. } 822. } 823. close DOWNLOADED; 824. } 825. print_tail() if $g_html;