Simply save this script
sitemap-to-csv.pl
to your computer. You will need Perl to run it.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use LWP::Simple; | |
use Data::Dumper; | |
use XML::Simple; | |
# Download a sitemap and parse it into a Perl data structure.
#
# Arguments:
#   $url - address of the sitemap XML document.
#
# Returns whatever XML::Simple's XMLin() produces for the downloaded document.
# Dies with "Sitemap not found at $url." when the download fails or the
# document is empty.
sub ReadXml {
    my ($url) = @_;
    my $content;
    if ($url =~ /https/) {
        my $browser  = LWP::UserAgent->new;
        my $response = $browser->get($url);
        # An error response (404, 500, ...) still carries a body, so only
        # accept the content of a successful request; otherwise the error
        # page would be handed to the XML parser as if it were the sitemap.
        $content = $response->content if $response->is_success;
    }
    else {
        $content = get($url);
    }
    die("Sitemap not found at $url.") if(!$content);
    print("Processing XML file $url.\r\n");
    my $parser = XML::Simple->new;
    return $parser->XMLin($content);
}
# Render one value as a CSV field.
# Undefined values and references become an empty field (XML::Simple hands
# back an empty hash reference for an empty element such as <lastmod/>).
# Values containing a comma, double quote or line break are wrapped in
# double quotes with embedded quotes doubled, so they cannot split a row.
sub _CsvField {
    my ($value) = @_;
    return '' if !defined $value || ref $value;
    if ($value =~ /[",\r\n]/) {
        $value =~ s/"/""/g;
        return qq{"$value"};
    }
    return $value;
}

# Return the hash-reference entries stored under a parsed XML node.
# XML::Simple yields a hash reference for a single child element and an
# array reference for repeated ones; both shapes are flattened to a list.
sub _HashRows {
    my ($node) = @_;
    return () if !defined $node;
    my @rows = ref($node) eq 'ARRAY' ? @{$node} : ($node);
    return grep { ref($_) eq 'HASH' } @rows;
}

# Write the URLs of a parsed sitemap to a CSV file handle.
#
# Arguments:
#   $xml - structure returned by ReadXml() for a sitemap or sitemap index.
#   $fh  - output file handle opened for writing.
#
# Every <sitemap> entry whose location ends in ".xml" is downloaded with
# ReadXml() and processed recursively; other <sitemap> entries are dumped to
# standard output. Every <url> entry becomes one CRLF-terminated row with the
# columns loc, lastmod, changefreq, priority (missing values are left empty).
sub ParseXml {
    my ($xml, $fh) = @_;

    for my $entry (_HashRows($xml->{sitemap})) {
        my $location = $entry->{loc};
        if (defined $location && !ref $location && $location =~ /\.xml$/) {
            my $child = ReadXml($location);
            ParseXml($child, $fh);
            $fh->flush();
        }
        else {
            # Not a plain .xml sitemap: show the entry instead of following it.
            print Dumper($entry);
        }
    }

    for my $page (_HashRows($xml->{url})) {
        my @fields = map { _CsvField($page->{$_}) }
            qw(loc lastmod changefreq priority);
        print {$fh} join(',', @fields) . "\r\n";
    }
    return;
}
# Entry point: validate the URL argument, work out the sitemap location and
# write every URL found to sitemap.csv in the current directory.
# The defined() check keeps a missing argument from raising an
# "uninitialized value" warning before the usage error is reported.
die("Please supply a website URL.") if(!defined $ARGV[0] || $ARGV[0] !~ /:\/\//);
print("Reading sitemap for $ARGV[0].\r\n");
my $url = $ARGV[0];
# Anything that does not already name an .xml file is treated as a site root
# and gets /sitemap.xml appended (collapsing an existing trailing slash).
if($url !~ /\.xml$/) {
    $url =~ s/\/?$/\/sitemap.xml/;
}
my $filename = 'sitemap.csv';
open(my $fh, '>', $filename) or die "Could not open file '$filename' $!";
my $xml = ReadXml($url);
ParseXml($xml, $fh);
# Buffered write errors only surface when the handle is closed, so check it.
close($fh) or die "Could not close file '$filename' $!";
Then you can execute it from the command line with the following format:
// This will automatically append /sitemap.xml
./sitemap-to-csv.pl http://domain.com
// Or specify the exact file.
perl sitemap-to-csv.pl http://domain.com/sitemap.xml
// With SSL domain authentication override.
PERL_LWP_SSL_VERIFY_HOSTNAME=0 ./sitemap-to-csv.pl https://domain.com
No comments:
Post a Comment