#!/usr/bin/perl -w
use strict;
use threads ('yield',
'stack_size' => 64*4096,
'exit' => 'threads_only',
'stringify');
use POSIX qw(strftime);
my $verbose = 0;
my $rsync_first = 0;
my $cpus_to_use = 16; # level of parallelism
my $bzcat_grouping = 10; # files to pass to bzcat at once
my $path_to_log_tree;
my $threaded = 1;
my %month_to_num = (
'Jan' => '01',
'Feb' => '02',
'Mar' => '03',
'Apr' => '04',
'May' => '05',
'Jun' => '06',
'Jul' => '07',
'Aug' => '08',
'Sep' => '09',
'Oct' => '10',
'Nov' => '11',
'Dec' => '12',
);
my %products;
my %allversions;
my %all_files_list;
my %date_product_count;
my %date_version_count;
my $total_downloads;
# FIXME: ODF is -incredibly- lame in this regard ... we badly want R1C1 style referencing here [!]
sub coltoref($)
{
my $col = shift;
die ("odff needs R1C1") if ($col > 25);
return chr (ord('A') + $col);
}
sub print_date_cell($$)
{
my ($style,$date) = @_;
# sadly not truly a date but a year + ISO week number.
print << "EOF"
$date
EOF
;
}
sub find_logs($);
sub find_logs($)
{
my $path = shift;
my $dirh;
my @logfiles;
if (-l $path) {
$path = readlink $path;
}
if (-f $path ) {
if ($path =~ m/documentfoundation\.org.*[0-9][-_]access[_.]log/) {
if ($verbose) {
print STDERR "hit: $path\n";
}
return $path;
} else {
return;
}
}
if (!-d $path) {
return;
}
opendir ($dirh, $path) || die "can't open '$path': $!";
while (my $name = readdir ($dirh)) {
next if ($name =~ m/^\./);
push @logfiles, find_logs("$path/$name");
}
close ($dirh);
return @logfiles;
}
sub is_uninteresting_file($)
{
my $file = shift;
return 1 if ( $file =~ /^$/ );
return 1 if ( $file =~ /^{/ );
return 1 if ( $file =~ /^%/ );
return 1 if ( $file =~ /^debian-repo\/testing\// );
return 1 if ( $file =~ /^\/libreoffice\/old\// );
return 1 if ( $file =~ /^\/libreoffice\/src\// );
return 1 if ( $file =~ /^\/robots\.txt$/ );
return 1 if ( $file =~ /\/index\.php$/ );
return 1 if ( $file =~ /\/a\.sh$/ );
return 1 if ( $file =~ /^\/TIMESTAMP/ );
# ignore source
return 1 if ( $file =~ m|/src/| );
# ignore android remote
return 1 if ( $file =~ m/ImpressRemote.apk$/ );
# anywhere
return 1 if ( $file =~ /\/customer_testimonials.php/ );
# anything that is missing an extension (directory names, metafiles) and slash
return 1 if ( $file =~ /\/[^.\/]+$/ );
return 1 if ( $file =~ /^[^\/]+$/ );
# not interesting extensions
return 1 if ( $file =~ /\/$/ );
return 1 if ( $file =~ /\?C=[MNS];O=[AD]$/ );
return 1 if ( $file =~ /\.asc$/ );
return 1 if ( $file =~ /\.btih$/ );
return 1 if ( $file =~ /\.css$/ );
return 1 if ( $file =~ /\/favicon\.ico$/ );
return 1 if ( $file =~ /\.gif$/ );
return 1 if ( $file =~ /\.gpg$/ );
return 1 if ( $file =~ /\.html$/ );
return 1 if ( $file =~ /\.info\.php$/ );
return 1 if ( $file =~ /\.log$/ );
return 1 if ( $file =~ /\.magnet$/ );
return 1 if ( $file =~ /\.md5$/ );
return 1 if ( $file =~ /\.meta4$/ );
return 1 if ( $file =~ /\.metalink$/ );
return 1 if ( $file =~ /\.mirrorlist$/ );
return 1 if ( $file =~ /\/Packages$/ );
return 1 if ( $file =~ /\/Packages\.bz2$/ );
return 1 if ( $file =~ /\/Packages\.gz$/ );
return 1 if ( $file =~ /\/Packages\.lzma$/ );
return 1 if ( $file =~ /\/Packages\.xz$/ );
return 1 if ( $file =~ /\.png$/ );
return 1 if ( $file =~ /\/Release$/ );
return 1 if ( $file =~ /\.sha1$/ );
return 1 if ( $file =~ /\.sha256$/ );
return 1 if ( $file =~ /\.torrent$/ );
return 1 if ( $file =~ /\.zsync$/ );
# noise
return 1 if ( $file =~ /%/ );
return 1 if ( $file =~ /&/ );
# is interesting ...
return 0;
}
sub characterise($$)
{
my ($filerec, $file) = @_;
# currently based entirely on the filename
$file =~ m|/([^/]+)$| || die "not a filename: '$file'";
my $name = $1;
$name =~ s/BrOffice/LibO/; # BrOffice is obsolete
$name =~ s/-/_/g; # use underscores everywhere
my @elements = split(/_/, $name);
if (@elements < 2) {
print STDERR "Unknown filename '$name'\n";
return 0;
}
my $prod = $elements[0];
if ($prod eq 'LibO' &&
( $elements[1] eq 'SDK' || $elements[1] eq 'Dev' ) ) { # ignore sdk + dev-builds
return 0;
} elsif ( $prod eq 'LibreOfficePortableTest') { # ignore test builds
return 0;
# Odd - legacy stuff
} elsif ($prod eq 'libreoffice' && (
$name =~ m/\.tar\.gz$/ ||
$name =~ m/\.tar.bz2$/ ||
$name =~ m/\.tar\.xz$/)) { # source
return 0;
# obsolete snafu
} elsif ($prod eq 'libo3.4.4' && $name =~ /\.iso$/) {
$filerec->{version} = '3.4.4';
$filerec->{product} = 'Win-dvd';
# LibreOffice portable
} elsif ($prod eq 'LibreOfficePortable') {
if ($name =~ m/(\d\.\d\.\d).*\.exe$/) {
$filerec->{version} = $1;
$filerec->{product} = 'Win-portable';
} else {
print STDERR "Unknown portable version in '$name'\n";
return 0;
}
# Bread and butter:
} elsif ($prod eq 'LibO' || $prod eq 'LibreOffice' ||
$prod eq 'LO' || $prod eq 'LibOx') {
$filerec->{version} = $elements[1];
my $product;
if ($name =~ m/\.iso$/) {
if ($name =~ m/allproducts/) {
$product = "All-dvd";
} else {
$product = "Win-dvd";
}
} elsif ($name =~ m/Win_x86/) {
$product = "Win-x86";
} elsif ($name =~ m/Linux_x86-64/) {
$product = "Linux-x86-64";
} elsif ($name =~ m/Linux_x86/) {
$product = "Linux-x86";
} elsif ($name =~ m/MacOS_x86/) {
$product = "Mac-x86";
} elsif ($name =~ m/MacOS_PPC/) {
$product = "Mac-PPC";
} else {
print STDERR "Unknown product for '$name'\n";
}
$filerec->{product} = $product;
} else {
print STDERR "Unknown initial element '$prod' of '$name'\n";
return 0;
}
# characterise helppacks and langpacks
$name =~ s/helppack/langpack/g; # destructive !
$filerec->{langpack} = 0;
if ($name =~ /_langpack_/ ) {
$filerec->{langpack} = 1;
}
# print STDERR "'$name' is a lang-pack: " . $filerec->{langpack} . "\n";
return 1;
}
sub parse_log($)
{
my $log = shift;
my @files;
# in order to get a good representation of weeks at the start and end of the
# year (so that we don't get 1/2 of the data at the end, and 1/2 at the start
# of the next one), we use "epoch_week" - week since 1970-01-01 (1st week)
my $old_date = "";
my $epoch_week;
my %epoch_week_to_year;
while (<$log>) {
my $line = $_;
if ( $line =~ m/^([^ ]+) - - \[([^\/]+)\/([^\/]+)\/([^:]+):([0-9][0-9])[^\]]*\] "GET ([^"]*) HTTP\/[^"]*" ([0-9]+) ([0-9]+)/ ) {
#print "$1, $2, $3, $4, $5, $6\n";
my ( $ip, $day, $month, $year, $hour, $file, $status, $size ) = ( $1, $2, $month_to_num{$3}, $4, $5, $6, $7, $8 );
# we are interested only in redirects and successful downloads
next if ( $status != 302 && $status != 200 && $status != 206 );
# partial download? - only count when it finished
if ( $status == 206 )
{
if ( $line =~ / size:([0-9]+) bytes=([0-9]+)-([0-9]*)$/ )
{
my ( $wanted, $from ) = ( $1, $2 );
next if ( $wanted != $from + $size );
}
else {
next;
}
}
# canonicalize
$file =~ s/^\s+//;
$file =~ s/\s+$//;
$file =~ s/^http:\/\/download.documentfoundation.org//g;
$file =~ s/\/\//\//g;
$file =~ s/\?.*//g;
$file =~ s/;jsessionid=.*//g;
$file =~ s/\/libreoffice\/box\///g;
$file =~ s/\/libreoffice\/old\///g;
$file =~ s/\/libreoffice\/portable\///g;
$file =~ s/\/libreoffice\/stable\///g;
$file =~ s/\/libreoffice\/testing\///g;
# not interesting path starts
next if ( is_uninteresting_file ($file) );
my %filerec;
$filerec{file} = $file;
next if ( ! characterise(\%filerec, $file) );
# update the $epoch_week, if necessary
if ( "$year-$month-$day" ne $old_date ) {
$old_date = "$year-$month-$day";
# 1970-01-01 is Thursday, add 3 days (259200 seconds), and divide
my $seconds = POSIX::strftime( "%s", 0, 0, 12, $day, $month - 1, $year - 1900 ); # see the manual
$epoch_week = sprintf( "%d", ($seconds + 259200) / 604800 );
# remember the week
my $week = POSIX::strftime( "%V", 0, 0, 12, $day, $month - 1, $year - 1900 ); # see the manual
$epoch_week_to_year{$epoch_week} = sprintf( "$year-w%02d", $week );
}
$filerec{date} = $epoch_week;
$filerec{pretty_date} = $epoch_week_to_year{$epoch_week};
push @files, \%filerec;
}
# elsif ($verbose) { # don't touch a global variable it's bad news
# print STDERR "invalid line in apache logs: '$line'\n";
# }
}
return @files;
}
sub parse_type($@)
{
my $type = shift;
my @file_list = @_;
my @results;
my $log;
while (@file_list) {
my $files = "";
for (my $i = 0; $i < $bzcat_grouping; $i++) {
my $file = shift (@file_list) || next;
$files = "$files $file";
}
open ($log, "$type $files |") || die "Can't '$type $files': $!";
push @results, parse_log($log);
close $log;
print STDERR ".";
}
return @results;
}
sub parse_logs($)
{
my $filelist = shift;
my @results;
my @bzipped;
my @gzipped;
for my $file (@{$filelist}) {
if ($file =~ m/\.bz2$/) {
push @bzipped, $file;
} elsif ($file =~ m/\.gz$/) {
push @gzipped, $file;
} else {
my $log;
open ($log, "$file") || die "Can't open '$file': $!";
push @results, parse_log($log);
close $log;
}
}
push @results, parse_type('bzcat', @bzipped);
push @results, parse_type('zcat', @gzipped);
return \@results;
}
sub merge_results($)
{
my $list = shift;
for my $filerec (@{$list}) {
# without helppacks and langpacks
next if ( $filerec->{langpack} );
# build list of files
my $file = $filerec->{file};
if (!defined $all_files_list{$file}) {
$all_files_list{$file} = 0;
}
$all_files_list{$file}++;
my $date = $filerec->{pretty_date};
my $ver = $filerec->{version};
my $product = $filerec->{product};
# accumulate products
$products{$product} = 1;
# aggregate versions
$allversions{$ver} = 1;
$total_downloads++;
# aggregate by product
if ( !defined( $date_product_count{$date} ) ||
!defined( $date_product_count{$date}{$product} ) ) {
$date_product_count{$date}{$product} = 0;
}
++$date_product_count{$date}{$product};
# aggregate by version
if ( !defined( $date_version_count{$date} ) ||
!defined( $date_version_count{$date}{$ver} ) ) {
$date_version_count{$date}{$ver} = 0;
}
++$date_version_count{$date}{$ver};
}
}
sub spawn_parse_log_thread($)
{
my ($file_list) = @_;
return threads->create( { 'context' => 'list' },
sub { return parse_logs($file_list); } );
}
while (my $arg = shift @ARGV) {
if ($arg eq '-v' || $arg eq '--verbose') {
$verbose = 1;
} elsif ($arg eq '-c' || $arg eq '--cpus') {
$cpus_to_use = shift @ARGV;
} elsif ($arg eq '-u' || $arg eq '--update') {
$rsync_first = 1;
} elsif (!defined $path_to_log_tree) {
$path_to_log_tree = $arg;
} else {
die "Unknown parameter '$arg'";
}
}
if (!defined $path_to_log_tree) {
$path_to_log_tree = `pwd`;
chomp ($path_to_log_tree);
$path_to_log_tree = "$path_to_log_tree/downloads";
}
# update first
if ($rsync_first) {
system('rsync --delete -av bilbo.documentfoundation.org:/var/log/apache2/download.documentfoundation.org/ downloads/download.documentfoundation.org/ 1>&2');
system('rsync --delete -av bilbo.documentfoundation.org:/var/log/apache2/downloadarchive.documentfoundation.org/ downloads/downloadarchive.documentfoundation.org/ 1>&2');
system('rsync -av bilbo2.documentfoundation.org:/var/log/apache2/download.documentfoundation.org/ downloads/bilbo2.documentfoundation.org/ 1>&2');
}
my @log_filenames = find_logs ($path_to_log_tree);
if ($verbose) {
print STDERR "Have log paths of:\n\t" . (join("\n\t", @log_filenames)) . "\n";
}
# the slow piece - parsing the logs
my $files_in = @log_filenames;
my $parallel = $cpus_to_use;
print STDERR "reading log data $files_in files:\n";
if ($threaded) {
# divide up the work first.
my @thread_files;
for (my $i = 0; $i < $parallel; $i++) {
my @foo; $thread_files[$i] = \@foo;
}
while (@log_filenames) {
for (my $i = 0; $i < $parallel; $i++) {
my $file = shift (@log_filenames) || next;
push @{$thread_files[$i]}, $file;
}
}
my @threads;
for (my $i = 0; $i < $parallel; $i++) {
my $file_list = $thread_files[$i];
if (scalar (@{$file_list}) > 0) {
push @threads, spawn_parse_log_thread ($file_list);
}
}
print STDERR "joining threads: ";
while (@threads) {
my $thread = shift @threads;
merge_results($thread->join());
print STDERR "joined";
}
print STDERR "\n";
} else {
merge_results(parse_logs(\@log_filenames));
}
my $generated_stamp = "Generated on: " . qx(date --rfc-3339=seconds);
# ---------------------------------------------------------------------------------
# now output this as a spreadsheet ... fods ...
print << "EOF"
-
-
%
$generated_stamp
Total downloads:
Date
EOF
;
# ---------------------------------------------------------------------------------
# By Product sheet
my @prods = sort keys %products;
for my $product (@prods) {
print << "EOF"
$product
EOF
;
}
print << "EOF"
Total
EOF
;
my $row = 1;
my $colcount = @prods;
my $colname = coltoref ($colcount);
# print STDERR "cols: $colcount - colname $colname @prods\n";
for my $date (sort keys %date_product_count) {
print << "EOF"
EOF
;
print_date_cell("isodate", $date);
for my $product (@prods) {
my $count = $date_product_count{$date}->{$product};
$count = 0 if (!defined $count);
print << "EOF"
EOF
;
}
$row++;
print << "EOF"
EOF
;
}
# Summary / formulae
{
print << "EOF"
Total
EOF
;
my $col;
for ($col = 1; $col <= $colcount + 1; $col++) {
my $ref = coltoref ($col);
print (" \n");
}
print << "EOF"
EOF
;
}
# Summary as %ages ...
{
print << "EOF"
Percent
EOF
;
my $col;
$row++;
my $totalref = coltoref($colcount + 1) . "$row";
for ($col = 1; $col <= $colcount + 1; $col++) {
my $ref = coltoref ($col);
print (" \n");
}
print << "EOF"
EOF
;
}
# ---------------------------------------------------------------------------------
# By version sheet
# First collapse trivial / invalid versions - under 0.2%
my @todelete = ();
my $threshold = (2 * $total_downloads) / 1000;
for my $version (keys %allversions) {
my $total = 0;
for my $date (keys %date_version_count) {
my $count = $date_version_count{$date}->{$version};
$count = 0 if(!defined $count);
$total = $total + $count;
}
if ($total < $threshold) {
# print STDERR "collapsing trivial version '$version' count $total into 'invalid'\n";
push @todelete, $version;
for my $date (keys %date_version_count) {
my $count = $date_version_count{$date}->{$version};
if (defined $count) {
if (!defined $date_version_count{$date}->{'invalid'}) {
$date_version_count{$date}->{'invalid'} = $count;
} else {
$date_version_count{$date}->{'invalid'} += $count;
}
}
}
}
}
if (@todelete) {
for my $version (@todelete) {
delete $allversions{$version};
}
$allversions{'invalid'} = 1; # so we get the result
}
print << "EOF"
Date
EOF
;
for my $version (sort keys %allversions) {
print << "EOF"
$version
EOF
;
}
print << "EOF"
EOF
;
for my $date (sort keys %date_version_count) {
print << "EOF"
EOF
;
print_date_cell("isodate", $date);
for my $ver (sort keys %allversions) {
my $count = $date_version_count{$date}->{$ver};
$count = 0 if(!defined $count);
print << "EOF"
EOF
;
}
print << "EOF"
EOF
;
}
print << "EOF"
EOF
;
# ---------------------------------------------------------------------------------
# misc. debugging / information
print << "EOF"
Name
count
EOF
;
for my $file (sort { $all_files_list{$b} <=> $all_files_list{$a} } keys %all_files_list) {
my $count = $all_files_list{$file};
print << "EOF"
$file
EOF
;
}
print << "EOF"
EOF
;
# ---------------------------------------------------------------------------------
# end of spreadsheet ...
print << "EOF"
EOF
;