#!/usr/bin/perl

# This script can be used on a mirror (or e.g. merkel.debian.org) to
# produce an overview of the size of the archive. The numbers reflect
# the "raw" size of the archive: the size of packages and source files.
# It does not include the size of files containing meta data, nor of
# various separate directories.

# Copyright (c) 2009 Frans Pop <fjp@debian.org>

use warnings;
use strict;
use Cwd;
use Getopt::Long;
use File::Temp qw/ tempfile /;
use Compress::Zlib;

our @arches= qw(source all i386 amd64 alpha arm armel hppa ia64 m68k mips mipsel powerpc s390 sparc kfreebsd-i386 kfreebsd-amd64);
our @sects= qw(main contrib non-free main/debian-installer);
our @suites= qw(oldstable stable testing unstable);
our %dists;
our (@source_files, @package_files);
our (%files, %sizes, %total, %width);

our $root="/srv/ftp.debian.org/ftp/dists";
chdir($root) or die "chdir $root: $!";

my ($tfh, $tfile) = tempfile();
END { unlink $tfile if $tfile }

### Collect the data

foreach my $suite (@suites) {
  next unless -f "$root/$suite/Release";
  if (open my $release, "<", "$root/$suite/Release") {
    while (<$release>) {
      if (/^Codename:/) {
	($dists{$suite}) = m/^Codename:\s+(.*)/i;
	last;
      }
    }
    close $release;
  }

  foreach my $sect (@sects) {
    next unless -d "$root/$suite/$sect";
    print(STDERR "Processing $suite $sect\n");
    foreach my $arch (@arches) {
      my $file;
      if ($arch eq "source") {
	$file = "source/Sources.gz";
	next unless -f "$root/$suite/$sect/$file";
	parse_sources($file, $suite, $sect);
      } else {
	$file = "binary-$arch/Packages.gz";
	next unless -f "$root/$suite/$sect/$file";
	parse_packages($file, $suite, $sect, $arch);
      }
    }
  }
}

### Print the tables

foreach my $suite (@suites) {
  next unless exists $dists{$suite};
  $width{$suite} = exists $sizes{"d$suite"} ? 16 : 6;
}

print("Total archive size (binary + source) per section:\n\n");
printf("%10s ", "(in MiB)");
foreach my $suite (@suites) {
  next unless exists $dists{$suite};
  printf("| %$width{$suite}s ", $dists{$suite});
}
printf("| %6s\n", "all");
print_ruler();
foreach my $sect (@sects) {
  next unless exists $sizes{all}{$sect};
  if ($sect eq "main/debian-installer") {
    printf("%-10s", "main/d-i");
  } else {
    printf("%-10s", $sect);
  }
  foreach my $suite (@suites) {
    next unless exists $dists{$suite};
    if (exists $sizes{$suite}{$sect}) {
      $total{$suite} += $sizes{$suite}{$sect};
      printf(" | %6i", int((1 + $sizes{$suite}{$sect}) /1024/1024));
      if (exists $sizes{"d$suite"}{$sect}) {
        $total{"d$suite"} += $sizes{"d$suite"}{$sect};
        printf(" (+%6i)", int((1 + $sizes{"d$suite"}{$sect}) /1024/1024));
      }
    } else {
      print(" | " . (" " x $width{$suite}));
    }
  }
  printf(" | %6i\n", int((1 + $sizes{all}{$sect}) /1024/1024));
  $total{all} += $sizes{all}{$sect};
}
print_ruler();
printf("%-9s ", "total");
foreach my $suite (@suites) {
  next unless exists $dists{$suite};
  printf(" | %6i", int((1 + $total{$suite}) /1024/1024));
  printf(" (+%6i)", int((1 + $total{"d$suite"}) /1024/1024)) if exists $total{"d$suite"};
}
printf(" | %6i", int((1 + $total{all}) /1024/1024)."\n");

print("\n\n");
print("Archive size per architecture (source and arch=all packages are shown separately):\n\n");
printf("%10s ", "(in MiB)");
foreach my $suite (@suites) {
  next unless exists $dists{$suite};
  printf("| %$width{$suite}s ", $dists{$suite});
}
printf("| %6s\n", "all");
print_ruler();
foreach my $arch (@arches) {
  next unless exists $sizes{all}{$arch};
  my $parch = $arch;
  $parch =~ s/kfree/k/;
  printf("%-10s", $parch);
  foreach my $suite (@suites) {
    next unless exists $dists{$suite};
    if (exists $sizes{$suite}{$arch}) {
      printf(" | %6i", int((1 + $sizes{$suite}{$arch}) /1024/1024));
      printf(" (+%6i)", int((1 + $sizes{"d$suite"}{$arch}) /1024/1024)) if exists $sizes{"d$suite"}{$arch};
    } else {
      printf(" | " . (" " x $width{$suite}));
    }
  }
  printf(" | %6i\n", int((1 + $sizes{all}{$arch}) /1024/1024));
}

my @ts=gmtime(time());
printf("\nAll numbers reflect the state of the archive per %i %s %i.\n", $ts[3],
       (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec))[$ts[4]], $ts[5] + 1900);

### Functions

sub print_ruler {
  print("-" x 11);
  foreach my $suite (@suites) {
    next unless exists $dists{$suite};
    print("|" . "-" x ($width{$suite} + 2));
  }
  print("|" . "-" x 7 . "\n");
  return;
}

sub parse_packages {
  my ($file, $suite, $sect, $arch) = @_;
  my ($line, $res, $size, $filename, $architecture);
  local $/ = "\n\n";
  system_redirect_io("gzip -d", "$root/$suite/$sect/$file", "$tfile");
  open(my $tfh, "<", $tfile) or die "$tfile: $!";
  for (;;) {
    my $buf;
    unless (defined( $buf = <$tfh> )) {
      last if eof;
      die "$file: $!" if $!;
    }
    $_ = $buf;
    ($filename) = m/^Filename:\s+(.*)/im;
    $filename =~ s:/+:/:; # remove redundant slashes in paths
    ($architecture) = m/^Architecture:\s+(.*)/im;
    ($size) = m/^Size:\s+(\d+)/im;

    if (! exists $files{$filename}{$suite}) {
      $sizes{$suite}{$sect} += $size;
      $sizes{$suite}{$architecture} += $size;
      if (($suite eq "stable" && exists $dists{oldstable} &&
	   ! exists $files{$filename}{oldstable}) ||
	  ($suite eq "testing" && exists $dists{stable} &&
	   ! exists $files{$filename}{stable}) ||
	  ($suite eq "unstable" && exists $dists{testing} &&
	   ! exists $files{$filename}{testing})) {
	$sizes{"d$suite"}{$sect} += $size;
	$sizes{"d$suite"}{$architecture} += $size;
      }
    }
    if (! exists $files{$filename}{x}) {
      $sizes{all}{$sect} += $size;
      $sizes{all}{$architecture} += $size;
    }
    $files{$filename}{x} = 1;
    $files{$filename}{$suite} = 1;
  }
  close($tfh);
  return;
}
sub parse_sources {
  my ($file, $suite, $sect) = @_;
  my ($line, $res, $size, $directory, $filename, $md5sum);
  local $/ = "\n\n";
  system_redirect_io("gzip -d", "$root/$suite/$sect/$file", "$tfile");
  open(my $tfh, "<", $tfile) or die "$tfile: $!";
  for (;;) {
    my $buf;
    unless (defined( $buf = <$tfh> )) {
      last if eof;
      die "$file: $!" if $!;
    }
    $_ = $buf;
    ($directory) = m/^Directory:\s+(.*)/im;
    while (m/^ ([A-Za-z0-9]{32} .*)/mg) {
      ($md5sum, $size, $filename)=split(' ', $1, 3);
      $filename = "$directory/$filename";
      $filename =~ s:/+:/:; # remove redundant slashes in paths

      if (! exists $files{$filename}{$suite}) {
	$sizes{$suite}{$sect} += $size;
	$sizes{$suite}{source} += $size;
	if (($suite eq "stable" && exists $dists{oldstable} &&
	     ! exists $files{$filename}{oldstable}) ||
	    ($suite eq "testing" && exists $dists{stable} &&
	     ! exists $files{$filename}{stable}) ||
	    ($suite eq "unstable" && exists $dists{testing} &&
	     ! exists $files{$filename}{testing})) {
	  $sizes{"d$suite"}{$sect} += $size;
	  $sizes{"d$suite"}{source} += $size;
	}
      }
      if (! exists $files{$filename}{x}) {
	$sizes{all}{$sect} += $size;
	$sizes{all}{source} += $size;
      }
      $files{$filename}{x} = 1;
      $files{$filename}{$suite} = 1;
    }
  }
  close($tfh);
  return;
}

# run system() with stdin and stdout redirected to files
# unlinks stdout target file first to break hard links
sub system_redirect_io {
  my ($command, $fromfile, $tofile) = @_;

  if (-f $tofile) {
    unlink($tofile) or die "unlink($tofile) failed: $!";
  }
  system("$command <$fromfile >$tofile");
  return;
}
