#!/usr/bin/perl -w
# -*- Mode: perl -*-
#template by ~tconnors/bin/newplscript
#Fri Nov 27 23:01:20 EST 2009

# $Revision: 1.13 $ $Date: 2024/06/28 02:48:59 $
# $Id: md5sumcache,v 1.13 2024/06/28 02:48:59 tconnors Exp $
# $Header: /home/tconnors/cvsroot/bin/md5sumcache,v 1.13 2024/06/28 02:48:59 tconnors Exp $
# $RCSfile: md5sumcache,v $

# This program prints the md5sums of the files supplied on the
# command line (or on stdin, one per line), reading each sum from a
# prior cache whenever the file is already cached with a matching
# mtime and size
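#
# Usage (hypothetical paths):
#   md5sumcache /data/photo.jpg /data/notes.txt
#   find /data -type f -print | md5sumcache
# Output is "<md5sum> <filename>", one line per file.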

use strict;
use warnings;
use File::Temp qw/:mktemp/;
use POSIX;

# helper predicates (currently unused in this script)
sub isNum {
  # true iff the argument is a non-negative decimal integer
  ($_[0] =~ /^[\d]+$/);
}

sub isHexNum {
  # true iff the argument consists only of hex digits (e.g. an md5sum)
  ($_[0] =~ /^[\dA-Fa-f]+$/);
}

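# debug tracing is enabled by setting the DEBUG environment variable
# to any value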
my $debug= defined $ENV{DEBUG};

sub debug {
  print STDERR "@_" if $debug;
}

my $basecache=".md5sumcache";
my $cache="$ENV{HOME}/$basecache";
my (%known_files,%known_md5sums,%flag_possible_removal);
my (%cached_mtime,%cached_size,%cached_md5sum,%stat);

my @obtain_real_md5sums;

my $SHORTHOST=`hostname`;
chomp $SHORTHOST;
$SHORTHOST =~ s/\..*//;  # keep only the short name in case hostname(1) returns an FQDN
# /net/host is equivalent to ..../sshfs/*@host.*/.  Let's merge them
# as far as the hash is concerned (but do the stat on the real
# referenced file, and output the real filename, since /net/host might
# not have been available on this network when the symlink that caused
# us to reference it was originally created).  Always assume that
# /net/<host> is currently available - we don't want to be
# deduplicating across ssh.
sub munge_sshfs {
  my $filename = shift;
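  # capture the short host out of a ".../sshfs/[user@]host[.domain]/"
  # path component and rewrite everything up to it as "/net/host/"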
  $filename =~ s/.*\/sshfs\/[^@\/]*?@?([^.@\/]*)\.?[^@\/]*\//\/net\/$1\//;
  # also remove /net/hostname on the current host
  $filename =~ s/\/net\/\Q$SHORTHOST\E//;  # \Q protects any regex metachars in the hostname
  return $filename;
}
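
# For example (hypothetical paths), munge_sshfs maps
#   /home/tconnors/sshfs/tconnors@apollo.example.com/data/a.txt
# to /net/apollo/data/a.txt, and when running on host "apollo" itself
# that in turn becomes /data/a.txt.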

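# Each cache line holds "<mtime> <size> <md5sum> <filename>", split on
# whitespace with the filename kept whole, e.g. (hypothetical entry):
#   1259330480 0 d41d8cd98f00b204e9800998ecf8427e /net/apollo/data/empty.txt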
sub read_cache {
  if (-e $cache) {
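    # the cache is written gzip-compressed below; "mycat" (a personal
    # wrapper script, assumed here) presumably decompresses it
    # transparently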
    if (open(FH, "mycat '$cache'|")) {
      while (<FH>) {
        chomp;
        my ($mtime, $size, $md5sum, $filename)=split(' ', $_, 4);
        next if ! defined $filename;

        $cached_mtime{$filename} =$mtime;
        $cached_size{$filename}  =$size;
        $cached_md5sum{$filename}=$md5sum;
        $stat{$filename}=0;  # a placeholder which we fill when asked
                             # to return the md5sum from a file
      }
      close FH;
    }
  } else {
    warn "\nmd5sumcache: No $cache exists yet";
  }
}

sub cached_stat {
  my ($filename)=(@_);

  my $munged_filename = munge_sshfs($filename);

  # %stat is keyed by the munged filenames read from the cache, so the
  # placeholder test and fill must both use $munged_filename (keying by
  # the raw name here could leave the placeholder unfilled and later
  # dereference the 0 below)
  if (defined $stat{$munged_filename} && ($stat{$munged_filename} == 0)) {
    # we haven't filled the placeholder yet - but we do know that the
    # file exists in the cache, so stat it now

    debug "\nstat($filename)";
    my @stat = stat($filename);    # stat the real referenced file

    $stat{$munged_filename}=\@stat; #can cache negative stats too - we
                                    #don't discard them because we
                                    #always want to find where in the
                                    #filesystem something was stored
                                    #prior to it having been deleted or
                                    #"disappeared"

    my $mtime  = $cached_mtime{$munged_filename};
    my $size   = $cached_size{$munged_filename};
    my $md5sum = $cached_md5sum{$munged_filename};
    if (@stat) {
      $known_md5sums{$md5sum}=1;
      #        if (defined $known_files{$filename}) {
      #          debug "\nRedefining $filename";
      #        }
      $known_files{$munged_filename}=[ $mtime, $size, $md5sum ]; # mtime, size and md5sum here
                                                                 # are from the cache file
    } else {
      # file from cache not found, so flag for its removal from the
      # history list if found to be a duplicate

      # later on, we go through this list and write the entries
      # back to the cache file for any that don't have duplicated
      # md5sums.  Hash by md5sum so we only store one copy,
      # preferentially storing one with an mtime.
      if ($mtime || !defined $flag_possible_removal{$md5sum}) {
        $flag_possible_removal{$md5sum}=[ $mtime, $size, $munged_filename ];  # the cache holds munged filenames
      }
    }
  }

  if (defined $stat{$munged_filename}) {
    return @{$stat{$munged_filename}}
  } else {
    debug "\nstat($filename)";
    return stat($filename);
  }
}

sub add_filename {
  my ($filename)=(@_);
  my $known=0;
  my $munged_filename = munge_sshfs($filename);
  if (defined $stat{$munged_filename}) {
    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks)
      = cached_stat($filename); #mtime here is from the current stat()
    if ((defined $mtime) &&
        ($mtime eq $known_files{$munged_filename}[0]) &&
        ($size eq $known_files{$munged_filename}[1])) {
      print "$known_files{$munged_filename}[2] $filename\n"; #exists with right mtime and size in cache
      debug "\nFound stat($filename) matching in cache";
      $known=1;
    }
  }
  if (!$known) {
    debug "\nNot already known: $filename";
    push @obtain_real_md5sums, $filename;
  }
}

debug "Reading cache\n";
read_cache;
if (@ARGV) {
  foreach my $file (@ARGV) {
    add_filename ($file);
  }
} else {
  foreach my $file (<STDIN>) {
    chomp $file;
    add_filename ($file);
  }
}

if (@obtain_real_md5sums) {
  my ($md5fh, $md5fn) = mkstemp("/tmp/${basecache}.tmp.XXXXXX") or die "\nmd5sumcache: Can't mkstemp /tmp/${basecache}.tmp.XXXXXX: $!";
  #FIXME: optionally strip all EXIF info from file comparing
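  # GNU md5sum escapes backslashes and newlines in filenames and marks
  # such lines with a leading "\"; the sed appears intended to undo
  # that escaping by collapsing each "\<char>" pair to "<char>" so the
  # parse below sees the raw filename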
  open(FHO, "| xargs -0 --no-run-if-empty md5sum | sed 's/\\\\\\(.\\)/\\1/g' > $md5fn") or die "\nmd5sumcache: can't open xargs pipe to md5sum: $!";
  foreach my $filename (@obtain_real_md5sums) {
    print FHO "$filename\0" or last;
    debug "\nDidn't find $filename in cache.";
  }
  close(FHO) or die "\nmd5sumcache: can't complete xargs pipe to md5sum";

  open(FHI, $md5fn) or die "\nmd5sumcache: can't read md5sum temp file: $!";
  while (<FHI>) {
    chomp;   # output is newline-delimited
    my ($md5sum, $filename) = split(' ', $_, 2);

    my $munged_filename = munge_sshfs($filename);

    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
        $atime,$mtime,$ctime,$blksize,$blocks)
      = defined $stat{$munged_filename} && $stat{$munged_filename} ? @{$stat{$munged_filename}} : stat($filename);
#    if (defined $known_files{$munged_filename}) {
#      debug "\nRedefining $filename";
#    }
    if ($mtime) {  # may have disappeared.
      $known_files{$munged_filename}=[$mtime, $size, $md5sum];
      $known_md5sums{$md5sum}=1;
      print "$md5sum $filename\n";
    }
  }
  close FHI;
  unlink $md5fn or die "\nmd5sumcache: Can't unlink $md5fn: $!";
}

close STDIN;
close STDOUT;
#      close STDERR; # keep stderr available for die messages;
#      hopefully nobody is capturing our output together with stderr.

# if files have been flagged for removal or if we obtained new
# md5sums, then write the changes to the database in the background
if (%flag_possible_removal || @obtain_real_md5sums) {
  # ideally, exit as soon as we can and delegate the cache write to a
  # forked child; the fork below is currently commented out, so we
  # write synchronously before exiting
#  if (fork > 0) {  #ie, if an error (we don't care if there is, we
#                   #just continue writing the file as is), or if we
#                   #are the child, then continue, otherwise the parent
#                   #exits closing all its FDs (hopefully the child
#                   #will close all FDs that are being captured too).
#    _exit(0);  #stop the expensive memory garbage collection at end
#  } else {
    # transfer those cached results over to the current list for all
    # files we didn't end up reading ourselves
    foreach my $filename (keys %stat) {
      if (!defined $known_files{$filename}) {
        $known_files{$filename}=[ $cached_mtime{$filename}, $cached_size{$filename}, $cached_md5sum{$filename} ];
      }
    }
    # clear some memory that won't be used again as quickly as
    # possible, as the system is likely to be stressed by now, and the
    # exiting parent will be allowing the caller to start continuing
    # now, wanting its memory back
    %stat=();
    %cached_mtime=();
    %cached_size=();
    %cached_md5sum=();
    @obtain_real_md5sums=();   # () actually empties the list; =undef would leave a single undef element

    debug "\nWriting cache.";
    open(FHO, "|gzip -c > ${cache}~") or die "\nmd5sumcache: Can't open ${cache}~ for write: $!";
    # sort so that the file compresses as well as possible
    foreach my $filename (sort {
      ($a cmp $b) ||
        ($known_files{$a}[2] cmp $known_files{$b}[2]) ||
          ($known_files{$a}[1] <=> $known_files{$b}[1]) ||
          ($known_files{$a}[0] <=> $known_files{$b}[0])
        } (keys %known_files)) {
      #filename already munged
      my ($mtime,$size,$md5sum)=@{$known_files{$filename}};
      print FHO "$mtime $size $md5sum $filename\n" or last;
    }
    foreach my $md5sum (sort {
      ($flag_possible_removal{$a}[2] cmp $flag_possible_removal{$b}[2]) ||
        ($a cmp $b) ||
          ($flag_possible_removal{$a}[1] <=> $flag_possible_removal{$b}[1]) ||
          ($flag_possible_removal{$a}[0] <=> $flag_possible_removal{$b}[0])
        } (keys %flag_possible_removal)) {
      if (!$known_md5sums{$md5sum}) {
        # file no longer exists at all, so print it out as a
        # historical reference.  If it does exist, the file sharing
        # that md5sum has already been printed out, and we can trace
        # it that way
        my ($mtime,$size,$filename)=
          ($flag_possible_removal{$md5sum}[0],$flag_possible_removal{$md5sum}[1],$flag_possible_removal{$md5sum}[2]);
        #filename already munged
        print FHO "$mtime $size $md5sum $filename\n" or last;
      }
    }
    close FHO or die "\nmd5sumcache: Can't close ${cache}~ for write: $!";

    rename "${cache}~", $cache or die "\nmd5sumcache: Can't rename ${cache}~, $cache: $!";
#  }
}

# POSIX::_exit skips perl's expensive end-of-program garbage
# collection.  Why would perl garbage collect at exit even though the
# process is about to go away?  Because otherwise non-memory
# destructors wouldn't get a chance to run.  We don't care about such
# destructors here - and we have lots of circular references, which
# would make global destruction slow.
_exit(0);
