#!/usr/bin/env perl
#
# This file is a part of KleanSweep.
#
# Copyright (C) 2005-2006 Pawel Stolowski <pawel.stolowski@wp.pl>
#
# KleanSweep is free software; you can redestribute it and/or modify it
# under terms of GNU General Public License by Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY. See GPL for more details.
#

use strict;
use File::Find;
use File::Compare;
use File::Temp;
use Digest::MD5;
use Getopt::Long;

#
# Produces list of files matching various search criterias.
#
# each output line contains concatenated tags and filename:
# tag:filename
# ...
#
# Possible tags:
#
# b - backup file
# d<id> - duplicated file (id is a unique number; files with same ids are duplicates)
# l - locale file
# m - dead menu entry file
# t - obsolete thumbnails
# O - orphaned file
# s - broken symlink
# t - temporary file
# U - unused file
# x - broken executable
# y - empty dir 
# z - empty file
#
# Command-line options:
#
# -s                        search for broken symlinks
# -z                        search for empty files
# -y                        search for empty dirs
# -l                        search for unused locales
# -m                        search for broken menu entries
# -t                        search for obsolete thumbnails
# -b                        search for backup files 
# -d                        search for duplicated files
# -x                        search for broken executables
# --rpm                     perform rpm orphans check
# --portage                 perform emerge orphans check
# --deb                     perform deb orphans check
# --comp                    compare not only sums but contents when searching for uplicates
# --exclude <dir>           exclude dir from any checks
# --orphans-exclude <dir>   exclude dir from rpm/deb/portage orphans search
# --orphans-include <dir>   include dir in rpm/deb/portage orphans search
# --writable                ignore non-writable files
# --progress                show current directory
# --blacklist <file>        read blacklist from file
# 

our %sizes;
our %dupref;
our %mdsums;

our $seek_limit = 0;
our $seek_bexec = 0;
our $seek_bak = 0;
our $seek_locale = 0;
our $seek_menus = 0;
our $seek_thumbs = 0;
our $seek_empty_files = 0;
our $seek_empty_dirs = 0;
our $seek_dup = 0;
our $seek_symlinks = 0;
our $seek_orph = 0;
our $seek_orpms = 0; #temporary helper flag used for commandline option - seek_orph is really used later
our $seek_odebs = 0; #temporary helper flag - seek_orph is really used
our $seek_oemerge = 0;
our $seek_old = 0; # number of days, if greater than zero
our $only_writable = 0;
our $only_md5sum = 0;
our $dup_min_size = 1;
our $show_progress = 0;
our %pkg_files; # files registered in packages database specific for requested distro
our %emerge_files; # files registered in portage
our $currtime; # current time in seconds, used if seeking for old files
our $blacklist_fname;
our %blacklist; # blacklisted files
our $scoring_fname; # scoring rules
our $score_rules; # rules (perl code) to interpret
our $count = 0; #number of files found (used only if seek_limit>0)

our $bakfiles_re;

our @path;
our @excludedirs;
our @orphdirs;
our @orphexcludedirs;

our @localedirs = ('/usr/share/locale', '/usr/local/share/locale'); #parent locale directories (usually /usr/share/locale)
our @valid_locales; # country names considered valid (used) by the system
our $dpkgdir = '/var/lib/dpkg';

#
# checks if given absolute path is a part of any given path
sub inpaths
{
	my ($dir, $pathsref) = @_; # path to test, reference to array with paths
	my $path;
	foreach $path (@{$pathsref})
	{
		return 1 if $dir =~ /^\Q$path/;
	}
	0;
}

#
# appends trailing / to paths that miss it; removes non-existing dirs
sub sanitize_paths
{
	my @dirs = ();
	foreach (@_)
	{
		next if ! &checkdir($_);
		$_ .= '/' if ! /\/$/;
		push(@dirs, $_);
	}
	@dirs;
}

#
# calculates file checksum; basically a wrapper for Digest::MD5.
sub filesum
{
	my $file = shift;
	if (open(FILE, $file))
	{
		binmode(FILE);
		my $sum = Digest::MD5->new;
		$sum->addfile(*FILE);
		close(FILE);
		return $sum->hexdigest;
	}
	0;
}

#
# checks if file can be found in $PATH, just like 'which' does.
# assumes @path array is filled with paths.
sub which
{
	my $file = shift;
	return -e $file if $file =~ /^\//;
	foreach (@path)
	{
		return 1 if -e "$_/$file";
	}
	0;
}

sub escape_target
{
	my $target = shift;
	# backslash-escape special characters (: and \)
	$target =~ s/([,:\\])/\\$1/g;
	$target;
}

sub calc_score
{
	$_ = shift;
	return 0 if defined $blacklist{$_};
	my $score = 9;
	while (1)
	{
		eval $score_rules;
		last;
	}
	if ($score < 0)
	{
		$score = 0;
	}
	elsif ($score > 9)
	{
		$score = 9;
	}
	$score;
}

#
# main subroutine used by find. performs all requested tests on each file.
sub process_entry
{
	my $target; # symlink / menu entry / thumbnail target
	my $fname = $File::Find::name;
	my $inode = '';

	return if $only_writable && ! -w;
	return if &inpaths($fname, \@excludedirs);

	my $tag;

	if ( -l )
	{
		$tag = 's' if $seek_symlinks && ! -e readlink;
		if ($seek_symlinks && ! -e ($target = readlink))
		{
			$tag = 's'.&escape_target($target).',';
		}
	}
	elsif ( -f )
	{
		my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$fsize,$atime,$mtime,$ctime,$blksize,$bloks) = stat($fname);
		$inode = $dev.$ino if $seek_dup;
		$tag .= 'b' if $seek_bak && $fname =~ /$bakfiles_re/oi;
		$tag .= 'z' if $seek_empty_files && -f && -z;
		##$tag .= 'l' if $seek_locale && $fname =~ /\.mo$/ && $fname =~ /^$localedirs_re)/o && $File::Find::dir !~ /$valid_locales_re/o;
		##$tag .= 'U' if $seek_old && ($currtime - $atime) > $seek_old;
		if ($seek_thumbs && $fname =~ /\.thumbnails\/(?:fail|large|normal)\/[a-f0-9]{32}\.png$/)
		{
			{
				if (open(FILE, "identify -verbose $fname |"))
				{
					while (<FILE>)
					{
						if (/\s*Thumb::URI:\s+file:\/\/(.+)/)
						{
							last if -e $1;
							$tag .= 't'.&escape_target($1).',';
							last;
						}
					}
					close(FILE);
				}
			}
		}
		if ($seek_orph && &inpaths($File::Find::dir, \@orphdirs) && ! &inpaths($File::Find::dir, \@orphexcludedirs))
		{
			if (defined $pkg_files{$fname})
			{
				delete $pkg_files{$fname}; #free up some memory
			}
			else
			{
				$tag .= 'O';
			}
		}
		if ($seek_menus && /\.desktop$/)
		{
			my $exec = '';
			my $tryexec = '';
			if (open(FILE, $fname))
			{
				while (<FILE>)
				{
					if (/^\s*TryExec\s*=\s*([\w\/\.,-:]+)/)
					{
						$tryexec = $1;
						last;
					}
					elsif (/^\s*Exec\s*=\s*([\w\/\.,-:]+)/)
					{
						$exec = $1;
					}
				}
				close(FILE);
				$exec = $tryexec if $tryexec ne '';
				if ($exec ne '' && ! &which($exec))
				{
					$tag = 'm'.&escape_target($exec).',';
				}
			}
		}
		if ($seek_bexec && -x && &inpaths($fname, \@path))
		{
			if (open(LDD, "ldd $fname 2>&1 |"))
			{
				my @libs = ();
				while (<LDD>)
				{
					push (@libs, $1) if /\s*([^\s]+?)\s*=>\s*not found/;
				}
				close(LDD);
				if (scalar @libs)
				{
					$tag .= 'x'.&escape_target((sort @libs)[0]).','; #show only first missing library (after sorting)
				}
			}
		}
		if ($seek_dup && $fsize >= $dup_min_size)
		{
			my $add = 1;
			if (defined $sizes{$fsize})
			{
				my $md;
				if (scalar @{$sizes{$fsize}} == 1)
				{
					my $prevfile = @{$sizes{$fsize}}->[0];
					$mdsums{$prevfile} = &filesum($prevfile);
				}
				$mdsums{$fname} = ($md = &filesum($fname));
				foreach (@{$sizes{$fsize}})
				{
					if ($md eq $mdsums{$_} && ($only_md5sum || &compare($fname, $_) == 0))
					{
						my ($dev,$ino) = stat($_);
						my $oinode = $dev.$ino;
						my $duptag = $oinode.'d';
						if (! defined $dupref{$oinode})
						{
							print &calc_score($_)."$duptag\:$_\n";
							($seek_limit > 0) && (++$count >= $seek_limit) && exit(0);
						}
						$inode = $oinode;
						$dupref{$oinode} = 1;
						$tag .= 'd';
						$add = 0;
						last;
					}
				}
			}
			push(@{$sizes{$fsize}}, $fname) if $add == 1;
		}
	}
	if ($tag ne '')
	{
		print &calc_score($fname)."$inode$tag:$fname\n";
		($seek_limit > 0) && (++$count >= $seek_limit) && exit(0);
	}
}

#
# filter out globally excluded directories
sub preprocess_dir
{
	my $fname = $File::Find::dir;
	print '@:'.$fname."\n" if $show_progress;
	if ($seek_empty_dirs && scalar @_ <= 2)
	{
		if (! &inpaths($fname, \@excludedirs) && (!$only_writable || -w))
		{
			my ($dev,$ino,$mode) = stat($fname);
			my $inode = '';
			$inode = $dev.$ino if $seek_dup;
			print &calc_score($fname).$inode.'y:'.$fname."\n";
			($seek_limit > 0) && (++$count >= $seek_limit) && exit(0);
		}
	}
	grep { (! -d) || ! &inpaths("$File::Find::dir/$_", \@excludedirs); } @_;
}

#
# returns true if dir exists and is absolute path
sub checkdir
{
	my $dir = shift;
	return $dir =~ /^\// && -d $dir;
}

####################################################################################
#
# Execution starts here
#
####################################################################################

select(STDOUT);
$| = 1;

$bakfiles_re = '(?:\.bak|\.old|\.save|\.tmp|~|-|(?:\/var\/lib\/nvidia\/\d\d\d))$|(?:\.#prelink#\.)';

GetOptions('z' => \$seek_empty_files,
	   'y' => \$seek_empty_dirs,
	   'l' => \$seek_locale,
	   's' => \$seek_symlinks,
	   'm' => \$seek_menus,
	   't' => \$seek_thumbs,
	   'b' => \$seek_bak,
	   'x' => \$seek_bexec,
	   'rpm' => \$seek_orpms,
	   'deb' => \$seek_odebs,
	   ##'portage' => \$seek_oemerge,
	   'd' => \$seek_dup,
	   ##'unused=i' => \$seek_old,
	   'limit=i' => \$seek_limit,
	   'comp' => \$only_md5sum,
   	   'writable' => \$only_writable,
	   'exclude=s' => \@excludedirs,
	   'orphans-exclude=s' => \@orphexcludedirs,
	   'orphans-include=s' => \@orphdirs,
	   'progress' => \$show_progress,
	   'blacklist=s' => \$blacklist_fname,
	   'scoring=s' => \$scoring_fname
   );

#
# filter-out non-existing dirs from provided paths
@excludedirs = grep { checkdir $_ } @excludedirs;
@orphdirs =  grep { checkdir $_ } @orphdirs;
@orphexcludedirs = grep { checkdir $_ } @orphexcludedirs;
#
# set default include / exclude dirs if no paths
@excludedirs = ('/proc', '/sys', '/mnt', '/media', '/cdrom', '/dev') if ! scalar @excludedirs;
@orphdirs = ('/bin', '/sbin', '/boot', '/etc', '/usr') if ! scalar @orphdirs;
@orphexcludedirs = ('/tmp') if ! scalar @orphexcludedirs;
@path = split(':', $ENV{'PATH'});

@excludedirs = &sanitize_paths(@excludedirs);
@orphdirs = &sanitize_paths(@orphdirs);
@orphexcludedirs = &sanitize_paths(@orphexcludedirs);
@path = &sanitize_paths(@path);

if ($seek_thumbs && ! which('identify'))
{
	$seek_thumbs = 0;
}

if ($blacklist_fname ne '')
{
	if (open(BLIST, $blacklist_fname))
	{
		while (<BLIST>)
		{
			next if /(?:^\s*#)|(?:^\s*$)/;
			map { $blacklist{$_} = 1; } glob;
		}
		close(BLIST);
	}
	else 
	{
		print STDERR "Can't open blacklist file. Ignored.\n";
	}
}

if ($scoring_fname ne '')
{
	if (open(SCORE, $scoring_fname))
	{
		$score_rules = '';
		while (<SCORE>)
		{
			next if /(?:^\s*#)|(?:^\s*$)/;
			$score_rules .= $_;
		}
		close(SCORE);
	}
	else
	{
		print STDERR "Can't open score rules. Ignored.\n";
	}
}

#if ($seek_old)
#{
#	$currtime = time();
#	$seek_old = $seek_old * 24 * 3600; # convert days to seconds
#}
   
#if ($seek_locale)
#{
	#
	# build array of valid locale names
#	foreach (@user_locales, $ENV{'LANG'}, 'C')
#	{
#		push(@valid_locales, "\/$1") if /^([a-zA-Z_]+)/;
#	}
#	$valid_locales_re = join('|', @valid_locales);
#}

my $start_dir = shift;
if ($start_dir eq '' || ! -d $start_dir)
{
	print STDERR "Please specify existing path as a starting directory\n";
	exit(1);
}

#######################################################################
# RPM-based distros support
#######################################################################
if ($seek_orpms && which('rpm') && open(RPMFILES, 'rpm -qal|'))
{
	print "\@:Creating list of installed files...\n" if $show_progress;
	while (<RPMFILES>) 
	{
		chomp;
		$pkg_files{$_} = 1;
	}
	close(RPMFILES);
	$seek_orph = 1;
}

#######################################################################
# GENTOO portage support with equery
#######################################################################
if ($seek_oemerge && which('equery') && open(EMERGEFILES, 'equery -C list -i|'))
{
	while (<EMERGEFILES>)
	{
		if (/^\w+?-\w+?\/(.+?)-\d/)
		{
			if (open(QUERY, "equery -C files $1|"))
			{
				while (<QUERY>)
				{
					next if ! /^\//;
					chomp;
					if (/^\s->\s/) # match symlink entry
					{
						$pkg_files{$1} = 1;
					}
					else
					{
						$pkg_files{$_} = 1;
					}
				}
				close(QUERY);
			}
		}
	}
	close(EMERGEFILES);
	$seek_orph = 1;
}

#######################################################################
# Debian support with dpkg
#######################################################################
if ($seek_odebs && -d $dpkgdir && -r $dpkgdir)
{
	print "\@:Creating list of installed files...\n" if $show_progress;
	my @lists = glob("$dpkgdir/info/*.list");
	my @conff = glob("$dpkgdir/info/*.conffiles");
	my $list;
	foreach $list (@lists, @conff, "$dpkgdir/diversions")
	{
		last if !open(DEBFILES, $list);
		while (<DEBFILES>) 
		{
			chomp;
			$pkg_files{$_} = 1 if /^\//;
		}
		close(DEBFILES);
	}
	delete $pkg_files{'/.'};
	$seek_orph = 1;
}

#######################################################################
# start seeking
#######################################################################
$ENV{'LC_LANG'} = 'C';
find({wanted => \&process_entry, preprocess => \&preprocess_dir}, $start_dir);

