
# Copyright (C) 2014 Simon Kainz <simon@familiekainz.at>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# he Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# On Debian GNU/Linux systems, the complete text of the GNU General
# Public License can be found in `/usr/share/common-licenses/GPL-2'.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, you can find it on the World Wide
# Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free
# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301, USA.



use strict;
use warnings;


package DUCK;
my $VERSION ='0.7';
my $COPYRIGHT_YEAR ='2014';


use String::Similarity;
use File::Which;
use WWW::Curl::Easy;
use strict;
use IPC::Open3;
use IO::Select;
use Net::DNS;
use Mail::Address;
use Data::Dumper;

my $callbacks;

my $self;
my $helpers={
    svn =>0,
    bzr =>0,
    git =>0,
    darcs =>1, # This works always as it uses WWW::Curl::Easy
    hg => 0,
    browser =>1 # This works always as we use WWW::Curl::Easy;
};


my $cli_options;

my $tools=
{
    git => {
	cmd => 'git',
	args => ['ls-remote','%URL%']
    },
	    
    hg =>{
		cmd => 'hg',
		args => ['id','%URL%']
	},

    bzr => {
		cmd => 'bzr',
		args => ['-Ossl.cert_reqs=none','log','%URL%']
    },

    svn => {
	cmd => 'svn',
	args => ['--non-interactive','--trust-server-cert','info','%URL%']
}
	    
	     
};

sub version
{
    return $VERSION;
}

sub copyright_year
{
    return $COPYRIGHT_YEAR;
}

sub new {
    my $class = shift;
     $self = {};
     bless $self, $class;
    $self->__find_helpers();


    foreach (keys %$tools)
    {
	$tools->{$_}->{'args_count'}=scalar @{$tools->{$_}->{'args'}};
    }
    return $self;
}

sub cb()
{
    $callbacks=
    {
	
	"Vcs-Browser" =>\&browser,
	"Vcs-Darcs" =>\&darcs,
	"Vcs-Git" =>\&git,
	"Vcs-Hg" =>\&hg,
	"Vcs-Svn" =>\&svn,
        "Vcs-Bzr" =>\&bzr,
	"Homepage" => \&browser,
	"URL" => \&browser,
	"Email" => \&email,
	"Maintainer" => \&maintainer,
	"Uploaders" => \&uploaders,
	"Try-HTTPS" => \&try_https,
        "SVN" => \&svn
	    
    };
    
    return $callbacks;
}

sub setOptions()
{
    shift;
    my ($ke,$va)=@_;
    $cli_options->{$ke}=$va;
}

sub __find_helpers()
{

    $helpers->{git}=1 unless !defined (which('git'));
    $helpers->{svn}=1 unless !defined (which('svn'));
    $helpers->{hg}=1 unless !defined (which('hg'));
    $helpers->{bzr}=1 unless !defined (which('bzr'));
}

sub getHelpers()
{ return $helpers; }

sub git()
{
    my ($url)=@_;

    my @urlparts=split(/\s+/,$url);
    
    if ($tools->{'git'}->{'args_count'})
    {
    splice(@{$tools->{'git'}->{'args'}},$tools->{'git'}->{'args_count'});
    }


    if ($urlparts[1])
    {
	if ($urlparts[1] eq "-b" && $urlparts[2])
	{
	    push(@{$tools->{'git'}->{'args'}},'-b '.$urlparts[2]);
	}
    }
    return __run_helper('git',$urlparts[0]);
}

sub bzr()
{
    my ($url)=@_;
    return __run_helper('bzr',$url);
}


sub hg()
{
    my ($url)=@_;
    return __run_helper('hg',$url);
}

sub svn()
{
    my ($url)=@_;
	$ENV{SVN_SSH}='ssh -o BatchMode=yes';
    return __run_helper('svn',$url);
}

sub browser()
{

    my $enforce=1;

   my ($url)=@_;
    
    $url =~ s/\.*$//g;

    if (! ( $cli_options->{'no-https'}))
	{
	    $cli_options->{'no-https'}=1;
	}

    if ( ($cli_options->{'no-https'}==0) && (!($url =~ m/https:\/\//i )) )
    {
	return try_https($url);
    }
    else
    {
	
	
    return __run_browser($url);
    }
}




sub try_https()
{
    my $similarity_th=0.9;
    my ($url)=@_;
    $url =~ s/\.*$//g;

    my $res;

    my $erghttp= __run_browser($url);

    if ($erghttp->{'retval'} >0 ) {return $erghttp;}
    my $secure_url= $url; 
    $secure_url=~ s/http:/https:/g;


    my $erghttps= __run_browser($secure_url);
    
    if ($erghttps->{'retval'} >0 )
    {
	# error with https, so do not suggest switching to https, report only http check results
	return $erghttp;
    }

    # otherwise check similarity, and report if pages are (quite) the same 

    if ($erghttps->{'retval'} == 0)
    {
	# https worked, now try to find out if pages match

	my $similarity= similarity $erghttp->{'body'}, $erghttps->{'body'};


	if ($similarity > $similarity_th)
	{
	    $res->{'retval'}=2;
	    $res->{'response'}="The web page at $url works, but is also available via $secure_url, please consider switching to HTTPS urls.";
	    return $res;
	
	}
	
    } else
    {
	# report nothing
	$res->{'retval'}=0;
	return $res;
       
    }
	




    $res->{'retval'}=0;
    $res->{'response'}="lolz";
    $res->{'url'}=$url;
    return $res;

}

sub darcs()
{
    my ($url)=@_;
    my $darcsurltemp=$url;
    $darcsurltemp =~ s/\/$//;
    $darcsurltemp.='/_darcs/hashed_inventory';
    return __run_browser($darcsurltemp);
}




sub uploaders()
{
    my ($line_uploaders)=@_;
    $line_uploaders =~ s/\n/ /g;
    my @emails;

    if ($line_uploaders =~ /@/)
    {
	@emails=Mail::Address->parse($line_uploaders);
    }
    my $res;
#    print Dumper @emails;
    foreach my $email(@emails)
    {
	my $es=$email->address();
	my $r=check_domain($es);
    
	if ($r->{retval}>0)
	{
	    if (!$res->{retval})
	    {
		$res=$r;
	    } else
	    {
		$res->{retval}=$r->{retval};
		$res->{response}.="\n".$r->{response};
		$res->{url}="foo";
	    }
	    
	}
	
    }
    
    if (!$res->{retval})
    {
	$res->{'retval'}=0;
	$res->{'response'}="";
	$res->{'url'}=$line_uploaders;
    }
    return $res;

}

sub maintainer()
{
    my ($email)=@_;
     return check_domain($email);
}



sub email()
{
    my ($email) =@_;
    return check_domain($email);
}


sub __run_browser {


    my $certainty;
    my @SSLs=(CURL_SSLVERSION_DEFAULT,
      CURL_SSLVERSION_TLSv1,
      CURL_SSLVERSION_SSLv2,
      CURL_SSLVERSION_SSLv3,
      CURL_SSLVERSION_TLSv1_0,
      CURL_SSLVERSION_TLSv1_1,
      CURL_SSLVERSION_TLSv1_2);

    my ($url,$return_ref)=@_;
    
    #check if URL is mailto: link
    
    if ($url =~/mailto:\s*.+@.+/)
    {
    return check_domain($url);
    }
    
    my $curl = WWW::Curl::Easy->new;
    
    my @website_moved_regexs=('new homepage','update your links','we have moved','buy this domain','domain .* for sale', 'order this domain');
  

    my @website_moved_whitelist=('anonscm.debian.org.*duck.git');
    
    $curl->setopt(CURLOPT_HEADER,0);
    $curl->setopt(CURLOPT_SSL_VERIFYPEER,0);
    $curl->setopt(CURLOPT_SSL_VERIFYHOST,0);
    $curl->setopt(CURLOPT_CERTINFO,0);
    $curl->setopt(CURLOPT_FOLLOWLOCATION,1);
    $curl->setopt(CURLOPT_SSL_CIPHER_LIST,'ALL');
    $curl->setopt(CURLOPT_MAXREDIRS,10);     
    $curl->setopt(CURLOPT_TIMEOUT,60);
    $curl->setopt(CURLOPT_USERAGENT,'Mozilla/5.0 (X11; Linux x86_64; rv:10.0.4) Gecko/20100101 Firefox/10.0.4 Iceweasel/10.0.4');
    $curl->setopt(CURLOPT_URL, $url);

    my $response_body;
    my $response_code;
    my $retcode;
    my $response;

    foreach my $s (@SSLs)
    {
    $curl->setopt(CURLOPT_WRITEDATA,\$response_body);
    $curl->setopt(CURLOPT_SSLVERSION,$s);
    # Starts the actual request
    $retcode = $curl->perform;
    $response_code = $curl->getinfo(CURLINFO_HTTP_CODE);
    $response=$curl->strerror($retcode)." ".$curl->errbuf."\n";

    if ($retcode == 35) { next;}
    if ($retcode == 56) {next;}
    last;
    }

    # Looking at the results...
    my $status=0;
    my $disp=0;

 
    if ($retcode == 0) # no curl error, but maybe a http error
    {
	#default to error
	$status=1;
	$disp=1;

	#handle ok cases, 200 is ok for sure
	if ($response_code ==200 )
	{
	    $status=0;
	    $disp=0;
	}


	if ($response_code ==226 )
	{
	    $status=0;
	    $disp=0;
	}

	if ($response_code ==227 )
	{
	    $status=0;
	    $disp=0;
	}

	if ($response_code ==302 ) #temporary redirect is ok
	{
	    $status=0;
	    $disp=0;
	}

	if ($response_code ==403)
	{
	    ## special case for sourceforge.net sites
	    ## sourceforge seems to always return correct pages wit http code 40.
	    
	    if ( $url =~ m/(sourceforge|sf).net/i)
	    {
		# print "Sourceforge site, so hande special!!";
		$status=0;
		$disp=0;
	    }


	}
	my $whitelisted=0;

	foreach my $whitelist_url (@website_moved_whitelist)
	{
	    if ( $url =~ m/$whitelist_url/i)
	
	    {$whitelisted=1;}

	}
	if ($whitelisted == 0)
	  {  
	      foreach my $regex (@website_moved_regexs)
	      {
		  #   print "$regex\n";
		  if ($response_body =~ m/$regex/i )
		  {
		      $disp=2;
		      $response.="Website seems to be outdated, is probably a parked domain or for sale. Please update your links!\nMatching regular expression: m/".$regex."/i";
		      $certainty="wild-guess";
		      last;
		  }
	      }
	  }
	
    }
    else {  # we have a curl error, so we show this entry for sure
	$status=1;
	$disp=1;
    }


    my $ret;
    $ret->{'retval'}=$disp;
    $ret->{'response'}="Curl:$retcode HTTP:$response_code $response";
    $ret->{'url'}=$url;
    $ret->{'body'}=$response_body;
    $ret->{'certainty'}=$certainty;
    return $ret; 
}



sub __run_helper {
    
    my ($tool,$url)=@_;
   return undef unless $helpers->{$tool} == 1;
   return undef unless defined $tools->{$tool};

   my @args=@{$tools->{$tool}->{'args'}};

   for(@args){s/\%URL\%/$url/g}

    my $pid;
    my $command;
    my $timeout;


    if ($cli_options->{'timeout'})
    {

	my $timeout_value=60;
	if ( ( $cli_options->{'timeout_seconds'} ))
	    {
		$timeout_value=$cli_options->{'timeout_seconds'};
		$timeout_value =~ s/[^0-9]//;
	    }
	unshift @args,$tools->{$tool}->{'cmd'};
	unshift @args,$timeout_value."s";
	$command="/usr/bin/timeout";
	$pid=open3(\*WRITE,\*READ,0,$command,@args);
    
    }
    else
    {
    $pid=open3(\*WRITE,\*READ,0,$tools->{$tool}->{'cmd'},@args);

    }

   my @results = <READ>;
   waitpid ($pid,0);
   close READ;

   my $retval=$?;
   my $ret;
   $ret->{'retval'}=$retval;
   $ret->{'response'}=join("",@results);
   $ret->{'url'}=$url;
   return $ret;
}

sub check_domain($)
		 {


    
		     my $res = Net::DNS::Resolver->new;
		     my ($email) = @_;
		     my @emails=Mail::Address->parse($email);
		     $email=$emails[0]->address();
#		     $email=$email->address();
		     my @domain = ( $email =~ m/^[^@]*@([^?^&^>]*).*/);

		     my @queries=('MX','A','AAAA');
		     my @results;
		     my $iserror=1;
		     foreach my $query (@queries)
		     {
			 my $q=$res->query($domain[0],$query);
			 
			 if ($q)
			 {
			     my @answers=$q->answer;
			     my $mxcount=scalar @answers;
			     push (@results,$mxcount." ".$query." entries found.");
			     $iserror=0;
			     last;
			 } else
			 {
			     push (@results,"$email: No ".$query." entry found.");
			 }
			 
		     }
		     
		     
		     my $ret;
		     $ret->{'retval'}=$iserror;
		     $ret->{'response'}=join("\n",@results);
		     $ret->{'url'}=$email;
		     return $ret;
		     
		     
		 }





1;
