#!/usr/bin/perl -w use strict; my $my_url = qr/(\w+\.)?debain\.org/io; # Change according to your site. my $spam_get = qr/(\#|^http%)/io; my $spam_url = qr/(\#|^http%)/io; my $spam_id = qr/^[^\)]$/io; my $html_url = qr/(php|php5|py|pl|cgi|html?|\/)$/io; my %requests = (); my %html_requests = (); my %spammers = (); my %referrers = (); my @lines = qw//; sub extract { my ($line) = @_; # Matches against default Apache log entries. $line =~ /^(\S+)\s-\s-\s[^\"]+\"([^\"]+) HTTP[^\"]+\"\s\d+\s\d+\s\"([^\"]+)"\s\"([^\"]+)\"/o; return ($1, $2, $3, $4); } # Identify spammers. while (<>) { my ($ip, $get, $ref, $id) = extract($_); next unless defined $ip; next if $ip !~ /^\d+\.\d+\.\d+\.\d+$/; # Crap emergency. FIXME: Check those. push(@lines, $_); my $get_novar = $get; $get_novar =~ s/\?.*//; #print "Line: $_\n"; #print "Extracted: $ip, $get, $ref, $id\n"; # Count the number of URLs opened. $requests{$ip} = [] unless exists $requests{$ip}; $html_requests{$ip} = [] unless exists $html_requests{$ip}; push(@{$requests{$ip}}, $get); push(@{$html_requests{$ip}}, $get) if $get_novar =~ /$html_url/; # Collect and skip spammers. next if exists $spammers{$ip}; $spammers{$ip} = 1, next if $get =~ /$spam_get/; $spammers{$ip} = 1, next if $ref =~ /$spam_url/; $spammers{$ip} = 1, next if $id =~ /$spam_id/; } # Collect referrers. for my $line (@lines) { my ($ip, $get, $ref, $id) = extract($line); # If a client requested only HTML files then that means # a) The HTML page that he opened has no other document embedded, # such as an image or a style sheet. I consider this a rare # enough scenario so that the request can be ignored. # b) The client opened a HTML page but not the embedded documents. # I consider him a spammer in that case. #print "IP1: $ip\n"; my $n_requests = scalar(@{$requests{$ip}}); my $n_html_requests = scalar(@{$html_requests{$ip}}); next if $n_requests == $n_html_requests; next if exists $spammers{$ip}; next if $ref =~ /$my_url/; # Count the matches in a hash. $referrers{$ref}++; } # Helps sorting the result. sub ascending_referrers { $referrers{$a} <=> $referrers{$b}; } # Print the result. for my $key (sort ascending_referrers (keys %referrers)) { my $matches = $referrers{$key}; print "$matches\t$key\n"; }