#!/usr/bin/perl -w
# md5_get.pl - Don Yang (uguu.org)
#
# Retrieve a file using HTTP and compute MD5.
#
#    ./md5_get.pl <url> [output file]
#
# If output file is not specified, use last part of URL as file name.
#
# Not a full HTTP client by any means.  If the server sends anything
# unexpected, the header will be printed and script will be terminated
# immediately.
#
# This is somewhat okay for small files, but on large files, there is
# a ~10x slowdown trying to read packets and compute MD5 at the same
# time.  If you have enough memory to keep the file in system cache,
# it's almost always faster to run wget+md5sum serially.
#
# 09/22/07

use strict;
use warnings;
use Digest::MD5;
use Socket;

# Largest number of bytes requested from the socket in one recv() call.
use constant READ_BUFFER_SIZE => 0x10000;

# Give up if the reply header has not ended after this many bytes.
use constant MAX_HEADER_SIZE => 0x10000;

# Split a reply buffer at the first blank line (CRLF CRLF, LF LF or CR CR).
# Returns (header including the terminator, remaining body bytes), or an
# empty list if the header terminator has not been received yet.
sub split_header
{
   my ($buffer) = @_;

   return unless $buffer =~ m{\A(.*?(?:\015\012\015\012|\012\012|\015\015))}s;
   my $head = $1;
   return ($head, substr($buffer, length($head)));
}

die "$0 <url> [output file]\n" unless @ARGV == 1 || @ARGV == 2;

# Parse URL
my ($host, $port, $uri);
if( $ARGV[0] =~ m{^http://([^/]+):(\d+)(/.*)$} )
{
   $host = $1;
   $port = $2;
   $uri = $3;
}
elsif( $ARGV[0] =~ m{^http://([^/]+)(/.*)$} )
{
   $host = $1;
   $port = 80;
   $uri = $2;
}
else
{
   die "Can not parse $ARGV[0] as an URL\n";
}

# Set output
my $outfile = "";
if( @ARGV == 2 )
{
   $outfile = $ARGV[1];
}
else
{
   unless( $uri =~ m{.*/([^/]+)} && $1 ne "." && $1 ne ".." )
   {
      die "Can not get output file name from $uri\n";
   }
   $outfile = $1;
   print "Output file set to: $outfile\n";
}

# Resolve server address
my $ip;
if( $host !~ /^\d+\.\d+\.\d+\.\d+$/ )
{
   unless( defined($ip = gethostbyname($host)) )
   {
      die "Can not resolve $host\n";
   }
   print "$host -> ", inet_ntoa($ip), "\n";
}
else
{
   unless( defined($ip = inet_aton($host)) )
   {
      die "$host is not a valid address\n";
   }
}

# Open output.  Three-argument open keeps special characters in the file
# name (leading ">", "|", surrounding spaces) from being interpreted.
open my $file_handle, '>', $outfile or die "Can not open $outfile: $!\n";
binmode($file_handle);

# Connect to server
my $server;
socket($server, PF_INET, SOCK_STREAM, getprotobyname("tcp"))
   or die "socket: $!\n";
connect($server, sockaddr_in($port, $ip))
   or die "connect: $!\n";
binmode($server) or die "binmode: $!\n";

# Send request.  The Host header carries the port when it is not the
# HTTP default, so name-based virtual hosts on other ports still match.
my $host_header = $port == 80 ? $host : "$host:$port";
my $request = "GET $uri HTTP/1.0\015\012" .
              "Host: $host_header\015\012" .
              "Connection: close\015\012\015\012";
defined(send($server, $request, 0)) or die "send: $!\n";

# Get reply
my $md5 = Digest::MD5->new;
my $expected_size;                # From Content-Length, if the server sent one
my $actual_size = 0;              # Body bytes written so far
my $want = READ_BUFFER_SIZE;      # Bytes to request on the next recv()
my $header;                       # Undefined until the full header is seen
my $pending = "";                 # Header bytes collected so far
my $start_time = time;

while( 1 )
{
   my $data = "";
   unless( defined(recv($server, $data, $want, 0)) )
   {
      warn "\nrecv failed: $!\n";
      last;
   }

   # $want is always positive, so an empty read means the server has
   # closed the connection.
   my $at_eof = ($data eq "");

   unless( defined($header) )
   {
      # The header may arrive split across several packets: keep
      # collecting until the blank line that terminates it shows up.
      $pending .= $data;
      my ($head, $body) = split_header($pending);
      unless( defined($head) )
      {
         if( $at_eof || length($pending) > MAX_HEADER_SIZE )
         {
            die "Unexpected end of header\n";
         }
         next;
      }
      $header = $head;
      $data = $body;
      $pending = "";
      print "Reply:\n$header\n";

      # Check header
      unless( $header =~ m{^HTTP/1\.[01] (\d+)} )
      {
         die "Unexpected protocol, expected HTTP/1.0 or HTTP/1.1\n";
      }
      my $status = $1;
      unless( $status == 200 )
      {
         die "Unexpected status $status, expected 200\n";
      }
      if( $header =~ /\ncontent-length:\s*(\d+)/is )
      {
         $expected_size = $1;
         print "Expected file size: $expected_size\n";
      }
   }

   last if $at_eof;

   # Never store more than the server promised.
   if( defined($expected_size) )
   {
      my $remaining = $expected_size - $actual_size;
      $data = substr($data, 0, $remaining) if length($data) > $remaining;
   }

   if( length($data) > 0 )
   {
      print {$file_handle} $data or die "Can not write $outfile: $!\n";
      $md5->add($data);
      $actual_size += length($data);
      print "\rrecv $actual_size";
   }

   # Size the next read.  Without Content-Length, $want stays at
   # READ_BUFFER_SIZE and the loop ends when the server closes.
   if( defined($expected_size) )
   {
      my $remaining = $expected_size - $actual_size;
      last if $remaining <= 0;
      $want = $remaining < READ_BUFFER_SIZE ? $remaining : READ_BUFFER_SIZE;
   }
}

# Print final checksum
my $elapsed = time - $start_time;
print "\nReceived $actual_size bytes in $elapsed seconds\n",
      "MD5: ", $md5->hexdigest, "\n";
if( defined($expected_size) && $actual_size != $expected_size )
{
   warn "Warning: expected $expected_size bytes, received $actual_size\n";
}

# Cleanup
shutdown($server, 2);
close($server);
close($file_handle) or die "Can not close $outfile: $!\n";