The current version of imdb.pl included with MythTV is not downloading hi-res poster images for movies due to a site change at IMDB’s end. I have edited imdb.pl to include a fix as detailed in https://bugs.launchpad.net/mythbuntu/+bug/256027 and included below.
To get it going, back up your old imdb.pl file in /usr/share/mythtv/mythvideo/scripts/ and replace it with:
#!/usr/bin/perl -w
#
# This perl script is intended to perform movie data lookups based on
# the popular www.imdb.com website
#
# For more information on MythVideo's external movie lookup mechanism, see
# the README file in this directory.
#
# Author: Tim Harvey (tharvey AT alumni.calpoly DOT edu)
# Modified: Andrei Rjeousski
# v1.1
# - Added amazon.com covers and improved handling for imdb posters
# v1.2
# - when searching amazon, try searching for main movie name and if nothing
#   is found, search for informal name
# - better handling for amazon posters, see if movie title is a substring
#   in the search results returned by amazon
# - fixed redirects for some movies on impawards
# v1.3
# - fixed search for low res images (imdb changed the page layout)
# - added cinemablend poster search
# - added nexbase poster search
# - removed amazon.com searching for now
# changes:
# 10-26-2007:
#   Added release date (in ISO 8601 form) to output
# 9-10-2006: Anduin Withers
#   Changed output to utf8

use strict;
use warnings;

use LWP::Simple;      # libwww-perl providing simple HTML get actions
use HTML::Entities;
use URI::Escape;

# DateTime::Format::Strptime is optional; without it ReleaseDate stays empty.
my $has_date_format = eval { require DateTime::Format::Strptime; 1 } ? 1 : 0;

# Getopt::Std's getopts() stores each switch in the package variable $opt_X.
use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_M $opt_P);
use Getopt::Std;

my $title   = "IMDB Query";
my $version = "v1.3.5";
my $author  = "Tim Harvey, Andrei Rjeousski";

# Countries tried, in order, when a runtime is listed per country.
my @countries = qw(USA UK Canada Japan);

binmode(STDOUT, ":utf8");

# display usage (and exit with status -1)
sub usage {
    print "usage: $0 -hdrviMPD [parameters]\n";
    print " -h help\n";
    print " -d debug\n";
    print " -r dump raw query result data only\n";
    print " -v display version\n";
    print " -i display info\n";
    print "\n";
    print " -M [options] <query> get movie list\n";
    print " some known options are:\n";
    print " type=[fuzy] looser search\n";
    print " from_year=[int] limit matches to year\n";
    print " to_year=[int] limit matches to year\n";
    print " sort=[smart] ??\n";
    print " tv=[no|both|only] limits between tv and movies\n";
    print " Note: multiple options must be separated by ';'\n";
    print " -P <movieid> get movie poster\n";
    print " -D <movieid> get movie data\n";
    exit(-1);
}

# display 1-line of info that describes the version of the program
sub version {
    print "$title ($version) by $author\n"
}

# display 1-line of info that can describe the type of query used
sub info {
    print "Performs queries using the www.imdb.com website.\n";
}

# display detailed help (usage() terminates the program)
sub help {
    version();
    info();
    usage();
}

# returns 'str' with leading and trailing whitespace removed
sub trim {
    my ($str) = @_;
    $str =~ s/^\s+//;
    $str =~ s/\s+$//;
    return $str;
}

# returns text within 'data' between 'beg' and 'end' matching strings
# (case-insensitive search, HTML entities decoded); returns "" when either
# marker is missing or when 'data' is undefined (e.g. a failed download)
sub parseBetween {
    my ($data, $beg, $end) = @_; # grab parameters

    return "" unless defined $data;

    my $ldata = lc($data);

    my $beg_pos = index($ldata, lc($beg));
    return "" if $beg_pos == -1;

    my $start  = $beg_pos + length($beg);
    my $finish = index($ldata, lc($end), $start);
    return "" if $finish == -1;

    my $result = substr($data, $start, $finish - $start);
    # return w/ decoded numeric character references
    # (see http://www.w3.org/TR/html4/charset.html#h-5.3.1)
    decode_entities($result);
    return $result;
}

# get Movie Data: fetch the IMDB title page for 'movieid' and print the
# "Field:value" lines MythVideo expects
sub getMovieData {
    my ($movieid) = @_; # grab movieid parameter
    if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

    my $name_link_pat = qr'<a href="/name/[^"]*">([^<]*)</a>'m;

    # get the search results page
    my $request = "http://www.imdb.com/title/tt" . $movieid . "/";
    if (defined $opt_d) { printf("# request: '%s'\n", $request); }
    my $response = get($request);
    # get() returns undef on failure; fall back to an empty page so every
    # field is still printed (empty) without undefined-value warnings
    $response = "" unless defined $response;
    if (defined $opt_r) { printf("%s", $response); }

    # parse title and year
    my $year = "";
    my $title = parseBetween($response, "<title>", "</title>");
    if ($title =~ m#(.+) \((\d+).*\)#) { # Note some years have a /II after them?
        $title = $1;
        $year = $2;
    }
    elsif ($title =~ m#(.+) \(\?\?\?\?\)#) {
        $title = $1;
    }

    # parse director
    my $data = parseBetween($response, ">Director:</h5>", "</div>");
    if (!length($data)) {
        $data = parseBetween($response, ">Directors:</h5>", "</div>");
    }
    my $director = join(",", ($data =~ m/$name_link_pat/g));

    # parse writer
    # (Note: this takes the 'first' writer, may want to include others)
    $data = parseBetween($response, ">Writers <a href=\"/wga\">(WGA)</a>:</h5>", "</div>");
    if (!length($data)) {
        $data = parseBetween($response, ">Writer:</h5>", "</div>");
    }
    if (!length($data)) {
        $data = parseBetween($response, ">Writers:</h5>", "</div>");
    }
    my $writer = join(",", ($data =~ m/$name_link_pat/g));

    # parse release date
    my $releasedate = '';
    if ($has_date_format) {
        my $dtp = DateTime::Format::Strptime->new(pattern => '%d %b %Y',
                                                  on_error => 'undef');
        my $dt = $dtp->parse_datetime(
            parseBetween($response, ">Release Date:</h5> ", "<a "));
        if (defined($dt)) {
            $releasedate = $dt->strftime("%F");
        }
    }

    # parse plot
    my $plot = parseBetween($response, ">Plot Outline:</h5> ", "</div>");
    if (!$plot) {
        $plot = parseBetween($response, ">Plot Summary:</h5> ", "</div>");
    }
    if (!$plot) {
        $plot = parseBetween($response, ">Plot:</h5>", "</div>");
    }
    if ($plot) {
        # replace name links in plot (example 0388795)
        $plot =~ s/$name_link_pat/$1/g;

        # replace title links
        my $title_link_pat = qr!<a href="/title/[^"]*">([^<]*)</a>!m;
        $plot =~ s/$title_link_pat/$1/g;

        # plot ends at first remaining link
        my $plot_end = index($plot, "<a ");
        if ($plot_end != -1) {
            $plot = substr($plot, 0, $plot_end);
        }
        $plot = trim($plot);
    }

    # parse user rating
    my $userrating = parseBetween($response, ">User Rating:</b>", "</b>");
    $userrating = parseBetween($userrating, "<b>", "/");

    # parse MPAA rating
    my $ratingcountry = "USA";
    my $movierating = trim(parseBetween($response, ">MPAA</a>:</h5>", "</div>"));
    if (!$movierating) {
        $movierating = parseBetween($response, ">Certification:</h5>", "</div>");
        $movierating = parseBetween($movierating, "certificates=$ratingcountry", "/a>");
        $movierating = parseBetween($movierating, ">", "<");
    }

    # parse movie length
    my $rawruntime = trim(parseBetween($response, ">Runtime:</h5>", "</div>"));
    my $runtime = trim(parseBetween($rawruntime, "", " min"));
    for my $country (@countries) {
        last if ($runtime =~ /^-?\d/);
        $runtime = trim(parseBetween($rawruntime, "$country:", " min"));
    }

    # parse cast
    # Note: full cast would be from url:
    # www.imdb.com/title/<movieid>/fullcredits
    my $cast = "";
    $data = parseBetween($response, "Cast overview, first billed only", "/table>");
    if (!$data) {
        $data = parseBetween($response, "Series Cast Summary", "/table>");
    }
    if (!$data) {
        $data = parseBetween($response, "Complete credited cast", "/table>");
    }
    if ($data) {
        $cast = join(',', ($data =~ m/$name_link_pat/g));
        $cast = trim($cast);
    }

    # parse genres
    my $lgenres = "";
    $data = parseBetween($response, "<h5>Genre:</h5>","</div>");
    if ($data) {
        my $genre_pat = qr'/Sections/Genres/(?:[a-z ]+/)*">([^<]+)<'im;
        $lgenres = join(',', ($data =~ /$genre_pat/g));
    }

    # parse countries
    $data = parseBetween($response, "Country:</h5>","</div>");
    my $country_pat = qr'/Sections/Countries/[A-Z]+/">([^<]+)</a>'i;
    my $lcountries = trim(join(",", ($data =~ m/$country_pat/g)));

    # output fields (these field names must match what MythVideo is looking for)
    print "Title:$title\n";
    print "Year:$year\n";
    print "ReleaseDate:$releasedate\n";
    print "Director:$director\n";
    print "Plot:$plot\n";
    print "UserRating:$userrating\n";
    print "MovieRating:$movierating\n";
    print "Runtime:$runtime\n";
    print "Writers: $writer\n";
    print "Cast:$cast\n";
    print "Genres: $lgenres\n";
    print "Countries: $lcountries\n";
}

# dump Movie Poster: print one line holding the best poster URL found for
# 'movieid' (an empty line when none is found).  Sources are tried in order:
# impawards, MoTechPosters, nexbase, cinemablend, posters.imdb.com and
# finally the low-res image on the IMDB title page.
sub getMoviePoster {
    my ($movieid) = @_; # grab movieid parameter
    if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

    # get the search results page
    my $request = "http://www.imdb.com/title/tt" . $movieid . "/posters";
    if (defined $opt_d) { printf("# request: '%s'\n", $request); }
    my $response = get($request);
    # nothing is printed when the posters page itself cannot be fetched
    if (!defined $response) { return; }
    if (defined $opt_r) { printf("%s", $response); }

    my $uri = "";

    # look for references to impawards.com posters - they are high quality
    my $site = "http://www.impawards.com";
    my $impsite = parseBetween($response, "<a href=\"".$site, "\">");

    if ($impsite) {
        $impsite = $site . $impsite;

        if (defined $opt_d) { print "# Searching for poster at: ".$impsite."\n"; }
        my $impres = get($impsite);
        $impres = "" unless defined $impres; # failed download: treat as empty
        if (defined $opt_d) { printf("# got %i bytes\n", length($impres)); }
        if (defined $opt_r) { printf("%s", $impres); }

        # making sure it isnt redirect
        $uri = parseBetween($impres, "0;URL=..", "\">");
        if ($uri ne "") {
            if (defined $opt_d) { printf("# processing redirect to %s\n",$uri); }
            # this was redirect
            $impsite = $site . $uri;
            $impres = get($impsite);
            $impres = "" unless defined $impres;
        }

        # do stuff normally
        $uri = parseBetween($impres, "<img SRC=\"posters/", "\" ALT");
        # uri here is relative... patch it up to make a valid uri
        if ($uri =~ /\.(jpe?g|gif|png)$/) {
            if (!($uri =~ /http:(.*)/ )) {
                my $path = substr($impsite, 0, rindex($impsite, '/') + 1);
                $uri = $path."posters/".$uri;
            }
            if (defined $opt_d) { print "# found ipmawards poster: $uri\n"; }
        }
        else {
            $uri = "";
        }
    }

    # try looking on MoTechPosters
    if ($uri eq "" && $response =~ m/<a href="([^"]*)">([^"]*?)motechposters/i) {
        my $page = $1;
        if ($page ne "") {
            if (defined $opt_d) { print "# found MoTechPosters poster page: $page \n"; }

            my $cinres = get($page);
            if (defined $cinres) {
                if (defined $opt_d) { printf("# got %i bytes\n", length($cinres)); }
                if (defined $opt_r) { printf("%s", $cinres); }

                # \Q..\E: the id comes from the command line, match it literally
                if ($cinres =~ m/<img src="([^"]*?\Q$movieid\E[^"]*?)"/i) {
                    if (defined $opt_d) { print "# MoTechPosters url retreived\n"; }
                    $uri = "http://posters.motechnet.com".$1;
                }
            }
        }
    }

    # try looking on nexbase
    if ($uri eq "" && $response =~ m/<a href="([^"]*)">([^"]*?)nexbase/i) {
        my $page = $1;
        if ($page ne "") {
            if (defined $opt_d) { print "# found nexbase poster page: $page \n"; }

            my $cinres = get($page);
            if (defined $cinres) {
                if (defined $opt_d) { printf("# got %i bytes\n", length($cinres)); }
                if (defined $opt_r) { printf("%s", $cinres); }

                if ($cinres =~ m/<a id="photo_url" href="([^"]*?)" ><\/a>/i) {
                    if (defined $opt_d) { print "# nexbase url retreived\n"; }
                    $uri = $1;
                }
            }
        }
    }

    # try looking on cinemablend
    if ($uri eq "" && $response =~ m/<a href="([^"]*)">([^"]*?)cinemablend/i) {
        my $page = $1;
        if ($page ne "") {
            if (defined $opt_d) { print "# found cinemablend poster page: $page \n"; }

            my $cinres = get($page);
            if (defined $cinres) {
                if (defined $opt_d) { printf("# got %i bytes\n", length($cinres)); }
                if (defined $opt_r) { printf("%s", $cinres); }

                if ($cinres =~ m#<img\b[^>]+\bsrc="(/images/reviews/[^"]*?)"#i) {
                    if (defined $opt_d) { print "# cinemablend url retreived\n"; }
                    $uri = "http://www.cinemablend.com/".$1;
                }
            }
        }
    }

    # if the impawards site attempt didn't give a filename grab it from imdb
    if ($uri eq "") {
        if (defined $opt_d) { print "# looking for imdb posters\n"; }
        my $host = "http://posters.imdb.com/posters/";

        $uri = parseBetween($response, $host, "\"><td><td><a href=\"");
        if ($uri ne "") {
            $uri = $host.$uri;
        }
        else {
            if (defined $opt_d) { print "# no poster found\n"; }
        }
    }

    # no poster found, take lowres image from imdb
    if ($uri eq "") {
        if (defined $opt_d) { print "# looking for lowres imdb posters\n"; }
        my $host = "http://www.imdb.com/title/tt" . $movieid . "/";
        $response = get($host);
        $response = "" unless defined $response; # failed download

        # Better handling for low resolution posters
        #
        # (no /g here: a scalar //g match would leave pos($response) set and
        # make the informal-title scan below start after the poster)
        if ($response =~ m/<a name="poster".*<img.*src="([^"]*).*<\/a>/i) {
            if (defined $opt_d) { print "# found low res poster at: $1\n"; }
            $uri = $1;
        }
        else {
            if (defined $opt_d) { print "# no low res poster found\n"; }
            $uri = "";
        }

        if (defined $opt_d) { print "# starting to look for movie title\n"; }

        # The titles gathered here are only reported in debug output.
        my @movie_titles;

        # get main title
        if (defined $opt_d) { print "# Getting possible movie titles:\n"; }
        push @movie_titles, parseBetween($response, "<title>", "<\/title>");
        if (defined $opt_d) { print "# Title: ".$movie_titles[-1]."\n"; }

        # now we get all other possible movie titles and store them in the titles array
        while ($response =~ m/>([^>^\(]*)([ ]{0,1}\([^\)]*\)[^\(^\)]*[ ]{0,1}){0,1}\(informal title\)/g) {
            push @movie_titles, trim($1);
            if (defined $opt_d) { print "# Title: ".$movie_titles[-1]."\n"; }
        }
    }

    print "$uri\n";
}

# dump Movie list: 1 entry per line, each line as 'movieid:Movie Title'
sub getMovieList {
    my ($filename, $options) = @_; # grab parameters

    # If we wanted to inspect the file for any reason we can do that now

    #
    # Convert filename into a query string
    # (use same rules that Metadata::guesTitle does)
    my $query = $filename;
    $query = uri_unescape($query); # in case it was escaped
    # Strip off the file extension
    if (rindex($query, '.') != -1) {
        $query = substr($query, 0, rindex($query, '.'));
    }
    # Strip off anything following '(' - people use this for general comments
    if (rindex($query, '(') != -1) {
        $query = substr($query, 0, rindex($query, '('));
    }
    # Strip off anything following '[' - people use this for general comments
    if (rindex($query, '[') != -1) {
        $query = substr($query, 0, rindex($query, '['));
    }

    # IMDB searches do better if any trailing ,The is left off
    # ($1 is only meaningful when the match succeeded)
    if ($query =~ /(.*), The$/i && length($1)) {
        $query = $1;
    }

    # prepare the url
    $query = uri_escape($query);
    if (!$options) { $options = ""; }
    if (defined $opt_d) {
        printf("# query: '%s', options: '%s'\n", $query, $options);
    }

    # get the search results page
    # some known IMDB options are:
    # type=[fuzy] looser search
    # from_year=[int] limit matches to year (broken at imdb)
    # to_year=[int] limit matches to year (broken at imdb)
    # sort=[smart] ??
    # tv=[no|both|only] limits between tv and movies (broken at imdb)
    #$options = "tt=on;nm=on;mx=20"; # not exactly clear what these options do
    my $request = "http://www.imdb.com/find?q=$query;$options";
    if (defined $opt_d) { printf("# request: '%s'\n", $request); }
    my $response = get($request);
    $response = "" unless defined $response; # failed download: no results
    if (defined $opt_r) {
        print $response;
        exit(0);
    }

    # check to see if we got a results page or a movie page
    # looking for 'add=<movieid>" target=' which only exists
    # in a movie description page
    my $movienum = parseBetween($response, "add=", "\"");
    if (!$movienum) {
        $movienum = parseBetween($response, ";add=", "'");
    }
    if ($movienum) {
        if ($movienum !~ m/^[0-9]+$/) {
            if (defined $opt_d) {
                print("# Error: IMDB movie number ($movienum), isn't.\n");
            }
            exit(0);
        }

        if (defined $opt_d) { printf("# redirected to movie page\n"); }
        my $movietitle = parseBetween($response, "<title>", "</title>");
        # drop the trailing "(year)"; keep the full title text when the page
        # title does not have that form instead of using a stale $1
        if ($movietitle =~ m#(.+) \((\d+)\)#) {
            $movietitle = $1;
        }
        print "$movienum:$movietitle\n";
        exit(0);
    }

    # extract possible matches
    # possible matches are grouped in several catagories:
    # exact, partial, and approximate
    my $popular_results = parseBetween($response, "<b>Popular Titles</b>",
                                       "</table>");
    my $exact_matches = parseBetween($response, "<b>Titles (Exact Matches)</b>",
                                     "</table>");
    my $partial_matches = parseBetween($response, "<b>Titles (Partial Matches)</b>",
                                       "</table>");
    # my $approx_matches = parseBetween($response, "<b>Titles (Approx Matches)</b>",
    #                                   "</table>");

    # parse movie list from matches
    my $beg = "<tr>";
    my $end = "</tr>";
    my @movies;

    # my $data = $exact_matches.$partial_matches;
    my $data = $popular_results.$exact_matches;
    # resort to partial matches if no exact
    if ($data eq "") { $data = $partial_matches; }
    # resort to approximate matches if no exact or partial
    # if ($data eq "") { $data = $approx_matches; }
    if ($data eq "") {
        if (defined $opt_d) { printf("# no results\n"); }
        return;
    }

    my $start = index($data, $beg);
    my $finish = index($data, $end, $start);

    # Walk every <tr>...</tr> row.  Stop as well when a row has no closing
    # tag: otherwise the next search would restart from offset 0 and loop
    # forever.
    while ($start != -1 && $start < length($data) && $finish != -1) {
        $start += length($beg);
        my $entry = substr($data, $start, $finish - $start);
        # advance to the next row before any 'next' below
        $start = index($data, $beg, $finish + 1);
        $finish = index($data, $end, $start);

        my $title = "";
        my $year = "";
        my $type = "";
        my $movienum = "";

        # Some titles are identical, IMDB indicates this by appending /I /II to
        # the release year.
        # e.g. "Mon meilleur ami" 2006/I vs "Mon meilleur ami" 2006/II
        if ($entry =~ m/<a href="\/title\/tt(\d+)\/.*\">(.+)<\/a> \((\d+)\/?[a-z]*\)(?: \((.+)\))?/i) {
            $movienum = $1;
            $title = $2;
            $year = $3;
            $type = $4 if ($4);
        }
        else {
            if (defined $opt_d) {
                print("Unrecognized entry format ($entry)\n");
            }
            next;
        }

        my $skip = 0;

        # fix broken 'tv=no' option
        if ($options =~ /tv=no/) {
            if ($type eq "TV") {
                if (defined $opt_d) { printf("# skipping TV program: %s\n", $title); }
                $skip = 1;
            }
        }
        if ($options =~ /tv=only/) {
            if ($type eq "") {
                if (defined $opt_d) { printf("# skipping Movie: %s\n", $title); }
                $skip = 1;
            }
        }
        # fix broken 'from_year=' option
        if ($options =~ /from_year=(\d+)/) {
            if ($year < $1) {
                if (defined $opt_d) { printf("# skipping b/c of yr: %s\n", $title); }
                $skip = 1;
            }
        }
        # fix broken 'to_year=' option
        if ($options =~ /to_year=(\d+)/) {
            if ($year > $1) {
                if (defined $opt_d) { printf("# skipping b/c of yr: %s\n", $title); }
                $skip = 1;
            }
        }

        # option to strip out videos (I think that's what '(V)' means anyway?)
        if ($options =~ /video=no/) {
            if ($type eq "V") {
                if (defined $opt_d) {
                    printf("# skipping Video program: %s\n", $title);
                }
                $skip = 1;
            }
        }

        # (always) strip out video game's (why does IMDB give these anyway?)
        if ($type eq "VG") {
            if (defined $opt_d) { printf("# skipping videogame: %s\n", $title); }
            $skip = 1;
        }

        # add to array
        if (!$skip) {
            my $moviename = $title;
            if ($year ne "") {
                $moviename .= " ($year)";
            }

            # push @movies, $movienum . ":" . $title;
            push @movies, $movienum . ":" . $moviename;
        }
    }

    # display array of values
    for my $movie (@movies) { print "$movie\n"; }
}

#
# Main Program
#

# parse command line arguments
getopts('ohrdivDMP');

# print out info
if (defined $opt_v) { version(); exit 1; }
if (defined $opt_i) { info(); exit 1; }

# print out usage if needed
if (defined $opt_h || $#ARGV<0) { help(); }

if (defined $opt_D) {
    # take movieid from cmdline arg
    my $movieid = shift || die "Usage : $0 -D <movieid>\n";
    getMovieData($movieid);
}
elsif (defined $opt_P) {
    # take movieid from cmdline arg
    my $movieid = shift || die "Usage : $0 -P <movieid>\n";
    getMoviePoster($movieid);
}
elsif (defined $opt_M) {
    # take query from cmdline arg
    my $options = shift || die "Usage : $0 -M [options] <query>\n";
    my $query = shift;
    # a single argument is the query itself, with no options
    if (!$query) {
        $query = $options;
        $options = "";
    }
    getMovieList($query, $options);
}
# vim: set expandtab ts=4 sw=4 :