package xsmeral.artnet.scraper;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.datatype.DatatypeConstants;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.XMLGregorianCalendar;
import org.htmlcleaner.TagNode;
import org.openrdf.model.URI;
import org.openrdf.model.vocabulary.OWL;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.RDFS;
import xsmeral.semnet.scraper.onto.EntityClass;
import xsmeral.semnet.scraper.onto.Term;
import xsmeral.semnet.crawler.model.EntityDocument;
import xsmeral.semnet.scraper.AbstractScraper;
import static xsmeral.semnet.util.XPathUtil.*;
public class CSFDScraper {
/**
 * Scraper for film detail pages.
 * <p>
 * Reads the {@code #profile} content block of the document and records facts
 * about the film: its type, names, genres, countries of origin, year,
 * duration, actors, directors and external links.
 */
public static class Film extends AbstractScraper {

    /** Text of the {@code h4} heading that marks the block listing actors. */
    private static final String NODE_ACTORS = "Hrají:";

    /** Text of the {@code h4} heading that marks the block listing directors. */
    private static final String NODE_DIRECTORS = "Režie:";

    /**
     * Parses the origin line, e.g.
     * {@code USA / Velká Británie / Německo / Francie, 2011, 30 x 22 min}.
     * Every part is optional. Group 1 = countries (separated by {@code /}),
     * group 2 = four-digit year, group 3 = duration in minutes (an optional
     * leading {@code "nn x"} multiplier is matched but not captured).
     * Compiled once for the class; {@link Pattern} instances are immutable.
     */
    private static final Pattern PATT_ORIGIN = Pattern.compile("(?:(?!\\d+)([^,]+),?\\s*)?(?:(\\d{4}),?\\s*)?(?:(?:\\d+\\s*x\\s*)?(?:\\s*(\\d+)\\s*min))?");

    /** Namespace of the terms created by this scraper. */
    public static final String NAMESPACE = "http://www.csfd.cz#";

    @Term
    public static final URI TYPE = RDF.TYPE;
    @Term("Used for IMDb link")
    public static final URI SAME_AS = OWL.SAMEAS;
    @Term("Used for www link")
    public static final URI SEE_ALSO = RDFS.SEEALSO;
    @Term
    @EntityClass
    public static final URI FILM = f.createURI(NAMESPACE, "film");
    @Term
    public static final URI FILM_NAME = RDFS.LABEL;
    @Term
    public static final URI DIRECTED_BY = f.createURI(NAMESPACE, "directed_by");
    @Term
    public static final URI ACTS_IN = f.createURI(NAMESPACE, "acts_in");
    @Term
    public static final URI GENRE = f.createURI(NAMESPACE, "genre");
    @Term
    public static final URI ORIGIN = f.createURI(NAMESPACE, "origin");
    @Term
    public static final URI YEAR = f.createURI(NAMESPACE, "year");
    @Term("Duration in minutes")
    public static final URI DURATION = f.createURI(NAMESPACE, "duration");

    /**
     * Extracts film facts from the given document.
     *
     * @param doc the document of the film page being scraped
     * @throws Exception if evaluating an XPath expression or recording a fact
     *         fails; an {@link ArrayIndexOutOfBoundsException} is thrown when
     *         the profile content block or its info block is not present
     */
    @Override
    protected void scrape(EntityDocument doc) throws Exception {
        TagNode root = doc.getDocument();
        // Both nodes are required; a page without them is not a film page.
        TagNode contentNode = (TagNode) root.evaluateXPath("//*[@id='profile']/div[@class='content']")[0];
        TagNode infoNode = (TagNode) contentNode.evaluateXPath("div[@class='info']")[0];

        // state type of this entity
        fact(TYPE, FILM);

        // name
        String mainName = queryText(infoNode, "h1");
        fact(FILM_NAME, lit(mainName));

        // other names
        for (TagNode node : queryNodes(infoNode, "ul[@class='names']/li/h3")) {
            fact(FILM_NAME, lit(getText(node)));
        }

        // genres
        // Drama / Thriller / Mysteriózní
        String genresStr = queryText(infoNode, "p[@class='genre']");
        if (genresStr != null) {
            for (String genreStr : genresStr.split("/")) {
                fact(GENRE, lit(genreStr.trim()));
            }
        }

        // (countries), (year), ((nn x ) duration) - each optional
        // USA / Velká Británie / Německo / Francie, 2011, 30 x 22 min
        String originStr = queryText(infoNode, "p[@class='origin']");
        if (originStr != null) {
            Matcher m = PATT_ORIGIN.matcher(originStr);
            if (m.find()) {
                String countriesStr = m.group(1);
                String yearStr = m.group(2);
                String durationStr = m.group(3);
                if (countriesStr != null) {
                    for (String country : countriesStr.split("/")) {
                        // Strip the spaces around the "/" separators, as is done for genres.
                        fact(ORIGIN, lit(country.trim()));
                    }
                }
                if (yearStr != null) {
                    fact(YEAR, f.createLiteral(Integer.parseInt(yearStr.trim())));
                }
                if (durationStr != null) {
                    fact(DURATION, f.createLiteral(Integer.parseInt(durationStr.trim())));
                }
            }
        }

        // actors and directors: each block is a div introduced by an h4 heading
        List<TagNode> creatorNodes = queryNodes(infoNode, "div[h4]");
        for (TagNode creatorsNode : creatorNodes) {
            String nodeType = queryText(creatorsNode, "h4");
            if (nodeType.contains(NODE_ACTORS)) {
                for (String actor : queryTextNodes(creatorsNode, "span/a/@href")) {
                    // Stated from the actor's side: <actor> acts_in <current()>.
                    fact(uri(actor), ACTS_IN, current());
                }
            } else if (nodeType.contains(NODE_DIRECTORS)) {
                for (String director : queryTextNodes(creatorsNode, "span/a/@href")) {
                    fact(DIRECTED_BY, uri(director));
                }
            }
        }

        // external links; the list is optional, so check the result before indexing into it
        Object[] linkLists = contentNode.evaluateXPath("ul[@class='links']");
        TagNode linksNode = linkLists.length > 0 ? (TagNode) linkLists[0] : null;
        if (linksNode != null && linksNode.hasChildren()) {
            // imdb link
            String imdbLink = queryText(linksNode, "li/a[@class='imdb']/@href");
            if (imdbLink != null) {
                fact(SAME_AS, uri(imdbLink));
            }
            // www link
            String wwwLink = queryText(linksNode, "li/a[@class='www']/@href");
            if (wwwLink != null) {
                fact(SEE_ALSO, uri(wwwLink));
            }
        }
    }

    /**
     * Returns the namespace of the terms defined by this scraper.
     *
     * @return {@link #NAMESPACE}
     */
    @Override
    public String getNamespace() {
        return NAMESPACE;
    }
}
public static class Creator extends AbstractScraper {
private static final String DIR_FILMOGRAPHY
= "Režijní";
private static final String ACT_FILMOGRAPHY
= "Herecká";
private static final Pattern PATT_BIRTH = Pattern.compile("nar\\.\\s*(\\d{1,2})\\.(\\d{1,2})\\.(\\d{4})");
public static final String NAMESPACE
= "http://www.csfd.cz#";
@Term
public static final URI TYPE = RDF.TYPE;
@Term("Used for IMDb link")
public static final URI SAME_AS = OWL.SAMEAS;
@Term
@EntityClass
public static final URI DIRECTOR = f.createURI(NAMESPACE, "director");
@Term
@EntityClass
public static final URI ACTOR = f.createURI(NAMESPACE, "actor");
@Term
public static final URI PERSON_NAME = RDFS.LABEL;
@Term
public static final URI BIRTH_DATE = f.createURI(NAMESPACE, "birth_date");
@Override
protected void scrape
(EntityDocument doc
) throws Exception {
TagNode root = doc.getDocument();
TagNode contentNode = (TagNode) root.evaluateXPath("//*[@id='profile']/div[@class='content']")[0];
TagNode infoNode = (TagNode) contentNode.evaluateXPath("div[@class='info']")[0];
// name
String name
= queryText
(infoNode,
"h1");
if (name != null) {
fact(PERSON_NAME, lit(name));
}
// type
for (String filmography
: queryTextNodes
(root,
"data(//*[@id='filmography']//div[@class='header']/h2)")) {
if (filmography.contains(DIR_FILMOGRAPHY)) {
fact(TYPE, DIRECTOR);
} else if (filmography.contains(ACT_FILMOGRAPHY)) {
fact(TYPE, ACTOR);
}
}
// birth date
String birthStr
= queryText
(infoNode,
"ul[1]/li");
if (birthStr != null) {
Matcher m = PATT_BIRTH.matcher(birthStr);
if (m.find()) {
try {
int day
= Integer.
parseInt(m.
group(1));
int month
= Integer.
parseInt(m.
group(2));
int year
= Integer.
parseInt(m.
group(3));
XMLGregorianCalendar birthDate = DatatypeFactory.newInstance().newXMLGregorianCalendarDate(year, month, day, DatatypeConstants.FIELD_UNDEFINED);
fact(BIRTH_DATE, f.createLiteral(birthDate));
}
}
}
// imdb link
String imdbLink
= queryText
(contentNode,
"ul[@class='links']/li/a[@class='imdb']/@href");
if (imdbLink != null) {
fact(SAME_AS, uri(imdbLink));
}
}
@Override
public String getNamespace
() {
return NAMESPACE;
}
}
}