xsmeral.semnet.crawler.model
Class HostDescriptor

java.lang.Object
  extended by xsmeral.semnet.crawler.model.HostDescriptor

@XStreamConverter(value=HostDescConverter.class)
public class HostDescriptor
extends Object

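The main configuration element of HTMLCrawler. Contains the host's base URL, name, charset, crawl delay, source-first flag, source URL patterns (mapped to their update frequencies) and entity descriptors.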


Constructor Summary
HostDescriptor()
          Creates an empty source URL map and an empty entity descriptor collection.
HostDescriptor(String baseURL, String name, String charset, Integer crawlDelay, Boolean sourceFirst, Map<Pattern,Integer> sourceURLPatterns, Collection<EntityDescriptor> entityDescriptors)
          Initializes all fields.
 
Method Summary
 void addEntityDescriptor(EntityDescriptor entityDescriptor)
           
 void addSourceURLPattern(Pattern pattern, int updateFreq)
           
 boolean equals(Object obj)
           
 String getBaseURL()
          Returns the base URL of this host - the root level for crawling.
 Integer getCrawlDelay()
          Returns the crawl delay.
 Collection<EntityDescriptor> getEntityDescriptors()
          Returns the EntityDescriptors that represent entities in this host (pages that will be scraped).
 String getCharset()
          Returns the (user-defined) charset used by this host.
 String getName()
          Returns the (arbitrary, user-assigned) name of this host.
 Map<Pattern,Integer> getSourceURLPatterns()
          Returns patterns of source URLs mapped to corresponding update frequencies.
 int hashCode()
           
 boolean isSourceFirst()
          Indicates whether source URLs should be crawled first.
 void setBaseURL(String baseURL)
           
 void setCrawlDelay(Integer crawlDelay)
           
 void setEntityDescriptors(Collection<EntityDescriptor> entityDescriptors)
           
 void setCharset(String charset)
           
 void setName(String name)
           
 void setSourceFirst(boolean sourceFirst)
           
 void setSourceURLPatterns(Map<Pattern,Integer> sourceURLPatterns)
           
 String toString()
           
 
Methods inherited from class java.lang.Object
clone, finalize, getClass, notify, notifyAll, wait, wait, wait
 

Constructor Detail

HostDescriptor

public HostDescriptor()
Creates an empty source URL map and an empty entity descriptor collection.


HostDescriptor

public HostDescriptor(String baseURL,
                      String name,
                      String charset,
                      Integer crawlDelay,
                      Boolean sourceFirst,
                      Map<Pattern,Integer> sourceURLPatterns,
                      Collection<EntityDescriptor> entityDescriptors)
Initializes all fields.

Method Detail

getBaseURL

public String getBaseURL()
Returns the base URL of this host - the root level for crawling.


setBaseURL

public void setBaseURL(String baseURL)

getEntityDescriptors

public Collection<EntityDescriptor> getEntityDescriptors()
Returns the EntityDescriptors that represent entities in this host (pages that will be scraped).


setEntityDescriptors

public void setEntityDescriptors(Collection<EntityDescriptor> entityDescriptors)

addEntityDescriptor

public void addEntityDescriptor(EntityDescriptor entityDescriptor)

getName

public String getName()
Returns the (arbitrary, user-assigned) name of this host.


setName

public void setName(String name)

getSourceURLPatterns

public Map<Pattern,Integer> getSourceURLPatterns()
Returns patterns of source URLs mapped to corresponding update frequencies. Source URLs represent pages that are only crawled (searched for links), not scraped.


setSourceURLPatterns

public void setSourceURLPatterns(Map<Pattern,Integer> sourceURLPatterns)

addSourceURLPattern

public void addSourceURLPattern(Pattern pattern,
                                int updateFreq)

getCharset

public String getCharset()
Returns the (user-defined) charset used by this host. If the charset is not specified, the crawler tries to guess it from the content.


setCharset

public void setCharset(String charset)

getCrawlDelay

public Integer getCrawlDelay()
Returns the crawl delay. Can be set to override the crawl delay found in the Robots Policy.


setCrawlDelay

public void setCrawlDelay(Integer crawlDelay)

isSourceFirst

public boolean isSourceFirst()
Indicates whether source URLs should be crawled first.


setSourceFirst

public void setSourceFirst(boolean sourceFirst)

equals

public boolean equals(Object obj)
Overrides:
equals in class Object

hashCode

public int hashCode()
Overrides:
hashCode in class Object

toString

public String toString()
Overrides:
toString in class Object