001 package org.util.html.factory;
002
003
004 import java.util.*;
005 import java.io.*;
006 import java.net.*;
007 import java.awt.*;
008 import java.awt.event.*;
009 import javax.swing.*;
010 import javax.swing.event.*;
011
012 import org.util.html.objects.*;
013 import org.util.log.*;
014
015 import org.util.xml.parse.*;
016 import org.util.xml.parse.policy.*;
017 import org.util.xml.element.*;
018
019 public class HTMLDocumentFactory {
020
021 private LogListener log_listener_;
022 private URLConnection connection_;
023 private ParserPolicy html_document_parser_policy_;
024 private ParserPolicy head_tag_parser_policy_;
025 private ParserPolicy body_tag_parser_policy_;
026 private HTMLDocument current_document_;
027
028 public HTMLDocumentFactory() {
029
030 URLConnection.setDefaultAllowUserInteraction(true);
031 // User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.13) Gecko/2009080317 Fedora/3.0.13-1.fc10 Firefox/3.0.13 GTB5
032
033 html_document_parser_policy_ = new HTMLParserPolicy(){
034 @Override public boolean throwExceptionIfDocumentHasError() {
035 return false;
036 }
037 @Override public Element allowElement(Element element) {
038 return element;
039 }
040 @Override public ParserPolicy getInnerPolicy(Element element) {
041 if(!element.isTagElement()) return null;
042 TagElement tag = (TagElement)element;
043 if(tag.getKey().toLowerCase().equals("body"))
044 return body_tag_parser_policy_;
045 else if(tag.getKey().toLowerCase().equals("head"))
046 return head_tag_parser_policy_;
047 return this;
048 }
049 };
050
051 head_tag_parser_policy_ = new HTMLParserPolicy(){
052 @Override public boolean throwExceptionIfDocumentHasError() {
053 return false;
054 }
055 @Override public Element allowElement(Element element) {
056 super.allowElement(element);
057 return element;
058 }
059 };
060
061 body_tag_parser_policy_ = new HTMLParserPolicy(){
062 @Override public boolean throwExceptionIfDocumentHasError() {
063 return false;
064 }
065 @Override public Element allowElement(Element element) {
066 if(element.isTextElement()) {
067 TextElement text = (TextElement)element;
068 HTMLText tobj = new HTMLText(current_document_);
069 tobj.setText(text.getValue());
070 current_document_.add(tobj);
071 return element;
072 } else {
073 TagElement tag = (TagElement)element;
074 String key = tag.getKey().toLowerCase();
075 if(key.equals("img")) {
076 HTMLImg timg = new HTMLImg(current_document_);
077 try{
078 timg.setURL(new URL(tag.getAttributeValue("src")));
079 }catch(Exception e){}
080 current_document_.add(timg);
081 return element;
082 }
083 }
084 return element;
085 }
086 };
087 }
088
089 public void setLogListener(LogListener log_listener) {
090 log_listener_ = log_listener;
091 }
092
093 public HTMLDocument createDocument(URL url, HTMLDocument doc) throws Exception {
094 connection_ = url.openConnection();
095 return createDocument(url, connection_, connection_.getInputStream(), doc);
096 }
097
098 public HTMLDocument createDocument(URL url, URLConnection connection, InputStream is, HTMLDocument document) throws Exception {
099 assert is != null;
100
101 if(document==null)
102 document = new HTMLDocument();
103
104 current_document_ = document;
105 current_document_.clear();
106 current_document_.setDocumentBase(url);
107 connection_ = connection;
108 ElementParser parser = null;
109 String encoding = null;
110 if(connection_!=null)
111 encoding = connection_.getContentEncoding();
112 if(encoding != null)
113 parser = new ElementParser(is, encoding);
114 else
115 parser = new ElementParser(is);
116
117 parser.setPolicy(html_document_parser_policy_);
118
119 Element[] element_list = parser.parse();
120
121 System.out.println("skipped:");
122 System.out.println("---------------------");
123 //for(Element element : element_list)
124 // System.out.println(element);
125
126 return current_document_;
127 }
128
129 }