| /* |
| * Copyright (C) 2009 Google Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following disclaimer |
| * in the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Google Inc. nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| // How we handle the base tag better. |
| // Current status: |
| // Currently, the normal way we handle the base tag is: |
| // a) For those links which have corresponding local saved files, such as |
| // savable CSS, JavaScript files, they will be written to relative URLs which |
| // point to local saved file. Those links can not be resolved as absolute |
| // file URLs because, if they were, moving the saved files from one |
| // directory to another would turn the file URLs into dead links. |
| // b) For those links which have not corresponding local saved files, such as |
| // links in A, AREA tags, they will be resolved as absolute URLs. |
| // c) We comment out all base tags when serializing the DOM for the page. |
| // FireFox also uses above way to handle base tag. |
| // |
| // Problem: |
| // This way can not handle the following situation: |
| // the base tag is written by JavaScript. |
| // For example. The page "www.yahoo.com" use |
| // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL |
| // of page when loading page. So when saving page as completed-HTML, we assume |
| // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved |
| // completed-HTML page, then the JavaScript will insert a base tag |
| // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to |
| // local saved resource files will be resolved as |
| // "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource |
| // files can not be loaded correctly. Also the page will be rendered ugly since |
| // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame |
| // files can not be fetched. |
| // Now FireFox, IE and WebKit based Browser all have this problem. |
| // |
| // Solution: |
| // My solution is that we comment old base tag and write new base tag: |
| // <base href="." ...> after the previous commented base tag. In WebKit, it |
| // always uses the latest "href" attribute of base tag to set document's base |
| // URL. Based on this behavior, when we encounter a base tag, we comment it and |
| // write a new base tag <base href="."> after the previous commented base tag. |
| // The new added base tag can help engine to locate correct base URL for |
| // correctly loading local saved resource files. Also I think we need to inherit |
| // the base target value from document object when appending new base tag. |
| // If there are multiple base tags in original document, we will comment all old |
| // base tags and append new base tag after each old base tag because we do not |
| // know those old base tags are original content or added by JavaScript. If |
| // they are added by JavaScript, it means when loading saved page, the script(s) |
| // will still insert base tag(s) to DOM, so the new added base tag(s) can |
| // override the incorrect base URL and make sure we always load the correct local |
| // saved resource files. |
| |
| #include "config.h" |
| #include "WebPageSerializerImpl.h" |
| |
| #include "Document.h" |
| #include "DocumentLoader.h" |
| #include "DocumentType.h" |
| #include "Element.h" |
| #include "FrameLoader.h" |
| #include "HTMLAllCollection.h" |
| #include "HTMLElement.h" |
| #include "HTMLFormElement.h" |
| #include "HTMLMetaElement.h" |
| #include "HTMLNames.h" |
| #include "KURL.h" |
| #include "TextEncoding.h" |
| #include "markup.h" |
| |
| #include "DOMUtilitiesPrivate.h" |
| #include "WebFrameImpl.h" |
| #include "WebURL.h" |
| #include "WebVector.h" |
| |
| using namespace WebCore; |
| |
| namespace WebKit { |
| |
| // Maximum length of the data buffer that temporarily holds generated HTML |
| // content. This is a soft limit which may be exceeded when a very large |
| // contiguous string is found in the page. |
| static const unsigned dataBufferCapacity = 65536; |
| |
| WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url, |
| const TextEncoding& textEncoding, |
| Document* document, |
| const String& directoryName) |
| : url(url) |
| , textEncoding(textEncoding) |
| , document(document) |
| , directoryName(directoryName) |
| , isHTMLDocument(document->isHTMLDocument()) |
| , haveSeenDocType(false) |
| , haveAddedCharsetDeclaration(false) |
| , skipMetaElement(0) |
| , isInScriptOrStyleTag(false) |
| , haveAddedXMLProcessingDirective(false) |
| , haveAddedContentsBeforeEnd(false) |
| { |
| } |
| |
| String WebPageSerializerImpl::preActionBeforeSerializeOpenTag( |
| const Element* element, SerializeDomParam* param, bool* needSkip) |
| { |
| StringBuilder result; |
| |
| *needSkip = false; |
| if (param->isHTMLDocument) { |
| // Skip the open tag of original META tag which declare charset since we |
| // have overrided the META which have correct charset declaration after |
| // serializing open tag of HEAD element. |
| if (element->hasTagName(HTMLNames::metaTag)) { |
| const HTMLMetaElement* meta = static_cast<const HTMLMetaElement*>(element); |
| // Check whether the META tag has declared charset or not. |
| String equiv = meta->httpEquiv(); |
| if (equalIgnoringCase(equiv, "content-type")) { |
| String content = meta->content(); |
| if (content.length() && content.contains("charset", false)) { |
| // Find META tag declared charset, we need to skip it when |
| // serializing DOM. |
| param->skipMetaElement = element; |
| *needSkip = true; |
| } |
| } |
| } else if (element->hasTagName(HTMLNames::htmlTag)) { |
| // Check something before processing the open tag of HEAD element. |
| // First we add doc type declaration if original document has it. |
| if (!param->haveSeenDocType) { |
| param->haveSeenDocType = true; |
| result.append(createMarkup(param->document->doctype())); |
| } |
| |
| // Add MOTW declaration before html tag. |
| // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. |
| result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url)); |
| } else if (element->hasTagName(HTMLNames::baseTag)) { |
| // Comment the BASE tag when serializing dom. |
| result.append("<!--"); |
| } |
| } else { |
| // Write XML declaration. |
| if (!param->haveAddedXMLProcessingDirective) { |
| param->haveAddedXMLProcessingDirective = true; |
| // Get encoding info. |
| String xmlEncoding = param->document->xmlEncoding(); |
| if (xmlEncoding.isEmpty()) |
| xmlEncoding = param->document->loader()->writer()->encoding(); |
| if (xmlEncoding.isEmpty()) |
| xmlEncoding = UTF8Encoding().name(); |
| result.append("<?xml version=\""); |
| result.append(param->document->xmlVersion()); |
| result.append("\" encoding=\""); |
| result.append(xmlEncoding); |
| if (param->document->xmlStandalone()) |
| result.append("\" standalone=\"yes"); |
| result.append("\"?>\n"); |
| } |
| // Add doc type declaration if original document has it. |
| if (!param->haveSeenDocType) { |
| param->haveSeenDocType = true; |
| result.append(createMarkup(param->document->doctype())); |
| } |
| } |
| return result.toString(); |
| } |
| |
| String WebPageSerializerImpl::postActionAfterSerializeOpenTag( |
| const Element* element, SerializeDomParam* param) |
| { |
| StringBuilder result; |
| |
| param->haveAddedContentsBeforeEnd = false; |
| if (!param->isHTMLDocument) |
| return result.toString(); |
| // Check after processing the open tag of HEAD element |
| if (!param->haveAddedCharsetDeclaration |
| && element->hasTagName(HTMLNames::headTag)) { |
| param->haveAddedCharsetDeclaration = true; |
| // Check meta element. WebKit only pre-parse the first 512 bytes |
| // of the document. If the whole <HEAD> is larger and meta is the |
| // end of head part, then this kind of pages aren't decoded correctly |
| // because of this issue. So when we serialize the DOM, we need to |
| // make sure the meta will in first child of head tag. |
| // See http://bugs.webkit.org/show_bug.cgi?id=16621. |
| // First we generate new content for writing correct META element. |
| result.append(WebPageSerializer::generateMetaCharsetDeclaration( |
| String(param->textEncoding.name()))); |
| |
| param->haveAddedContentsBeforeEnd = true; |
| // Will search each META which has charset declaration, and skip them all |
| // in PreActionBeforeSerializeOpenTag. |
| } else if (element->hasTagName(HTMLNames::scriptTag) |
| || element->hasTagName(HTMLNames::styleTag)) { |
| param->isInScriptOrStyleTag = true; |
| } |
| |
| return result.toString(); |
| } |
| |
| String WebPageSerializerImpl::preActionBeforeSerializeEndTag( |
| const Element* element, SerializeDomParam* param, bool* needSkip) |
| { |
| String result; |
| |
| *needSkip = false; |
| if (!param->isHTMLDocument) |
| return result; |
| // Skip the end tag of original META tag which declare charset. |
| // Need not to check whether it's META tag since we guarantee |
| // skipMetaElement is definitely META tag if it's not 0. |
| if (param->skipMetaElement == element) |
| *needSkip = true; |
| else if (element->hasTagName(HTMLNames::scriptTag) |
| || element->hasTagName(HTMLNames::styleTag)) { |
| ASSERT(param->isInScriptOrStyleTag); |
| param->isInScriptOrStyleTag = false; |
| } |
| |
| return result; |
| } |
| |
| // After we finish serializing end tag of a element, we give the target |
| // element a chance to do some post work to add some additional data. |
| String WebPageSerializerImpl::postActionAfterSerializeEndTag( |
| const Element* element, SerializeDomParam* param) |
| { |
| StringBuilder result; |
| |
| if (!param->isHTMLDocument) |
| return result.toString(); |
| // Comment the BASE tag when serializing DOM. |
| if (element->hasTagName(HTMLNames::baseTag)) { |
| result.append("-->"); |
| // Append a new base tag declaration. |
| result.append(WebPageSerializer::generateBaseTagDeclaration( |
| param->document->baseTarget())); |
| } |
| |
| return result.toString(); |
| } |
| |
| void WebPageSerializerImpl::saveHTMLContentToBuffer( |
| const String& result, SerializeDomParam* param) |
| { |
| m_dataBuffer.append(result); |
| encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished, |
| param, |
| DoNotForceFlush); |
| } |
| |
| void WebPageSerializerImpl::encodeAndFlushBuffer( |
| WebPageSerializerClient::PageSerializationStatus status, |
| SerializeDomParam* param, |
| FlushOption flushOption) |
| { |
| // Data buffer is not full nor do we want to force flush. |
| if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity) |
| return; |
| |
| String content = m_dataBuffer.toString(); |
| m_dataBuffer = StringBuilder(); |
| |
| // Convert the unicode content to target encoding |
| CString encodedContent = param->textEncoding.encode( |
| content.characters(), content.length(), EntitiesForUnencodables); |
| |
| // Send result to the client. |
| m_client->didSerializeDataForFrame(param->url, |
| WebCString(encodedContent.data(), encodedContent.length()), |
| status); |
| } |
| |
| void WebPageSerializerImpl::openTagToString(Element* element, |
| SerializeDomParam* param) |
| { |
| // FIXME: use StringBuilder instead of String. |
| bool needSkip; |
| // Do pre action for open tag. |
| String result = preActionBeforeSerializeOpenTag(element, param, &needSkip); |
| if (needSkip) |
| return; |
| // Add open tag |
| result += "<" + element->nodeName().lower(); |
| // Go through all attributes and serialize them. |
| const NamedNodeMap *attrMap = element->attributes(true); |
| if (attrMap) { |
| unsigned numAttrs = attrMap->length(); |
| for (unsigned i = 0; i < numAttrs; i++) { |
| result += " "; |
| // Add attribute pair |
| const Attribute *attribute = attrMap->attributeItem(i); |
| result += attribute->name().toString(); |
| result += "=\""; |
| if (!attribute->value().isEmpty()) { |
| const String& attrValue = attribute->value(); |
| |
| // Check whether we need to replace some resource links |
| // with local resource paths. |
| const QualifiedName& attrName = attribute->name(); |
| if (elementHasLegalLinkAttribute(element, attrName)) { |
| // For links start with "javascript:", we do not change it. |
| if (attrValue.startsWith("javascript:", false)) |
| result += attrValue; |
| else { |
| // Get the absolute link |
| WebFrameImpl* subFrame = WebFrameImpl::fromFrameOwnerElement(element); |
| String completeURL = subFrame ? subFrame->frame()->document()->url() : |
| param->document->completeURL(attrValue); |
| // Check whether we have local files for those link. |
| if (m_localLinks.contains(completeURL)) { |
| if (!param->directoryName.isEmpty()) |
| result += "./" + param->directoryName + "/"; |
| result += m_localLinks.get(completeURL); |
| } else |
| result += completeURL; |
| } |
| } else { |
| if (param->isHTMLDocument) |
| result += m_htmlEntities.convertEntitiesInString(attrValue); |
| else |
| result += m_xmlEntities.convertEntitiesInString(attrValue); |
| } |
| } |
| result += "\""; |
| } |
| } |
| |
| // Do post action for open tag. |
| String addedContents = postActionAfterSerializeOpenTag(element, param); |
| // Complete the open tag for element when it has child/children. |
| if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd) |
| result += ">"; |
| // Append the added contents generate in post action of open tag. |
| result += addedContents; |
| // Save the result to data buffer. |
| saveHTMLContentToBuffer(result, param); |
| } |
| |
| // Serialize end tag of an specified element. |
| void WebPageSerializerImpl::endTagToString(Element* element, |
| SerializeDomParam* param) |
| { |
| bool needSkip; |
| // Do pre action for end tag. |
| String result = preActionBeforeSerializeEndTag(element, |
| param, |
| &needSkip); |
| if (needSkip) |
| return; |
| // Write end tag when element has child/children. |
| if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd) { |
| result += "</"; |
| result += element->nodeName().lower(); |
| result += ">"; |
| } else { |
| // Check whether we have to write end tag for empty element. |
| if (param->isHTMLDocument) { |
| result += ">"; |
| // FIXME: This code is horribly wrong. WebPageSerializerImpl must die. |
| if (!static_cast<const HTMLElement*>(element)->ieForbidsInsertHTML()) { |
| // We need to write end tag when it is required. |
| result += "</"; |
| result += element->nodeName().lower(); |
| result += ">"; |
| } |
| } else { |
| // For xml base document. |
| result += " />"; |
| } |
| } |
| // Do post action for end tag. |
| result += postActionAfterSerializeEndTag(element, param); |
| // Save the result to data buffer. |
| saveHTMLContentToBuffer(result, param); |
| } |
| |
| void WebPageSerializerImpl::buildContentForNode(Node* node, |
| SerializeDomParam* param) |
| { |
| switch (node->nodeType()) { |
| case Node::ELEMENT_NODE: |
| // Process open tag of element. |
| openTagToString(static_cast<Element*>(node), param); |
| // Walk through the children nodes and process it. |
| for (Node *child = node->firstChild(); child; child = child->nextSibling()) |
| buildContentForNode(child, param); |
| // Process end tag of element. |
| endTagToString(static_cast<Element*>(node), param); |
| break; |
| case Node::TEXT_NODE: |
| saveHTMLContentToBuffer(createMarkup(node), param); |
| break; |
| case Node::ATTRIBUTE_NODE: |
| case Node::DOCUMENT_NODE: |
| case Node::DOCUMENT_FRAGMENT_NODE: |
| // Should not exist. |
| ASSERT_NOT_REACHED(); |
| break; |
| // Document type node can be in DOM? |
| case Node::DOCUMENT_TYPE_NODE: |
| param->haveSeenDocType = true; |
| default: |
| // For other type node, call default action. |
| saveHTMLContentToBuffer(createMarkup(node), param); |
| break; |
| } |
| } |
| |
| WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame, |
| bool recursiveSerialization, |
| WebPageSerializerClient* client, |
| const WebVector<WebURL>& links, |
| const WebVector<WebString>& localPaths, |
| const WebString& localDirectoryName) |
| : m_client(client) |
| , m_recursiveSerialization(recursiveSerialization) |
| , m_framesCollected(false) |
| , m_localDirectoryName(localDirectoryName) |
| , m_htmlEntities(false) |
| , m_xmlEntities(true) |
| { |
| // Must specify available webframe. |
| ASSERT(frame); |
| m_specifiedWebFrameImpl = static_cast<WebFrameImpl*>(frame); |
| // Make sure we have non 0 client. |
| ASSERT(client); |
| // Build local resources map. |
| ASSERT(links.size() == localPaths.size()); |
| for (size_t i = 0; i < links.size(); i++) { |
| KURL url = links[i]; |
| ASSERT(!m_localLinks.contains(url.string())); |
| m_localLinks.set(url.string(), localPaths[i]); |
| } |
| |
| ASSERT(m_dataBuffer.isEmpty()); |
| } |
| |
| void WebPageSerializerImpl::collectTargetFrames() |
| { |
| ASSERT(!m_framesCollected); |
| m_framesCollected = true; |
| |
| // First, process main frame. |
| m_frames.append(m_specifiedWebFrameImpl); |
| // Return now if user only needs to serialize specified frame, not including |
| // all sub-frames. |
| if (!m_recursiveSerialization) |
| return; |
| // Collect all frames inside the specified frame. |
| for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) { |
| WebFrameImpl* currentFrame = m_frames[i]; |
| // Get current using document. |
| Document* currentDoc = currentFrame->frame()->document(); |
| // Go through sub-frames. |
| RefPtr<HTMLAllCollection> all = currentDoc->all(); |
| for (Node* node = all->firstItem(); node; node = all->nextItem()) { |
| if (!node->isHTMLElement()) |
| continue; |
| Element* element = static_cast<Element*>(node); |
| WebFrameImpl* webFrame = |
| WebFrameImpl::fromFrameOwnerElement(element); |
| if (webFrame) |
| m_frames.append(webFrame); |
| } |
| } |
| } |
| |
| bool WebPageSerializerImpl::serialize() |
| { |
| if (!m_framesCollected) |
| collectTargetFrames(); |
| |
| bool didSerialization = false; |
| KURL mainURL = m_specifiedWebFrameImpl->frame()->document()->url(); |
| |
| for (unsigned i = 0; i < m_frames.size(); ++i) { |
| WebFrameImpl* webFrame = m_frames[i]; |
| Document* document = webFrame->frame()->document(); |
| const KURL& url = document->url(); |
| |
| if (!url.isValid() || !m_localLinks.contains(url.string())) |
| continue; |
| |
| didSerialization = true; |
| |
| String encoding = document->loader()->writer()->encoding(); |
| const TextEncoding& textEncoding = encoding.isEmpty() ? UTF8Encoding() : TextEncoding(encoding); |
| String directoryName = url == mainURL ? m_localDirectoryName : ""; |
| |
| SerializeDomParam param(url, textEncoding, document, directoryName); |
| |
| Element* documentElement = document->documentElement(); |
| if (documentElement) |
| buildContentForNode(documentElement, ¶m); |
| |
| encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, ¶m, ForceFlush); |
| } |
| |
| ASSERT(m_dataBuffer.isEmpty()); |
| m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished); |
| return didSerialization; |
| } |
| |
| } // namespace WebKit |