blob: c8c75297f42d78b89fe70dd9f94001337f9fce99 [file] [log] [blame]
/*
* Copyright (C) 2009 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "WebPageSerializer.h"
#include "DocumentLoader.h"
#include "Element.h"
#include "Frame.h"
#include "HTMLAllCollection.h"
#include "HTMLFrameOwnerElement.h"
#include "HTMLInputElement.h"
#include "HTMLNames.h"
#include "KURL.h"
#include "Vector.h"
#include "WebCString.h"
#include "WebFrame.h"
#include "WebFrameImpl.h"
#include "WebPageSerializerClient.h"
#include "WebPageSerializerImpl.h"
#include "WebString.h"
#include "WebURL.h"
#include "WebVector.h"
#include "WebView.h"
#include <wtf/text/StringConcatenate.h>
using namespace WebCore;
namespace {
// Returns the URL of the sub-resource that |element| references through its
// tag-specific URL attribute, completed against the element's document.
// Returns an empty KURL when the element has no such attribute, when the
// attribute is empty, or when its value is javascript: content.
//
// Attribute consulted per tag:
//   img, script, embed, input (image buttons only) -> src
//   body, table, tr, td                            -> background
//   blockquote, q, del, ins                        -> cite
//   link (style sheets only)                       -> href
//   object                                         -> data
KURL getSubResourceURLFromElement(Element* element)
{
    ASSERT(element);
    const QualifiedName* attributeName = 0;
    if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag))
        attributeName = &HTMLNames::srcAttr;
    else if (element->hasTagName(HTMLNames::inputTag)) {
        // Only image buttons carry a sub-resource in their src attribute.
        HTMLInputElement* input = static_cast<HTMLInputElement*>(element);
        if (input->isImageButton())
            attributeName = &HTMLNames::srcAttr;
    } else if (element->hasTagName(HTMLNames::bodyTag)
        || element->hasTagName(HTMLNames::tableTag)
        || element->hasTagName(HTMLNames::trTag)
        || element->hasTagName(HTMLNames::tdTag))
        attributeName = &HTMLNames::backgroundAttr;
    else if (element->hasTagName(HTMLNames::blockquoteTag)
        || element->hasTagName(HTMLNames::qTag)
        || element->hasTagName(HTMLNames::delTag)
        || element->hasTagName(HTMLNames::insTag))
        attributeName = &HTMLNames::citeAttr;
    else if (element->hasTagName(HTMLNames::linkTag)) {
        // If the link element is not css, ignore it. The type attribute is
        // optional on <link>, so a link whose rel attribute is "stylesheet"
        // is accepted as well; otherwise style sheets declared without an
        // explicit type would be silently dropped.
        if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")
            || equalIgnoringCase(element->getAttribute(HTMLNames::relAttr), "stylesheet")) {
            // FIXME: Add support for extracting links of sub-resources which
            // are inside style-sheet such as @import, @font-face, url(), etc.
            attributeName = &HTMLNames::hrefAttr;
        }
    } else if (element->hasTagName(HTMLNames::objectTag))
        attributeName = &HTMLNames::dataAttr;
    else if (element->hasTagName(HTMLNames::embedTag))
        attributeName = &HTMLNames::srcAttr;

    if (!attributeName)
        return KURL();

    String value = element->getAttribute(*attributeName);
    // Ignore javascript content.
    if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
        return KURL();

    return element->document()->completeURL(value);
}
// Records what |element| contributes to the set of things to save.
//
// An iframe/frame/object/embed element that owns a content frame has that
// frame queued in |framesToVisit| (unless it is already in |visitedFrames|)
// and is not inspected any further. Any other element has its sub-resource
// URL, if it has one with an accepted protocol, appended to |resourceURLs|
// when not already present. |frameURLs| is not used by this function.
void retrieveResourcesForElement(Element* element,
                                 Vector<Frame*>* visitedFrames,
                                 Vector<Frame*>* framesToVisit,
                                 Vector<KURL>* frameURLs,
                                 Vector<KURL>* resourceURLs)
{
    // Frames are not handled here: they are queued and processed later by
    // retrieveResourcesForFrame.
    const bool hasFrameCapableTag = element->hasTagName(HTMLNames::iframeTag)
        || element->hasTagName(HTMLNames::frameTag)
        || element->hasTagName(HTMLNames::objectTag)
        || element->hasTagName(HTMLNames::embedTag);
    if (hasFrameCapableTag && element->isFrameOwnerElement()) {
        Frame* contentFrame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame();
        if (contentFrame) {
            const bool alreadyVisited = visitedFrames->contains(contentFrame);
            if (!alreadyVisited)
                framesToVisit->append(contentFrame);
            return;
        }
        // No content frame: fall through and treat it as a regular element.
    }

    const KURL resourceURL = getSubResourceURLFromElement(element);
    if (resourceURL.isEmpty() || !resourceURL.isValid())
        return; // This element has no sub-resource.

    // Ignore URLs that use a non-standard protocol. Since the FTP protocol
    // does not have a cache mechanism, we skip it as well.
    const bool hasAcceptedProtocol = resourceURL.protocolInHTTPFamily() || resourceURL.isLocalFile();
    if (!hasAcceptedProtocol)
        return;

    if (resourceURLs->contains(resourceURL))
        return;
    resourceURLs->append(resourceURL);
}
// Collects the URL of |frame| and the resources of every element in its
// document.
//
// The frame is skipped entirely when its request URL is invalid, when the
// URL's protocol matches none of |supportedSchemes|, or when the frame is
// already listed in |visitedFrames|. Otherwise the frame is marked visited,
// its URL is added to |frameURLs| (without duplicates) and each element of
// the document is handed to retrieveResourcesForElement.
void retrieveResourcesForFrame(Frame* frame,
                               const WebKit::WebVector<WebKit::WebCString>& supportedSchemes,
                               Vector<Frame*>* visitedFrames,
                               Vector<Frame*>* framesToVisit,
                               Vector<KURL>* frameURLs,
                               Vector<KURL>* resourceURLs)
{
    KURL frameURL = frame->loader()->documentLoader()->request().url();

    // A frame with an invalid URL is not retrievable, so skip it.
    if (!frameURL.isValid())
        return;

    // Skip frames whose scheme is not supported: scan for the first
    // matching scheme and bail out if the scan runs off the end.
    size_t schemeIndex = 0;
    while (schemeIndex < supportedSchemes.size()
        && !frameURL.protocolIs(static_cast<CString>(supportedSchemes[schemeIndex]).data()))
        ++schemeIndex;
    if (schemeIndex == supportedSchemes.size())
        return;

    // Each frame is processed at most once.
    if (visitedFrames->contains(frame))
        return;
    visitedFrames->append(frame);

    if (!frameURLs->contains(frameURL))
        frameURLs->append(frameURL);

    // Walk the document's nodes and collect resources from the elements.
    RefPtr<HTMLAllCollection> documentNodes = frame->document()->all();
    for (unsigned index = 0; index < documentNodes->length(); ++index) {
        Node* current = documentNodes->item(index);
        // Only element nodes can reference resources.
        if (current->isElementNode()) {
            retrieveResourcesForElement(static_cast<Element*>(current),
                                        visitedFrames, framesToVisit,
                                        frameURLs, resourceURLs);
        }
    }
}
} // namespace
namespace WebKit {
// Serializes |frame| by delegating to a WebPageSerializerImpl constructed
// with exactly the arguments given here, and returns that implementation's
// result.
bool WebPageSerializer::serialize(WebFrame* frame,
                                  bool recursive,
                                  WebPageSerializerClient* client,
                                  const WebVector<WebURL>& links,
                                  const WebVector<WebString>& localPaths,
                                  const WebString& localDirectoryName)
{
    WebPageSerializerImpl impl(frame,
                               recursive,
                               client,
                               links,
                               localPaths,
                               localDirectoryName);
    const bool succeeded = impl.serialize();
    return succeeded;
}
// Gathers the URLs of all frames and sub-resources reachable from the main
// frame of |view|.
//
// Returns false, leaving the output vectors untouched, when the view has no
// main frame. Otherwise fills |resourceURLs| with the collected resource
// URLs and |frameURLs| with the collected frame URLs, and returns true. A
// URL that was collected both as a frame and as a resource is reported only
// in |resourceURLs|.
bool WebPageSerializer::retrieveAllResources(WebView* view,
                                             const WebVector<WebCString>& supportedSchemes,
                                             WebVector<WebURL>* resourceURLs,
                                             WebVector<WebURL>* frameURLs)
{
    WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame());
    if (!mainFrame)
        return false;

    Vector<Frame*> pendingFrames;
    Vector<Frame*> seenFrames;
    Vector<KURL> collectedFrameURLs;
    Vector<KURL> collectedResourceURLs;

    // Process frames in first-in, first-out order, starting with the main
    // frame. Visiting a frame may queue further frames.
    pendingFrames.append(mainFrame->frame());
    do {
        Frame* current = pendingFrames[0];
        pendingFrames.remove(0);
        retrieveResourcesForFrame(current, supportedSchemes,
                                  &seenFrames, &pendingFrames,
                                  &collectedFrameURLs, &collectedResourceURLs);
    } while (!pendingFrames.isEmpty());

    // A frame's src can point to the same URL as another resource; in that
    // case keep only the resource entry.
    for (size_t i = 0; i < collectedResourceURLs.size(); ++i) {
        size_t duplicate = collectedFrameURLs.find(collectedResourceURLs[i]);
        if (duplicate != notFound)
            collectedFrameURLs.remove(duplicate);
    }

    // Convert both result sets to WebURLs for the caller.
    WebVector<WebURL> convertedResources(collectedResourceURLs.size());
    for (size_t i = 0; i < collectedResourceURLs.size(); ++i)
        convertedResources[i] = collectedResourceURLs[i];
    *resourceURLs = convertedResources;

    WebVector<WebURL> convertedFrames(collectedFrameURLs.size());
    for (size_t i = 0; i < collectedFrameURLs.size(); ++i)
        convertedFrames[i] = collectedFrameURLs[i];
    *frameURLs = convertedFrames;

    return true;
}
// Builds a <meta http-equiv="Content-Type"> tag declaring |charset| as the
// character set of a text/html document. |charset| is inserted verbatim.
WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
{
    const String charsetName = static_cast<const String&>(charset);
    String declaration = makeString(
        "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=",
        charsetName,
        "\">");
    return declaration;
}
// Builds the "saved from url" HTML comment for |url|: the length of the URL
// spec, zero-padded to at least four digits and wrapped in parentheses,
// followed by the spec itself. The comment is surrounded by newlines.
WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
{
    const WebCString& spec = url.spec();
    const int specLength = static_cast<int>(spec.length());
    return String::format("\n<!-- saved from url=(%04d)%s -->\n", specLength, spec.data());
}
// Builds a <base> tag whose href is "." and, when |baseTarget| is not
// empty, whose target attribute is |baseTarget| (inserted verbatim).
WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
{
    if (!baseTarget.isEmpty()) {
        return makeString("<base href=\".\" target=\"",
                          static_cast<const String&>(baseTarget),
                          "\">");
    }
    // No target requested: emit the bare tag.
    return makeString("<base href=\".\">");
}
} // namespace WebKit