Commit 2de77018 authored by jcivelli@chromium.org

2011-03-23 Jay Civelli <jcivelli@chromium.org>

        Reviewed by David Levin.

        Relanding:
        Moving the method that is used to retrieve all the resources in
        a page from the Chromium code (dom_operations.cc) to WebKit.
        https://bugs.webkit.org/show_bug.cgi?id=55859

        * WebKit.gyp:
        * public/WebPageSerializer.h:
        * public/WebURL.h:
        (WebKit::operator==):
        (WebKit::operator!=):
        * public/WebVector.h:
        (WebKit::WebVector::contains):
        * src/WebDataSourceImpl.cpp:
        (WebKit::WebDataSourceImpl::WebDataSourceImpl):
        * src/WebPageSerializer.cpp:
        (WebCore::getSubResourceURLFromElement):
        (WebCore::retrieveResourcesForElement):
        (WebCore::retrieveResourcesForFrame):
        (WebKit::WebPageSerializer::retrieveAllResources):
        * tests/WebFrameTest.cpp:
        * tests/WebPageSerializerTest.cpp: Added.
        * tests/data/pageserialization/awesome.png: Added.
        * tests/data/pageserialization/embed_iframe.html: Added.
        * tests/data/pageserialization/object_iframe.html: Added.
        * tests/data/pageserialization/simple_iframe.html: Added.
        * tests/data/pageserialization/simple_page.html: Added.
        * tests/data/pageserialization/top_frame.html: Added.

git-svn-id: http://svn.webkit.org/repository/webkit/trunk@81846 268f45cc-cd09-0410-ab3c-d52691b4dbfc
parent ce99e9f8
2011-03-23 Jay Civelli <jcivelli@chromium.org>
Reviewed by David Levin.
Relanding:
Moving the method that is used to retrieve all the resources in
a page from the Chromium code (dom_operations.cc) to WebKit.
https://bugs.webkit.org/show_bug.cgi?id=55859
* WebKit.gyp:
* public/WebPageSerializer.h:
* public/WebURL.h:
(WebKit::operator==):
(WebKit::operator!=):
* public/WebVector.h:
(WebKit::WebVector::contains):
* src/WebDataSourceImpl.cpp:
(WebKit::WebDataSourceImpl::WebDataSourceImpl):
* src/WebPageSerializer.cpp:
(WebCore::getSubResourceURLFromElement):
(WebCore::retrieveResourcesForElement):
(WebCore::retrieveResourcesForFrame):
(WebKit::WebPageSerializer::retrieveAllResources):
* tests/WebFrameTest.cpp:
* tests/WebPageSerializerTest.cpp: Added.
* tests/data/pageserialization/awesome.png: Added.
* tests/data/pageserialization/embed_iframe.html: Added.
* tests/data/pageserialization/object_iframe.html: Added.
* tests/data/pageserialization/simple_iframe.html: Added.
* tests/data/pageserialization/simple_page.html: Added.
* tests/data/pageserialization/top_frame.html: Added.
2011-03-23 Kenneth Russell <kbr@google.com>
Rolled forward Chromium DEPS again to fix DumpRenderTree build
......
......@@ -817,6 +817,7 @@
'tests/TransparencyWinTest.cpp',
'tests/UniscribeHelperTest.cpp',
'tests/WebFrameTest.cpp',
'tests/WebPageSerializerTest.cpp',
],
}],
['OS=="mac"', {
......
......@@ -34,10 +34,12 @@
#include "WebCommon.h"
namespace WebKit {
class WebCString;
class WebFrame;
class WebPageSerializerClient;
class WebString;
class WebURL;
class WebView;
template <typename T> class WebVector;
// Get html data by serializing all frames of current page with lists
......@@ -69,6 +71,14 @@ public:
const WebVector<WebString>& localPaths,
const WebString& localDirectoryName);
// Retrieve all the resources for the passed view, including the main frame
// and sub-frames. Returns true if all resources were retrieved
// successfully.
WEBKIT_API static bool retrieveAllResources(WebView*,
const WebVector<WebCString>& supportedSchemes,
WebVector<WebURL>* resources,
WebVector<WebURL>* frames);
// FIXME: The following are here for unit testing purposes. Consider
// changing the unit tests instead.
......@@ -80,6 +90,6 @@ public:
WEBKIT_API static WebString generateBaseTagDeclaration(const WebString& baseTarget);
};
} // namespace WebKit
} // namespace WebKit
#endif
......@@ -143,6 +143,16 @@ inline bool operator<(const WebURL& a, const WebURL& b)
return a.spec() < b.spec();
}
// Two WebURLs are considered equal when comparing their spec() strings
// yields 0.
inline bool operator==(const WebURL& a, const WebURL& b)
{
    return a.spec().compare(b.spec()) == 0;
}
// Inequality is defined as the logical negation of operator== for WebURL.
inline bool operator!=(const WebURL& a, const WebURL& b)
{
    const bool areEqual = (a == b);
    return !areEqual;
}
} // namespace WebKit
#endif
......@@ -125,6 +125,15 @@ public:
WEBKIT_ASSERT(i < m_size);
return m_ptr[i];
}
bool contains(const T& value) const
{
for (size_t i = 0; i < m_size; i++) {
if (m_ptr[i] == value)
return true;
}
return false;
}
T* data() { return m_ptr; }
const T* data() const { return m_ptr; }
......@@ -172,6 +181,6 @@ private:
size_t m_size;
};
} // namespace WebKit
} // namespace WebKit
#endif
......@@ -174,7 +174,7 @@ WebDataSourceImpl::WebDataSourceImpl(const ResourceRequest& request, const Subst
// frame, which results in a second data source being created. We want
// to wait to attach the WebPluginLoadObserver to that data source.
if (!request.url().isEmpty()) {
ASSERT(m_nextPluginLoadObserver->url() == request.url());
ASSERT(m_nextPluginLoadObserver->url() == WebURL(request.url()));
m_pluginLoadObserver.set(m_nextPluginLoadObserver);
m_nextPluginLoadObserver = 0;
}
......
......@@ -31,19 +31,152 @@
#include "config.h"
#include "WebPageSerializer.h"
#include "DocumentLoader.h"
#include "Element.h"
#include "Frame.h"
#include "HTMLAllCollection.h"
#include "HTMLFrameOwnerElement.h"
#include "HTMLInputElement.h"
#include "HTMLNames.h"
#include "KURL.h"
#include "Vector.h"
#include "WebCString.h"
#include "WebFrame.h"
#include "WebFrameImpl.h"
#include "WebPageSerializerClient.h"
#include "WebPageSerializerImpl.h"
#include "WebString.h"
#include "WebURL.h"
#include "WebVector.h"
#include "WebView.h"
#include <wtf/text/StringConcatenate.h>
using namespace WebCore;
namespace {
KURL getSubResourceURLFromElement(Element* element)
{
ASSERT(element);
const QualifiedName* attributeName = 0;
if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag))
attributeName = &HTMLNames::srcAttr;
else if (element->hasTagName(HTMLNames::inputTag)) {
HTMLInputElement* input = static_cast<HTMLInputElement*>(element);
if (input->isImageButton())
attributeName = &HTMLNames::srcAttr;
} else if (element->hasTagName(HTMLNames::bodyTag)
|| element->hasTagName(HTMLNames::tableTag)
|| element->hasTagName(HTMLNames::trTag)
|| element->hasTagName(HTMLNames::tdTag))
attributeName = &HTMLNames::backgroundAttr;
else if (element->hasTagName(HTMLNames::blockquoteTag)
|| element->hasTagName(HTMLNames::qTag)
|| element->hasTagName(HTMLNames::delTag)
|| element->hasTagName(HTMLNames::insTag))
attributeName = &HTMLNames::citeAttr;
else if (element->hasTagName(HTMLNames::linkTag)) {
// If the link element is not css, ignore it.
if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) {
// FIXME: Add support for extracting links of sub-resources which
// are inside style-sheet such as @import, @font-face, url(), etc.
attributeName = &HTMLNames::hrefAttr;
}
} else if (element->hasTagName(HTMLNames::objectTag))
attributeName = &HTMLNames::dataAttr;
else if (element->hasTagName(HTMLNames::embedTag))
attributeName = &HTMLNames::srcAttr;
if (!attributeName)
return KURL();
String value = element->getAttribute(*attributeName);
// Ignore javascript content.
if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
return KURL();
return element->document()->completeURL(value);
}
// Records what |element| contributes to the page's resources.
// - If the element owns a content frame, that frame is queued in
//   |framesToVisit| (unless it is already in |visitedFrames|) and nothing
//   else is recorded for the element.
// - Otherwise the element's sub-resource URL, if any, is appended to
//   |resourceURLs| when it is valid, is HTTP-family or a local file, and is
//   not already in the list.
// |frameURLs| is not used by this function.
void retrieveResourcesForElement(Element* element,
                                 Vector<Frame*>* visitedFrames,
                                 Vector<Frame*>* framesToVisit,
                                 Vector<KURL>* frameURLs,
                                 Vector<KURL>* resourceURLs)
{
    // Frames are processed later, in retrieveResourcesForFrame.
    const bool hasFrameLikeTag = element->hasTagName(HTMLNames::iframeTag)
        || element->hasTagName(HTMLNames::frameTag)
        || element->hasTagName(HTMLNames::objectTag)
        || element->hasTagName(HTMLNames::embedTag);
    if (hasFrameLikeTag && element->isFrameOwnerElement()) {
        Frame* contentFrame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame();
        if (contentFrame) {
            if (!visitedFrames->contains(contentFrame))
                framesToVisit->append(contentFrame);
            return;
        }
    }

    KURL subResourceURL = getSubResourceURLFromElement(element);
    if (subResourceURL.isEmpty() || !subResourceURL.isValid())
        return; // No subresource for this node.

    // Ignore URLs that have a non-standard protocol. Since the FTP protocol
    // does not have a cache mechanism, we skip it as well.
    const bool isRetrievable = subResourceURL.protocolInHTTPFamily() || subResourceURL.isLocalFile();
    if (!isRetrievable)
        return;

    // Report each resource only once.
    if (resourceURLs->contains(subResourceURL))
        return;
    resourceURLs->append(subResourceURL);
}
// Collects the resources of a single frame.
// The frame is skipped when its document loader's request URL is invalid,
// when that URL's scheme matches none of |supportedSchemes|, or when the
// frame is already in |visitedFrames|. Otherwise the frame is marked visited,
// its URL is added to |frameURLs| (without duplicates), and every element of
// its document is handed to retrieveResourcesForElement.
void retrieveResourcesForFrame(Frame* frame,
                               const WebKit::WebVector<WebKit::WebCString>& supportedSchemes,
                               Vector<Frame*>* visitedFrames,
                               Vector<Frame*>* framesToVisit,
                               Vector<KURL>* frameURLs,
                               Vector<KURL>* resourceURLs)
{
    KURL frameURL = frame->loader()->documentLoader()->request().url();

    // A frame with an invalid URL is not retrievable; ignore it.
    if (!frameURL.isValid())
        return;

    // Look for the first supported scheme matching the frame's URL; reaching
    // the end of the list means the scheme is unsupported.
    size_t schemeIndex = 0;
    while (schemeIndex < supportedSchemes.size()
           && !frameURL.protocolIs(static_cast<CString>(supportedSchemes[schemeIndex]).data()))
        ++schemeIndex;
    if (schemeIndex == supportedSchemes.size())
        return;

    // Process each frame at most once.
    if (visitedFrames->contains(frame))
        return;
    visitedFrames->append(frame);

    if (!frameURLs->contains(frameURL))
        frameURLs->append(frameURL);

    // Now get the resources associated with each node of the document.
    RefPtr<HTMLAllCollection> allNodes = frame->document()->all();
    for (unsigned nodeIndex = 0; nodeIndex < allNodes->length(); ++nodeIndex) {
        Node* node = allNodes->item(nodeIndex);
        // Only element nodes can reference resources we are interested in.
        if (node->isElementNode()) {
            retrieveResourcesForElement(static_cast<Element*>(node),
                                        visitedFrames, framesToVisit,
                                        frameURLs, resourceURLs);
        }
    }
}
} // namespace
namespace WebKit {
bool WebPageSerializer::serialize(WebFrame* frame,
......@@ -58,6 +191,48 @@ bool WebPageSerializer::serialize(WebFrame* frame,
return serializerImpl.serialize();
}
// Walks the main frame of |view| and every frame reachable from it, filling
// |resourceURLs| with the sub-resource URLs found and |frameURLs| with the
// URLs of the visited frames. A URL that appears in both lists is reported as
// a resource only. Returns false if the view has no main frame, true
// otherwise.
bool WebPageSerializer::retrieveAllResources(WebView* view,
                                             const WebVector<WebCString>& supportedSchemes,
                                             WebVector<WebURL>* resourceURLs,
                                             WebVector<WebURL>* frameURLs)
{
    WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame());
    if (!mainFrame)
        return false;

    Vector<Frame*> pendingFrames;
    Vector<Frame*> processedFrames;
    Vector<KURL> collectedFrameURLs;
    Vector<KURL> collectedResourceURLs;

    // Process frames in FIFO order, starting with the main frame; visiting a
    // frame may queue the frames it contains.
    pendingFrames.append(mainFrame->frame());
    while (!pendingFrames.isEmpty()) {
        Frame* currentFrame = pendingFrames[0];
        pendingFrames.remove(0);
        retrieveResourcesForFrame(currentFrame, supportedSchemes,
                                  &processedFrames, &pendingFrames,
                                  &collectedFrameURLs, &collectedResourceURLs);
    }

    // A frame's src can point to the same URL as another resource; keep the
    // resource URL only in such cases.
    for (size_t i = 0; i < collectedResourceURLs.size(); ++i) {
        size_t frameIndex = collectedFrameURLs.find(collectedResourceURLs[i]);
        if (frameIndex != notFound)
            collectedFrameURLs.remove(frameIndex);
    }

    // Convert the results to WebURLs.
    WebVector<WebURL> resourceResult(collectedResourceURLs.size());
    for (size_t i = 0; i < collectedResourceURLs.size(); ++i)
        resourceResult[i] = collectedResourceURLs[i];
    *resourceURLs = resourceResult;

    WebVector<WebURL> frameResult(collectedFrameURLs.size());
    for (size_t i = 0; i < collectedFrameURLs.size(); ++i)
        frameResult[i] = collectedFrameURLs[i];
    *frameURLs = frameResult;

    return true;
}
WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
{
return makeString("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=", static_cast<const String&>(charset), "\">");
......@@ -77,4 +252,4 @@ WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTar
return makeString("<base href=\".\" target=\"", static_cast<const String&>(baseTarget), "\">");
}
} // namespace WebKit
} // namespace WebKit
......@@ -28,6 +28,8 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <googleurl/src/gurl.h>
#include <gtest/gtest.h>
#include <webkit/support/webkit_support.h>
......
/*
* Copyright (C) 2011 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "WebPageSerializer.h"
#include "WebFrame.h"
#include "WebFrameClient.h"
#include "WebString.h"
#include "WebURL.h"
#include "WebURLRequest.h"
#include "WebURLResponse.h"
#include "WebView.h"
#include <googleurl/src/gurl.h>
#include <gtest/gtest.h>
#include <webkit/support/webkit_support.h>
using namespace WebKit;
namespace {
// WebFrameClient subclass with no overrides. The test fixture only needs an
// instance of it to pass to WebView::initializeMainFrame().
class TestWebFrameClient : public WebFrameClient {
};
// Test fixture for WebPageSerializer::retrieveAllResources. It creates a
// WebView with an initialized main frame for each test and offers helpers to
// register mocked URL loads and to load a URL in the top frame.
class WebPageSerializerTest : public testing::Test {
public:
    // The schemes passed to retrieveAllResources by the tests.
    WebPageSerializerTest() : m_webView(0), m_supportedSchemes(3U)
    {
        m_supportedSchemes[0] = "http";
        m_supportedSchemes[1] = "https";
        m_supportedSchemes[2] = "file";
    }

protected:
    virtual void SetUp()
    {
        // Create and initialize the WebView.
        m_webView = WebView::create(0, 0, 0);
        m_webView->initializeMainFrame(&m_webFrameClient);
    }

    virtual void TearDown()
    {
        webkit_support::UnregisterAllMockedURLs();
        m_webView->close();
    }

    // Registers a mocked load for |url| that is answered with a text/html
    // response backed by |fileName| from the pageserialization test data
    // directory.
    void registerMockedURLLoad(const WebURL& url, const WebString& fileName)
    {
        WebURLResponse response;
        response.initialize();
        response.setMIMEType("text/html");

        std::string filePath = webkit_support::GetWebKitRootDir().utf8();
        filePath.append("/Source/WebKit/chromium/tests/data/pageserialization/");
        filePath.append(fileName.utf8());

        webkit_support::RegisterMockedURL(url, response, WebString::fromUTF8(filePath));
    }

    // Loads |url| in the main frame and serves the pending mocked requests.
    void loadURLInTopFrame(const GURL& url)
    {
        WebURLRequest urlRequest;
        urlRequest.initialize();
        urlRequest.setURL(WebURL(url));
        m_webView->mainFrame()->loadRequest(urlRequest);
        // Make sure any pending request get served.
        webkit_support::ServeAsynchronousMockedRequests();
    }

    // Returns true if |vector| contains the URL spelled by |url|.
    // |url| is a const char* so that string literals can be passed without
    // relying on the deprecated literal-to-char* conversion.
    static bool webVectorContains(const WebVector<WebURL>& vector, const char* url)
    {
        return vector.contains(WebURL(GURL(url)));
    }

    // Useful for debugging.
    static void printWebURLs(const WebVector<WebURL>& urls)
    {
        for (size_t i = 0; i < urls.size(); i++)
            printf("%s\n", urls[i].spec().data());
    }

    WebView* m_webView;
    WebVector<WebCString> m_supportedSchemes;

private:
    TestWebFrameClient m_webFrameClient;
};
// Checks that retrieveAllResources reports, without duplicates, the
// resources referenced by the various HTML nodes of simple_page.html.
TEST_F(WebPageSerializerTest, HTMLNodes)
{
    // Register the mocked frame and load it.
    WebURL topFrameURL = GURL("http://www.test.com");
    registerMockedURLLoad(topFrameURL, WebString::fromUTF8("simple_page.html"));
    loadURLInTopFrame(topFrameURL);

    // Retrieve all resources.
    WebVector<WebURL> frames;
    WebVector<WebURL> resources;
    ASSERT_TRUE(WebPageSerializer::retrieveAllResources(
        m_webView, m_supportedSchemes, &resources, &frames));

    // Tests that all resources from the frame have been retrieved.
    // The expected counts are unsigned literals so that they compare against
    // size() without a signed/unsigned mismatch.
    EXPECT_EQ(1U, frames.size()); // There should be no duplicates.
    EXPECT_TRUE(webVectorContains(frames, "http://www.test.com"));

    EXPECT_EQ(14U, resources.size()); // There should be no duplicates.
    EXPECT_TRUE(webVectorContains(resources, "http://www.example.com/beautifull.css"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/awesome.js"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/bodyBackground.jpg"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/awesome.png"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/imageButton.png"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/tableBackground.png"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/trBackground.png"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/tdBackground.png"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.evene.fr/citations/auteur.php?ida=46"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.brainyquote.com/quotes/authors/c/charles_darwin.html"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/why_deleted.html"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/why_inserted.html"));
    EXPECT_TRUE(webVectorContains(resources, "https://www.secure.com/https.gif"));
    EXPECT_TRUE(webVectorContains(resources, "file://c/my_folder/file.gif"));
}
// Checks that retrieveAllResources reports the frames and resources of a
// page made of several frames (iframe, object and embed).
TEST_F(WebPageSerializerTest, MultipleFrames)
{
    // Register the mocked frames.
    WebURL topFrameURL = GURL("http://www.test.com");
    registerMockedURLLoad(topFrameURL, WebString::fromUTF8("top_frame.html"));
    registerMockedURLLoad(GURL("http://www.test.com/simple_iframe.html"),
                          WebString::fromUTF8("simple_iframe.html"));
    registerMockedURLLoad(GURL("http://www.test.com/object_iframe.html"),
                          WebString::fromUTF8("object_iframe.html"));
    registerMockedURLLoad(GURL("http://www.test.com/embed_iframe.html"),
                          WebString::fromUTF8("embed_iframe.html"));
    // If we don't register a mocked resource for awesome.png, the document
    // loader of the iframe that has it as its src asserts on close; the
    // reason is not known.
    registerMockedURLLoad(GURL("http://www.test.com/awesome.png"),
                          WebString::fromUTF8("awesome.png"));

    loadURLInTopFrame(topFrameURL);

    // Retrieve all resources.
    WebVector<WebURL> frames;
    WebVector<WebURL> resources;
    ASSERT_TRUE(WebPageSerializer::retrieveAllResources(
        m_webView, m_supportedSchemes, &resources, &frames));

    // Tests that all resources from the frame have been retrieved.
    // The expected counts are unsigned literals so that they compare against
    // size() without a signed/unsigned mismatch.
    EXPECT_EQ(4U, frames.size()); // There should be no duplicates.
    EXPECT_TRUE(webVectorContains(frames, "http://www.test.com"));
    EXPECT_TRUE(webVectorContains(frames, "http://www.test.com/simple_iframe.html"));
    EXPECT_TRUE(webVectorContains(frames, "http://www.test.com/object_iframe.html"));
    EXPECT_TRUE(webVectorContains(frames, "http://www.test.com/embed_iframe.html"));

    EXPECT_EQ(5U, resources.size()); // There should be no duplicates.
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/awesome.png"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/innerFrame.png"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/flash.swf"));
    // FIXME: For some reason the following resource is missing on one of the
    // bots, causing the test to fail. Probably a plugin issue.
    // EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/music.mid"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/object.png"));
    EXPECT_TRUE(webVectorContains(resources, "http://www.test.com/embed.png"));
}
}
<html>
<!--
This page is used to test that WebPageSerializer::retrieveAllResources correctly
retrieves the expected resources when dealing with multiple frames.
-->
<body>
<img src="awesome.png"/>
<img src="innerFrame.png"/>
</body>
</html>
<html>
<!--
This page is used to test that WebPageSerializer::retrieveAllResources correctly
retrieves the expected resources from various HTML nodes.
-->
<head>
<!-- Style sheet links should be retrieved -->
<link rel="stylesheet" type="text/css" href="http://www.example.com/beautifull.css"/>
<!-- Other links should NOT be retrieved -->
<link rel="copyright" type="text/plain" href="http://www.example.com/copyright"/>
<!-- Scripts should be retrieved -->
<script src="awesome.js"></script>
</head>
<!-- Images are always retrieved -->
<body background="bodyBackground.jpg">
<!-- Twice to make sure we only report each resource once -->
<img src="awesome.png"/>
<img src="awesome.png"/>
<form>
<input type="image" src="imageButton.png"/>
</form>
<table background="tableBackground.png">
<tr background="trBackground.png">
<td background="tdBackground.png"></td>
</tr>
<tr background="trBackground.png">
<td background="tdBackground.png"></td>
</tr>
</table>
<!-- Some more obscure tags -->
<blockquote cite="http://www.evene.fr/citations/auteur.php?ida=46"></blockquote>
<q CITE="http://www.brainyquote.com/quotes/authors/c/charles_darwin.html"></q>
<p>My favorite color is <del cite="why_deleted.html">blue</del> <ins>red</ins>!</p>
<p>My favorite color is <del>blue</del> <ins cite="why_inserted.html">red</ins>!</p>
<!-- Make sure we only retrieve URLs with the right schemes -->
<img src="https://www.secure.com/https.gif"/> <!-- HTTPS is OK -->