APIs and tests for retrieving raw/decoded data from image objects

Added FPDFImageObj_GetImageDataDecoded() for retrieving the uncompressed
data of an image, and FPDFImageObj_GetImageDataRaw() for retrieving the
raw data of an image.
    * Refactored out DecodeStreamMaybeCopyAndReturnLength(), which is
      used to decode both attachment data and image data.
    * Within DecodeStreamMaybeCopyAndReturnLength(), used a different
      decoder function that handles multiple filters, if present. As a
      result, CPDF_StreamParser::DecodeInlineStream(), which was
      previously made static, is now moved back into a namespace.

Bug=pdfium:677

Change-Id: I22a22c99acaca98ef8c15f88911f2646a2c854d5
Reviewed-on: https://pdfium-review.googlesource.com/9811
Commit-Queue: Jane Liu <janeliulwq@google.com>
Reviewed-by: Lei Zhang <thestig@chromium.org>
diff --git a/fpdfsdk/fpdfattachment.cpp b/fpdfsdk/fpdfattachment.cpp
index 0cb623f..5bdb3bd 100644
--- a/fpdfsdk/fpdfattachment.cpp
+++ b/fpdfsdk/fpdfattachment.cpp
@@ -8,7 +8,6 @@
 #include <utility>
 
 #include "core/fdrm/crypto/fx_crypt.h"
-#include "core/fpdfapi/page/cpdf_streamparser.h"
 #include "core/fpdfapi/parser/cpdf_array.h"
 #include "core/fpdfapi/parser/cpdf_document.h"
 #include "core/fpdfapi/parser/cpdf_name.h"
@@ -273,26 +272,5 @@
   if (!pFileStream)
     return 0;
 
-  uint8_t* data = pFileStream->GetRawData();
-  uint32_t len = pFileStream->GetRawSize();
-  CPDF_Dictionary* pFileDict = pFileStream->GetDict();
-  if (!pFileDict || pFileDict->GetStringFor("Filter").IsEmpty()) {
-    if (buffer && buflen >= len)
-      memcpy(buffer, data, len);
-
-    return len;
-  }
-
-  // Decode the stream if a stream filter is specified.
-  uint8_t* decodedData = nullptr;
-  uint32_t decodedLen = 0;
-  CPDF_StreamParser::DecodeInlineStream(
-      data, len, pFileDict->GetIntegerFor("Width"),
-      pFileDict->GetIntegerFor("Height"), pFileDict->GetStringFor("Filter"),
-      pFileDict->GetDictFor("DecodeParms"), &decodedData, &decodedLen);
-  if (buffer && buflen >= decodedLen)
-    memcpy(buffer, decodedData, decodedLen);
-
-  FX_Free(decodedData);
-  return decodedLen;
+  return DecodeStreamMaybeCopyAndReturnLength(pFileStream, buffer, buflen);
 }
diff --git a/fpdfsdk/fpdfedit_embeddertest.cpp b/fpdfsdk/fpdfedit_embeddertest.cpp
index dcaeb94..f1bbb87 100644
--- a/fpdfsdk/fpdfedit_embeddertest.cpp
+++ b/fpdfsdk/fpdfedit_embeddertest.cpp
@@ -5,6 +5,7 @@
 #include <memory>
 #include <string>
 #include <utility>
+#include <vector>
 
 #include "core/fpdfapi/font/cpdf_font.h"
 #include "core/fpdfapi/page/cpdf_page.h"
@@ -979,3 +980,53 @@
   FPDFBitmap_Destroy(bitmap);
   UnloadPage(page);
 }
+
+// Checks that FPDFImageObj_GetImageDataRaw() and
+// FPDFImageObj_GetImageDataDecoded() report the expected length and MD5 hash
+// for a flate-encoded image and for a DCTDecode-encoded image.
+TEST_F(FPDFEditEmbeddertest, GetImageData) {
+  ASSERT_TRUE(OpenDocument("embedded_images.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+  ASSERT_EQ(39, FPDFPage_CountObject(page));
+
+  // Retrieve an image object with flate-encoded data stream.
+  FPDF_PAGEOBJECT obj = FPDFPage_GetObject(page, 33);
+  ASSERT_EQ(FPDF_PAGEOBJ_IMAGE, FPDFPageObj_GetType(obj));
+
+  // Query the raw data length first, then fetch the data and check its hash.
+  unsigned long len = FPDFImageObj_GetImageDataRaw(obj, nullptr, 0);
+  std::vector<char> buf(len);
+  EXPECT_EQ(4091u, FPDFImageObj_GetImageDataRaw(obj, buf.data(), len));
+  EXPECT_EQ("f73802327d2e88e890f653961bcda81a",
+            GenerateMD5Base16(reinterpret_cast<uint8_t*>(buf.data()), len));
+
+  // Check that the decoded image data has the correct length and hash value.
+  len = FPDFImageObj_GetImageDataDecoded(obj, nullptr, 0);
+  buf.assign(len, '\0');
+  EXPECT_EQ(28776u, FPDFImageObj_GetImageDataDecoded(obj, buf.data(), len));
+  EXPECT_EQ("cb3637934bb3b95a6e4ae1ea9eb9e56e",
+            GenerateMD5Base16(reinterpret_cast<uint8_t*>(buf.data()), len));
+
+  // Retrieve an image object with DCTDecode-encoded data stream.
+  obj = FPDFPage_GetObject(page, 37);
+  ASSERT_EQ(FPDF_PAGEOBJ_IMAGE, FPDFPageObj_GetType(obj));
+
+  // Check that the raw image data has the correct length and hash value.
+  len = FPDFImageObj_GetImageDataRaw(obj, nullptr, 0);
+  buf.assign(len, '\0');
+  EXPECT_EQ(4370u, FPDFImageObj_GetImageDataRaw(obj, buf.data(), len));
+  EXPECT_EQ("6aae1f3710335023a9e12191be66b64b",
+            GenerateMD5Base16(reinterpret_cast<uint8_t*>(buf.data()), len));
+
+  // Check that the decoded image data has the correct length and hash value,
+  // which should be the same as those of the raw data, since this image is
+  // encoded by a single DCTDecode filter and decoding is a noop.
+  len = FPDFImageObj_GetImageDataDecoded(obj, nullptr, 0);
+  buf.assign(len, '\0');
+  EXPECT_EQ(4370u, FPDFImageObj_GetImageDataDecoded(obj, buf.data(), len));
+  EXPECT_EQ("6aae1f3710335023a9e12191be66b64b",
+            GenerateMD5Base16(reinterpret_cast<uint8_t*>(buf.data()), len));
+
+  UnloadPage(page);
+}
diff --git a/fpdfsdk/fpdfeditimg.cpp b/fpdfsdk/fpdfeditimg.cpp
index bfd12b2..0d0c546 100644
--- a/fpdfsdk/fpdfeditimg.cpp
+++ b/fpdfsdk/fpdfeditimg.cpp
@@ -137,3 +137,45 @@
 
   return pBitmap.Leak();
 }
+
+// Passes the stream of |image_object|'s image, along with |buffer| and
+// |buflen|, to DecodeStreamMaybeCopyAndReturnLength() and returns its result.
+// Returns 0 if no image stream can be obtained from |image_object|.
+DLLEXPORT unsigned long STDCALL
+FPDFImageObj_GetImageDataDecoded(FPDF_PAGEOBJECT image_object,
+                                 void* buffer,
+                                 unsigned long buflen) {
+  CPDF_PageObject* page_obj = CPDFPageObjectFromFPDFPageObject(image_object);
+  if (!page_obj || !page_obj->IsImage())
+    return 0;
+
+  CFX_RetainPtr<CPDF_Image> image = page_obj->AsImage()->GetImage();
+  CPDF_Stream* image_stream = image ? image->GetStream() : nullptr;
+  if (!image_stream)
+    return 0;
+
+  return DecodeStreamMaybeCopyAndReturnLength(image_stream, buffer, buflen);
+}
+
+// Copies the raw data of |image_object|'s image stream into |buffer| when
+// |buffer| is non-null and |buflen| can hold it. Returns the raw data size,
+// or 0 if no image stream can be obtained from |image_object|.
+DLLEXPORT unsigned long STDCALL
+FPDFImageObj_GetImageDataRaw(FPDF_PAGEOBJECT image_object,
+                             void* buffer,
+                             unsigned long buflen) {
+  CPDF_PageObject* page_obj = CPDFPageObjectFromFPDFPageObject(image_object);
+  if (!page_obj || !page_obj->IsImage())
+    return 0;
+
+  CFX_RetainPtr<CPDF_Image> image = page_obj->AsImage()->GetImage();
+  CPDF_Stream* image_stream = image ? image->GetStream() : nullptr;
+  if (!image_stream)
+    return 0;
+
+  const uint32_t raw_size = image_stream->GetRawSize();
+  if (buffer && raw_size <= buflen)
+    memcpy(buffer, image_stream->GetRawData(), raw_size);
+
+  return raw_size;
+}
diff --git a/fpdfsdk/fpdfview.cpp b/fpdfsdk/fpdfview.cpp
index 5aa8013..57e4806 100644
--- a/fpdfsdk/fpdfview.cpp
+++ b/fpdfsdk/fpdfview.cpp
@@ -357,10 +357,48 @@
 unsigned long Utf16EncodeMaybeCopyAndReturnLength(const CFX_WideString& text,
                                                   void* buffer,
                                                   unsigned long buflen) {
-  CFX_ByteString encodedText = text.UTF16LE_Encode();
-  unsigned long len = encodedText.GetLength();
+  CFX_ByteString encoded_text = text.UTF16LE_Encode();
+  unsigned long len = encoded_text.GetLength();
   if (buffer && len <= buflen)
-    memcpy(buffer, encodedText.c_str(), len);
+    memcpy(buffer, encoded_text.c_str(), len);
+  return len;
+}
+
+// Decodes |stream| and copies the result into |buffer| when |buffer| is
+// non-null and |buflen| is large enough; returns the length of the result.
+// Falls back to the raw data if no valid filter is set or decoding fails.
+unsigned long DecodeStreamMaybeCopyAndReturnLength(const CPDF_Stream* stream,
+                                                   void* buffer,
+                                                   unsigned long buflen) {
+  ASSERT(stream);
+  uint8_t* data = stream->GetRawData();
+  uint32_t len = stream->GetRawSize();
+  CPDF_Dictionary* dict = stream->GetDict();
+  CPDF_Object* decoder = dict ? dict->GetDirectObjectFor("Filter") : nullptr;
+  if (decoder && (decoder->IsArray() || decoder->IsName())) {
+    // Decode the stream if one or more stream filters are specified.
+    uint8_t* decoded_data = nullptr;
+    uint32_t decoded_len = 0;
+    CFX_ByteString dummy_last_decoder;
+    CPDF_Dictionary* dummy_last_param = nullptr;
+    if (PDF_DataDecode(data, len, dict, dict->GetIntegerFor("DL"), false,
+                       &decoded_data, &decoded_len, &dummy_last_decoder,
+                       &dummy_last_param)) {
+      if (buffer && buflen >= decoded_len)
+        memcpy(buffer, decoded_data, decoded_len);
+
+      // For images with a single image-specific filter, |decoded_data| is
+      // |data| itself; only free it when PDF_DataDecode() allocated it.
+      if (decoded_data != data)
+        FX_Free(decoded_data);
+
+      return decoded_len;
+    }
+  }
+  // No valid filter, or decoding failed: copy the raw data instead.
+  if (buffer && buflen >= len)
+    memcpy(buffer, data, len);
+
   return len;
 }
 
diff --git a/fpdfsdk/fpdfview_c_api_test.c b/fpdfsdk/fpdfview_c_api_test.c
index e47f4d1..d40437c 100644
--- a/fpdfsdk/fpdfview_c_api_test.c
+++ b/fpdfsdk/fpdfview_c_api_test.c
@@ -133,6 +133,8 @@
     CHK(FPDFImageObj_SetMatrix);
     CHK(FPDFImageObj_SetBitmap);
     CHK(FPDFImageObj_GetBitmap);
+    CHK(FPDFImageObj_GetImageDataDecoded);
+    CHK(FPDFImageObj_GetImageDataRaw);
     CHK(FPDFPageObj_CreateNewPath);
     CHK(FPDFPageObj_CreateNewRect);
     CHK(FPDFPath_SetStrokeColor);
diff --git a/fpdfsdk/fsdk_define.h b/fpdfsdk/fsdk_define.h
index 610b854..91efc27 100644
--- a/fpdfsdk/fsdk_define.h
+++ b/fpdfsdk/fsdk_define.h
@@ -26,6 +26,7 @@
 class CPDF_PageObject;
 class CPDF_PageRenderContext;
 class CPDF_PathObject;
+class CPDF_Stream;
 class IFSDK_PAUSE_Adapter;
 
 // Layering prevents fxcrt from knowing about FPDF_FILEACCESS, so this can't
@@ -77,6 +78,10 @@
                                                   void* buffer,
                                                   unsigned long buflen);
 
+unsigned long DecodeStreamMaybeCopyAndReturnLength(const CPDF_Stream* stream,
+                                                   void* buffer,
+                                                   unsigned long buflen);
+
 void FSDK_SetSandBoxPolicy(FPDF_DWORD policy, FPDF_BOOL enable);
 FPDF_BOOL FSDK_IsSandBoxPolicyEnabled(FPDF_DWORD policy);
 void FPDF_RenderPage_Retail(CPDF_PageRenderContext* pContext,