testing/tools/gold.py - pdfium - Git at Google

 # Copyright 2015 The PDFium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.


 import json
 import os
 import shlex
 import shutil
 import ssl
 import urllib2


 def _ParseKeyValuePairs(kv_str):
   """
   Parses a string of the type 'key1 value1 key2 value2' into a dict.
   """
   kv_pairs = shlex.split(kv_str)
   if len(kv_pairs) % 2:
     raise ValueError('Uneven number of key/value pairs. Got %s' % kv_str)
   return { kv_pairs[i]:kv_pairs[i + 1] for i in xrange(0, len(kv_pairs), 2) }


 # This module downloads a json provided by Skia Gold with the expected baselines
 # for each test file.
 #
 # The expected format for the json is:
 # {
 #   "commit": {
 #     "author": "John Doe (jdoe@chromium.org)",
 #     "commit_time": 1510598123,
 #     "hash": "cee39e6e90c219cc91f2c94a912a06977f4461a0"
 #   },
 #   "master": {
 #     "abc.pdf.1": {
 #       "0ec3d86f545052acd7c9a16fde8ca9d4": 1,
 #       "80455b71673becc9fbc100d6da56ca65": 1,
 #       "b68e2ecb80090b4502ec89ad1be2322c": 1
 #      },
 #     "defgh.pdf.0": {
 #       "01e020cd4cd05c6738e479a46a506044": 1,
 #       "b68e2ecb80090b4502ec89ad1be2322c": 1
 #     }
 #   },
 #   "changeLists": {
 #     "18499" : {
 #       "abc.pdf.1": {
 #         "d5dd649124cf1779152253dc8fb239c5": 1,
 #         "42a270581930579cdb0f28674972fb1a": 1,
 #       }
 #     }
 #   }
 # }
 class GoldBaseline(object):
   """
   Holds the expected image hashes downloaded from Skia Gold and matches
   locally generated hashes against them.

   The baseline json is fetched once, when the instance is constructed. If
   the download or the parsing fails, self._baselines is None and every call
   to MatchLocalResult() returns BASELINE_DOWNLOAD_FAILED.
   """

   # Return values for MatchLocalResult().
   MATCH = 'match'
   MISMATCH = 'mismatch'
   NO_BASELINE = 'no_baseline'
   BASELINE_DOWNLOAD_FAILED = 'baseline_download_failed'

   def __init__(self, properties_str):
     """
     properties_str is a string with space separated key/value pairs that
                is used to find the cl number for which to baseline
     """
     self._properties = _ParseKeyValuePairs(properties_str)
     self._baselines = self._LoadSkiaGoldBaselines()

   def _LoadSkiaGoldBaselines(self):
     """
     Download the baseline json and return its 'master' entry: a dict mapping
     each test case name to a dict whose keys are the expected md5 hashes.
     An empty dict is returned if the json has no 'master' entry.

     If the properties contain an 'issue' value it is appended to the URL.

     Returns None if the download still fails after all retries, if the body
     is not valid json, or if the top level json value is not an object.
     Raises ValueError if the server answers with an unexpected content type.
     """
     GOLD_BASELINE_URL = 'https://pdfium-gold.skia.org/json/baseline'

     # If we have an issue number add it to the baseline URL
     cl_number_str = self._properties.get('issue', None)
     url = GOLD_BASELINE_URL + ('/' + cl_number_str if cl_number_str else '')

     json_data = ''
     MAX_TIMEOUT = 33  # 5 tries. (2, 4, 8, 16, 32)
     timeout = 2
     while True:
       try:
         response = urllib2.urlopen(url, timeout=timeout)
         try:
           c_type = response.headers.get('Content-type', '')
           EXPECTED_CONTENT_TYPE = 'application/json'
           if c_type != EXPECTED_CONTENT_TYPE:
             raise ValueError('Invalid content type. Got %s instead of %s' % (
                 c_type, EXPECTED_CONTENT_TYPE))
           json_data = response.read()
         finally:
           # Release the connection whether or not the read succeeded.
           response.close()
         break  # If this line is reached, then no exception occurred.
       except (ssl.SSLError, urllib2.HTTPError, urllib2.URLError) as e:
         # Retry with a doubled timeout until the limit is exceeded.
         timeout *= 2
         if timeout < MAX_TIMEOUT:
           continue
         print('Error: Unable to read skia gold json from %s: %s' % (url, e))
         return None

     try:
       data = json.loads(json_data)
     except ValueError as e:
       print('Error: Malformed json read from %s: %s' % (url, e))
       return None

     # Valid json that is not an object (e.g. a list or null) has no .get().
     if not isinstance(data, dict):
       print('Error: Unexpected json read from %s: not an object' % url)
       return None

     return data.get('master', {})

   def MatchLocalResult(self, test_name, md5_hash):
     """
     Match a locally generated hash of a test cases rendered image with the
     expected hashes downloaded in the baseline json.

     The baseline is a dict mapping the test case name to a dict with the
     expected hashes as keys. The test case name is looked up first, then the
     hash is matched against the keys of that dict.

     Returns MATCH if the md5 provided matches the ones in the baseline json,
     MISMATCH if it does not, NO_BASELINE if the test case has no baseline, or
     BASELINE_DOWNLOAD_FAILED if the baseline could not be downloaded and
     parsed.
     """
     if self._baselines is None:
       return GoldBaseline.BASELINE_DOWNLOAD_FAILED

     if test_name not in self._baselines:
       return GoldBaseline.NO_BASELINE

     if md5_hash in self._baselines[test_name]:
       return GoldBaseline.MATCH
     return GoldBaseline.MISMATCH


 # This module collects and writes output in a format expected by the
 # Gold baseline tool. Based on meta data provided explicitly and by
 # adding a series of test results it can be used to produce
 # a JSON file that is uploaded to Google Storage and ingested by Gold.
 #
 # The output will look similar to this:
 #
 # {
 #    "build_number" : "2",
 #    "gitHash" : "a4a338179013b029d6dd55e737b5bd648a9fb68c",
 #    "key" : {
 #       "arch" : "arm64",
 #       "compiler" : "Clang",
 #    },
 #    "results" : [
 #       {
 #          "key" : {
 #             "config" : "vk",
 #             "name" : "yuv_nv12_to_rgb_effect",
 #             "source_type" : "gm"
 #          },
 #          "md5" : "7db34da246868d50ab9ddd776ce6d779",
 #          "options" : {
 #             "ext" : "png",
 #             "gamma_correct" : "no"
 #          }
 #       },
 #       {
 #          "key" : {
 #             "config" : "vk",
 #             "name" : "yuv_to_rgb_effect",
 #             "source_type" : "gm"
 #          },
 #          "md5" : "0b955f387740c66eb23bf0e253c80d64",
 #          "options" : {
 #             "ext" : "png",
 #             "gamma_correct" : "no"
 #          }
 #       }
 #    ],
 # }
 #
 class GoldResults(object):
   """
   Collects test results and writes them, together with copies of the
   rendered images, into an output directory.
   """

   def __init__(self, source_type, output_dir, properties_str, key_str,
                ignore_hashes_file):
     """
     source_type is the source_type (=corpus) field used for all results.
     output_dir is the directory where the resulting images are copied and
                the dm.json file is written. If the directory exists it will
                be removed and recreated.
     properties_str is a string with space separated key/value pairs that
                is used to set the top level fields in the output JSON file.
     key_str is a string with space separated key/value pairs that
                is used to set the 'key' field in the output JSON file.
     ignore_hashes_file is a file that contains a list of image hashes
                that should be ignored.
     """
     self._source_type = source_type
     self._properties = _ParseKeyValuePairs(properties_str)
     self._properties['key'] = _ParseKeyValuePairs(key_str)
     self._results = []
     self._passfail = []
     self._output_dir = output_dir

     # make sure the output directory exists and is empty.
     if os.path.exists(output_dir):
       shutil.rmtree(output_dir, ignore_errors=True)
     os.makedirs(output_dir)

     self._ignore_hashes = set()
     if ignore_hashes_file:
       with open(ignore_hashes_file, 'r') as ig_file:
         # One hash per line; surrounding whitespace and blank lines are
         # dropped.
         stripped_lines = (line.strip() for line in ig_file)
         self._ignore_hashes = set(h for h in stripped_lines if h)

   def AddTestResult(self, testName, md5Hash, outputImagePath, matchResult):
     """
     Record the result of one test case.

     testName is the name stored in the 'name' key of the result.
     md5Hash is the hash of the rendered image.
     outputImagePath is the path of the rendered image; its extension is
                stored in the result's 'ext' option.
     matchResult is stored next to testName in the pass/fail list.

     Raises ValueError if the image has to be copied but outputImagePath has
     no file extension.
     """
     # If the hash is in the list of hashes to ignore then we don't
     # make a copy, but add it to the result.
     imgExt = os.path.splitext(outputImagePath)[1].lstrip('.')
     if md5Hash not in self._ignore_hashes:
       # Copy the image to <output_dir>/<md5Hash>.<image_extension>
       if not imgExt:
         raise ValueError('File %s does not have an extension' % outputImagePath)
       newFilePath = os.path.join(self._output_dir, md5Hash + '.' + imgExt)
       shutil.copy2(outputImagePath, newFilePath)

     # Add an entry to the list of test results
     self._results.append({
       'key': {
         'name': testName,
         'source_type': self._source_type,
       },
       'md5': md5Hash,
       'options': {
         'ext': imgExt,
         'gamma_correct': 'no'
       }
     })

     self._passfail.append((testName, matchResult))

   def WriteResults(self):
     """
     Write dm.json (the properties plus all results) and passfail.json (the
     list of test name / match result pairs) into the output directory.
     """
     self._properties.update({
       'results': self._results
     })
     self._WriteJsonFile('dm.json', self._properties)
     self._WriteJsonFile('passfail.json', self._passfail)

   def _WriteJsonFile(self, file_name, data):
     """Serialize data as json, plus a trailing newline, into file_name."""
     output_file_name = os.path.join(self._output_dir, file_name)
     text = json.dumps(data, indent=1) + '\n'
     # The file is opened in binary mode, which only accepts bytes on
     # Python 3, so the text is encoded explicitly. json escapes non-ASCII
     # characters by default, so the encoded bytes are plain ASCII.
     with open(output_file_name, 'wb') as outfile:
       outfile.write(text.encode('utf-8'))

 # Produce example output for manual testing.
 if __name__ == '__main__':
   # Create a test directory with three empty 'image' files.
   test_dir = './testdirectory'
   if not os.path.exists(test_dir):
     os.makedirs(test_dir)
   for image_name in ('image1.png', 'image2.png', 'image3.png'):
     open(os.path.join(test_dir, image_name), 'wb').close()

   # Create an instance and add results.
   prop_str = 'build_number 2 "builder name" Builder-Name gitHash ' \
       'a4a338179013b029d6dd55e737b5bd648a9fb68c'

   key_str = 'arch arm64 compiler Clang configuration Debug'

   # The ignore file is plain text (GoldResults reads it back in text mode),
   # so it is written in text mode; a binary handle rejects str on Python 3.
   hash_file = os.path.join(test_dir, 'ignore_hashes.txt')
   with open(hash_file, 'w') as f:
     f.write('\n'.join(['hash-1', 'hash-4']) + '\n')

   output_dir = './output_directory'
   gr = GoldResults('pdfium', output_dir, prop_str, key_str, hash_file)
   # 'hash-1' is in the ignore file, so image1.png is recorded but not copied.
   gr.AddTestResult('test-1', 'hash-1', os.path.join(test_dir, 'image1.png'),
                    GoldBaseline.MATCH)
   gr.AddTestResult('test-2', 'hash-2', os.path.join(test_dir, 'image2.png'),
                    GoldBaseline.MATCH)
   gr.AddTestResult('test-3', 'hash-3', os.path.join(test_dir, 'image3.png'),
                    GoldBaseline.MISMATCH)
   gr.WriteResults()
	# Copyright 2015 The PDFium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.


	import json
	import os
	import shlex
	import shutil
	import ssl
	import urllib2


	def _ParseKeyValuePairs(kv_str):
	"""
	Parses a string of the type 'key1 value1 key2 value2' into a dict.
	"""
	kv_pairs = shlex.split(kv_str)
	if len(kv_pairs) % 2:
	raise ValueError('Uneven number of key/value pairs. Got %s' % kv_str)
	return { kv_pairs[i]:kv_pairs[i + 1] for i in xrange(0, len(kv_pairs), 2) }


	# This module downloads a json provided by Skia Gold with the expected baselines
	# for each test file.
	#
	# The expected format for the json is:
	# {
	# "commit": {
	# "author": "John Doe (jdoe@chromium.org)",
	# "commit_time": 1510598123,
	# "hash": "cee39e6e90c219cc91f2c94a912a06977f4461a0"
	# },
	# "master": {
	# "abc.pdf.1": {
	# "0ec3d86f545052acd7c9a16fde8ca9d4": 1,
	# "80455b71673becc9fbc100d6da56ca65": 1,
	# "b68e2ecb80090b4502ec89ad1be2322c": 1
	# },
	# "defgh.pdf.0": {
	# "01e020cd4cd05c6738e479a46a506044": 1,
	# "b68e2ecb80090b4502ec89ad1be2322c": 1
	# }
	# },
	# "changeLists": {
	# "18499" : {
	# "abc.pdf.1": {
	# "d5dd649124cf1779152253dc8fb239c5": 1,
	# "42a270581930579cdb0f28674972fb1a": 1,
	# }
	# }
	# }
	# }
	class GoldBaseline(object):
	  """
	  Holds the expected image hashes downloaded from Skia Gold and matches
	  locally generated hashes against them.

	  The baseline json is fetched once, when the instance is constructed. If
	  the download or the parsing fails, self._baselines is None and every call
	  to MatchLocalResult() returns BASELINE_DOWNLOAD_FAILED.
	  """

	  # Return values for MatchLocalResult().
	  MATCH = 'match'
	  MISMATCH = 'mismatch'
	  NO_BASELINE = 'no_baseline'
	  BASELINE_DOWNLOAD_FAILED = 'baseline_download_failed'

	  def __init__(self, properties_str):
	    """
	    properties_str is a string with space separated key/value pairs that
	               is used to find the cl number for which to baseline
	    """
	    self._properties = _ParseKeyValuePairs(properties_str)
	    self._baselines = self._LoadSkiaGoldBaselines()

	  def _LoadSkiaGoldBaselines(self):
	    """
	    Download the baseline json and return its 'master' entry: a dict mapping
	    each test case name to a dict whose keys are the expected md5 hashes.
	    An empty dict is returned if the json has no 'master' entry.

	    If the properties contain an 'issue' value it is appended to the URL.

	    Returns None if the download still fails after all retries, if the body
	    is not valid json, or if the top level json value is not an object.
	    Raises ValueError if the server answers with an unexpected content type.
	    """
	    GOLD_BASELINE_URL = 'https://pdfium-gold.skia.org/json/baseline'

	    # If we have an issue number add it to the baseline URL
	    cl_number_str = self._properties.get('issue', None)
	    url = GOLD_BASELINE_URL + ('/' + cl_number_str if cl_number_str else '')

	    json_data = ''
	    MAX_TIMEOUT = 33  # 5 tries. (2, 4, 8, 16, 32)
	    timeout = 2
	    while True:
	      try:
	        response = urllib2.urlopen(url, timeout=timeout)
	        try:
	          c_type = response.headers.get('Content-type', '')
	          EXPECTED_CONTENT_TYPE = 'application/json'
	          if c_type != EXPECTED_CONTENT_TYPE:
	            raise ValueError('Invalid content type. Got %s instead of %s' % (
	                c_type, EXPECTED_CONTENT_TYPE))
	          json_data = response.read()
	        finally:
	          # Release the connection whether or not the read succeeded.
	          response.close()
	        break  # If this line is reached, then no exception occurred.
	      except (ssl.SSLError, urllib2.HTTPError, urllib2.URLError) as e:
	        # Retry with a doubled timeout until the limit is exceeded.
	        timeout *= 2
	        if timeout < MAX_TIMEOUT:
	          continue
	        print('Error: Unable to read skia gold json from %s: %s' % (url, e))
	        return None

	    try:
	      data = json.loads(json_data)
	    except ValueError as e:
	      print('Error: Malformed json read from %s: %s' % (url, e))
	      return None

	    # Valid json that is not an object (e.g. a list or null) has no .get().
	    if not isinstance(data, dict):
	      print('Error: Unexpected json read from %s: not an object' % url)
	      return None

	    return data.get('master', {})

	  def MatchLocalResult(self, test_name, md5_hash):
	    """
	    Match a locally generated hash of a test cases rendered image with the
	    expected hashes downloaded in the baseline json.

	    The baseline is a dict mapping the test case name to a dict with the
	    expected hashes as keys. The test case name is looked up first, then the
	    hash is matched against the keys of that dict.

	    Returns MATCH if the md5 provided matches the ones in the baseline json,
	    MISMATCH if it does not, NO_BASELINE if the test case has no baseline, or
	    BASELINE_DOWNLOAD_FAILED if the baseline could not be downloaded and
	    parsed.
	    """
	    if self._baselines is None:
	      return GoldBaseline.BASELINE_DOWNLOAD_FAILED

	    if test_name not in self._baselines:
	      return GoldBaseline.NO_BASELINE

	    if md5_hash in self._baselines[test_name]:
	      return GoldBaseline.MATCH
	    return GoldBaseline.MISMATCH


	# This module collects and writes output in a format expected by the
	# Gold baseline tool. Based on meta data provided explicitly and by
	# adding a series of test results it can be used to produce
	# a JSON file that is uploaded to Google Storage and ingested by Gold.
	#
	# The output will look similar to this:
	#
	# {
	# "build_number" : "2",
	# "gitHash" : "a4a338179013b029d6dd55e737b5bd648a9fb68c",
	# "key" : {
	# "arch" : "arm64",
	# "compiler" : "Clang",
	# },
	# "results" : [
	# {
	# "key" : {
	# "config" : "vk",
	# "name" : "yuv_nv12_to_rgb_effect",
	# "source_type" : "gm"
	# },
	# "md5" : "7db34da246868d50ab9ddd776ce6d779",
	# "options" : {
	# "ext" : "png",
	# "gamma_correct" : "no"
	# }
	# },
	# {
	# "key" : {
	# "config" : "vk",
	# "name" : "yuv_to_rgb_effect",
	# "source_type" : "gm"
	# },
	# "md5" : "0b955f387740c66eb23bf0e253c80d64",
	# "options" : {
	# "ext" : "png",
	# "gamma_correct" : "no"
	# }
	# }
	# ],
	# }
	#
	class GoldResults(object):
	  """
	  Collects test results and writes them, together with copies of the
	  rendered images, into an output directory.
	  """

	  def __init__(self, source_type, output_dir, properties_str, key_str,
	               ignore_hashes_file):
	    """
	    source_type is the source_type (=corpus) field used for all results.
	    output_dir is the directory where the resulting images are copied and
	               the dm.json file is written. If the directory exists it will
	               be removed and recreated.
	    properties_str is a string with space separated key/value pairs that
	               is used to set the top level fields in the output JSON file.
	    key_str is a string with space separated key/value pairs that
	               is used to set the 'key' field in the output JSON file.
	    ignore_hashes_file is a file that contains a list of image hashes
	               that should be ignored.
	    """
	    self._source_type = source_type
	    self._properties = _ParseKeyValuePairs(properties_str)
	    self._properties['key'] = _ParseKeyValuePairs(key_str)
	    self._results = []
	    self._passfail = []
	    self._output_dir = output_dir

	    # make sure the output directory exists and is empty.
	    if os.path.exists(output_dir):
	      shutil.rmtree(output_dir, ignore_errors=True)
	    os.makedirs(output_dir)

	    self._ignore_hashes = set()
	    if ignore_hashes_file:
	      with open(ignore_hashes_file, 'r') as ig_file:
	        # One hash per line; surrounding whitespace and blank lines are
	        # dropped.
	        stripped_lines = (line.strip() for line in ig_file)
	        self._ignore_hashes = set(h for h in stripped_lines if h)

	  def AddTestResult(self, testName, md5Hash, outputImagePath, matchResult):
	    """
	    Record the result of one test case.

	    testName is the name stored in the 'name' key of the result.
	    md5Hash is the hash of the rendered image.
	    outputImagePath is the path of the rendered image; its extension is
	               stored in the result's 'ext' option.
	    matchResult is stored next to testName in the pass/fail list.

	    Raises ValueError if the image has to be copied but outputImagePath has
	    no file extension.
	    """
	    # If the hash is in the list of hashes to ignore then we don't
	    # make a copy, but add it to the result.
	    imgExt = os.path.splitext(outputImagePath)[1].lstrip('.')
	    if md5Hash not in self._ignore_hashes:
	      # Copy the image to <output_dir>/<md5Hash>.<image_extension>
	      if not imgExt:
	        raise ValueError('File %s does not have an extension' % outputImagePath)
	      newFilePath = os.path.join(self._output_dir, md5Hash + '.' + imgExt)
	      shutil.copy2(outputImagePath, newFilePath)

	    # Add an entry to the list of test results
	    self._results.append({
	      'key': {
	        'name': testName,
	        'source_type': self._source_type,
	      },
	      'md5': md5Hash,
	      'options': {
	        'ext': imgExt,
	        'gamma_correct': 'no'
	      }
	    })

	    self._passfail.append((testName, matchResult))

	  def WriteResults(self):
	    """
	    Write dm.json (the properties plus all results) and passfail.json (the
	    list of test name / match result pairs) into the output directory.
	    """
	    self._properties.update({
	      'results': self._results
	    })
	    self._WriteJsonFile('dm.json', self._properties)
	    self._WriteJsonFile('passfail.json', self._passfail)

	  def _WriteJsonFile(self, file_name, data):
	    """Serialize data as json, plus a trailing newline, into file_name."""
	    output_file_name = os.path.join(self._output_dir, file_name)
	    text = json.dumps(data, indent=1) + '\n'
	    # The file is opened in binary mode, which only accepts bytes on
	    # Python 3, so the text is encoded explicitly. json escapes non-ASCII
	    # characters by default, so the encoded bytes are plain ASCII.
	    with open(output_file_name, 'wb') as outfile:
	      outfile.write(text.encode('utf-8'))

	# Produce example output for manual testing.
	if __name__ == '__main__':
	  # Create a test directory with three empty 'image' files.
	  test_dir = './testdirectory'
	  if not os.path.exists(test_dir):
	    os.makedirs(test_dir)
	  for image_name in ('image1.png', 'image2.png', 'image3.png'):
	    open(os.path.join(test_dir, image_name), 'wb').close()

	  # Create an instance and add results.
	  prop_str = 'build_number 2 "builder name" Builder-Name gitHash ' \
	      'a4a338179013b029d6dd55e737b5bd648a9fb68c'

	  key_str = 'arch arm64 compiler Clang configuration Debug'

	  # The ignore file is plain text (GoldResults reads it back in text mode),
	  # so it is written in text mode; a binary handle rejects str on Python 3.
	  hash_file = os.path.join(test_dir, 'ignore_hashes.txt')
	  with open(hash_file, 'w') as f:
	    f.write('\n'.join(['hash-1', 'hash-4']) + '\n')

	  output_dir = './output_directory'
	  gr = GoldResults('pdfium', output_dir, prop_str, key_str, hash_file)
	  # 'hash-1' is in the ignore file, so image1.png is recorded but not copied.
	  gr.AddTestResult('test-1', 'hash-1', os.path.join(test_dir, 'image1.png'),
	                   GoldBaseline.MATCH)
	  gr.AddTestResult('test-2', 'hash-2', os.path.join(test_dir, 'image2.png'),
	                   GoldBaseline.MATCH)
	  gr.AddTestResult('test-3', 'hash-3', os.path.join(test_dir, 'image3.png'),
	                   GoldBaseline.MISMATCH)
	  gr.WriteResults()