From 3802767817139ca1d287c15cc266c4e114f5ddeb Mon Sep 17 00:00:00 2001 From: Bert Blommers Date: Thu, 12 Mar 2020 12:25:31 +0000 Subject: [PATCH 1/5] S3 - Add test case to showcase bug when downloading large files --- moto/s3/models.py | 11 +++++- moto/s3/responses.py | 15 ++++++- moto/s3/utils.py | 4 +- tests/test_s3/test_s3.py | 84 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 3 deletions(-) diff --git a/moto/s3/models.py b/moto/s3/models.py index 5a665e27e..67b53b984 100644 --- a/moto/s3/models.py +++ b/moto/s3/models.py @@ -12,6 +12,7 @@ import codecs import random import string import tempfile +import threading import sys import time import uuid @@ -110,6 +111,7 @@ class FakeKey(BaseModel): self._value_buffer = tempfile.SpooledTemporaryFile(max_size=max_buffer_size) self._max_buffer_size = max_buffer_size self.value = value + self.lock = threading.Lock() @property def version_id(self): @@ -117,8 +119,14 @@ class FakeKey(BaseModel): @property def value(self): + self.lock.acquire() + print("===>value") self._value_buffer.seek(0) - return self._value_buffer.read() + print("===>seek") + r = self._value_buffer.read() + print("===>read") + self.lock.release() + return r @value.setter def value(self, new_value): @@ -1319,6 +1327,7 @@ class S3Backend(BaseBackend): return key def get_key(self, bucket_name, key_name, version_id=None, part_number=None): + print("get_key("+str(bucket_name)+","+str(key_name)+","+str(version_id)+","+str(part_number)+")") key_name = clean_key_name(key_name) bucket = self.get_bucket(bucket_name) key = None diff --git a/moto/s3/responses.py b/moto/s3/responses.py index b74be9a63..15b1d1670 100644 --- a/moto/s3/responses.py +++ b/moto/s3/responses.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re import sys +import threading import six from botocore.awsrequest import AWSPreparedRequest @@ -150,6 +151,7 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): self.path = "" self.data = {} self.headers = {} + self.lock = threading.Lock() @property def should_autoescape(self): @@ -857,6 +859,7 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): def _handle_range_header(self, request, headers, response_content): response_headers = {} length = len(response_content) + print("Length: " + str(length) + " Range: " + str(request.headers.get("range"))) last = length - 1 _, rspec = request.headers.get("range").split("=") if "," in rspec: @@ -874,6 +877,7 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): else: return 400, response_headers, "" if begin < 0 or end > last or begin > min(end, last): + print(str(begin)+ " < 0 or " + str(end) + " > " + str(last) + " or " + str(begin) + " > min("+str(end)+","+str(last)+")") return 416, response_headers, "" response_headers["content-range"] = "bytes {0}-{1}/{2}".format( begin, end, length @@ -903,14 +907,20 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): response_content = response else: status_code, response_headers, response_content = response + print("response received: " + str(len(response_content))) + print(request.headers) if status_code == 200 and "range" in request.headers: - return self._handle_range_header( + self.lock.acquire() + r = self._handle_range_header( request, response_headers, response_content ) + self.lock.release() + return r return status_code, response_headers, response_content def _control_response(self, request, full_url, headers): + print("_control_response") parsed_url = urlparse(full_url) query = parse_qs(parsed_url.query, keep_blank_values=True) method = request.method @@ -1058,12 +1068,14 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): ) def _key_response_get(self, bucket_name, query, key_name, headers): + print("_key_response_get("+str(key_name)+","+str(headers)+")") self._set_action("KEY", "GET", query) self._authenticate_and_authorize_s3_action() response_headers = {} if query.get("uploadId"): upload_id = query["uploadId"][0] + print("UploadID: " + str(upload_id)) parts = self.backend.list_multipart(bucket_name, upload_id) template = self.response_template(S3_MULTIPART_LIST_RESPONSE) return ( @@ -1095,6 +1107,7 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): response_headers.update(key.metadata) response_headers.update(key.response_dict) + print("returning 200, " + str(headers) + ", " + str(len(key.value)) + " ( " + str(key_name) + ")") return 200, response_headers, key.value def _key_response_put(self, request, body, bucket_name, query, key_name, headers): diff --git a/moto/s3/utils.py b/moto/s3/utils.py index e22b6b860..50ff1cf34 100644 --- a/moto/s3/utils.py +++ b/moto/s3/utils.py @@ -104,7 +104,9 @@ class _VersionedKeyStore(dict): def get(self, key, default=None): try: return self[key] - except (KeyError, IndexError): + except (KeyError, IndexError) as e: + print("Error retrieving " + str(key)) + print(e) pass return default diff --git a/tests/test_s3/test_s3.py b/tests/test_s3/test_s3.py index 48655ee17..2eef9ef82 100644 --- a/tests/test_s3/test_s3.py +++ b/tests/test_s3/test_s3.py @@ -4393,3 +4393,87 @@ def test_s3_config_dict(): assert not logging_bucket["supplementaryConfiguration"].get( "BucketTaggingConfiguration" ) + + +@mock_s3 +def test_delete_downloaded_file(): + # SET UP + filename = '...' + file = open(filename, 'rb') + uploader = PdfFileUploader(file) + boto3.client('s3').create_bucket(Bucket=uploader.bucket_name()) + uploader.upload() + print("================\nUPLOADED\n=================") + # DOWNLOAD + # the following two lines are basically + # boto3.client('s3').download_file(bucket_name, file_name, local_path) + # where bucket_name, file_name and local_path are retrieved from PdfFileUploader + # e.g. boto3.client('s3').download_file("bucket_name", "asdf.pdf", "/tmp/asdf.pdf") + downloader = PdfFileDownloader(uploader.full_bucket_file_name()) + downloader.download() + + downloader.delete_downloaded_file() + + print("Done!") + + +from pathlib import Path +import re +import os +class PdfFileDownloader: + def __init__(self, full_bucket_file_name): + self.bucket_name, self.file_name = self.extract(full_bucket_file_name) + self.s3 = boto3.client('s3') + + def download(self): + try: + self.s3.download_file(self.bucket_name, self.file_name, self.local_path()) + + return self.local_path() + except ClientError as exc: + print("=======") + print(exc) + raise exc + + def local_path(self): + return '/tmp/' + self.file_name.replace('/', '') + + def delete_downloaded_file(self): + if Path(self.local_path()).is_file(): + print("Removing " + str(self.local_path())) + os.remove(self.local_path()) + + def file(self): + return open(self.local_path(), 'rb') + + def extract(self, full_bucket_file_name): + match = re.search(r'([\.a-zA-Z_-]+)\/(.*)', full_bucket_file_name) + + if match and len(match.groups()) == 2: + return (match.groups()[0], match.groups()[1]) + else: + raise RuntimeError(f"Cannot determine bucket and file name for {full_bucket_file_name}") + + +import binascii +class PdfFileUploader: + def __init__(self, file): + self.file = file + date = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + random_hex = binascii.b2a_hex(os.urandom(16)).decode('ascii') + self.bucket_file_name = f"{date}_{random_hex}.pdf" + + def upload(self): + self.file.seek(0) + boto3.client('s3').upload_fileobj(self.file, self.bucket_name(), self.bucket_file_name) + + return (self.original_file_name(), self.full_bucket_file_name()) + + def original_file_name(self): + return os.path.basename(self.file.name) + + def bucket_name(self): + return 'test_bucket' #os.environ['AWS_BUCKET_NAME'] + + def full_bucket_file_name(self): + return f"{self.bucket_name()}/{self.bucket_file_name}" From d8423b5de0f8770149449b54f7b09ed05419233b Mon Sep 17 00:00:00 2001 From: Bert Blommers Date: Tue, 17 Mar 2020 09:16:12 +0000 Subject: [PATCH 2/5] Optimize content length for large files --- moto/s3/models.py | 14 +++++++------- moto/s3/responses.py | 8 -------- tests/test_s3/test_s3.py | 13 ++----------- 3 files changed, 9 insertions(+), 26 deletions(-) diff --git a/moto/s3/models.py b/moto/s3/models.py index 67b53b984..8c2a86f41 100644 --- a/moto/s3/models.py +++ b/moto/s3/models.py @@ -120,11 +120,9 @@ class FakeKey(BaseModel): @property def value(self): self.lock.acquire() - print("===>value") self._value_buffer.seek(0) - print("===>seek") r = self._value_buffer.read() - print("===>read") + r = copy.copy(r) self.lock.release() return r @@ -138,6 +136,7 @@ class FakeKey(BaseModel): if isinstance(new_value, six.text_type): new_value = new_value.encode(DEFAULT_TEXT_ENCODING) self._value_buffer.write(new_value) + self.contentsize = len(new_value) def copy(self, new_name=None, new_is_versioned=None): r = copy.deepcopy(self) @@ -165,6 +164,7 @@ class FakeKey(BaseModel): self.acl = acl def append_to_value(self, value): + self.contentsize += len(value) self._value_buffer.seek(0, os.SEEK_END) self._value_buffer.write(value) @@ -237,8 +237,7 @@ class FakeKey(BaseModel): @property def size(self): - self._value_buffer.seek(0, os.SEEK_END) - return self._value_buffer.tell() + return self.contentsize @property def storage_class(self): @@ -257,6 +256,7 @@ class FakeKey(BaseModel): state = self.__dict__.copy() state["value"] = self.value del state["_value_buffer"] + del state["lock"] return state def __setstate__(self, state): @@ -266,6 +266,7 @@ class FakeKey(BaseModel): max_size=self._max_buffer_size ) self.value = state["value"] + self.lock = threading.Lock() class FakeMultipart(BaseModel): @@ -292,7 +293,7 @@ class FakeMultipart(BaseModel): etag = etag.replace('"', "") if part is None or part_etag != etag: raise InvalidPart() - if last is not None and len(last.value) < UPLOAD_PART_MIN_SIZE: + if last is not None and last.contentsize < UPLOAD_PART_MIN_SIZE: raise EntityTooSmall() md5s.extend(decode_hex(part_etag)[0]) total.extend(part.value) @@ -1327,7 +1328,6 @@ class S3Backend(BaseBackend): return key def get_key(self, bucket_name, key_name, version_id=None, part_number=None): - print("get_key("+str(bucket_name)+","+str(key_name)+","+str(version_id)+","+str(part_number)+")") key_name = clean_key_name(key_name) bucket = self.get_bucket(bucket_name) key = None diff --git a/moto/s3/responses.py b/moto/s3/responses.py index 15b1d1670..4f38e2a9b 100644 --- a/moto/s3/responses.py +++ b/moto/s3/responses.py @@ -859,7 +859,6 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): def _handle_range_header(self, request, headers, response_content): response_headers = {} length = len(response_content) - print("Length: " + str(length) + " Range: " + str(request.headers.get("range"))) last = length - 1 _, rspec = request.headers.get("range").split("=") if "," in rspec: @@ -877,7 +876,6 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): else: return 400, response_headers, "" if begin < 0 or end > last or begin > min(end, last): - print(str(begin)+ " < 0 or " + str(end) + " > " + str(last) + " or " + str(begin) + " > min("+str(end)+","+str(last)+")") return 416, response_headers, "" response_headers["content-range"] = "bytes {0}-{1}/{2}".format( begin, end, length @@ -907,8 +905,6 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): response_content = response else: status_code, response_headers, response_content = response - print("response received: " + str(len(response_content))) - print(request.headers) if status_code == 200 and "range" in request.headers: self.lock.acquire() @@ -920,7 +916,6 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): return status_code, response_headers, response_content def _control_response(self, request, full_url, headers): - print("_control_response") parsed_url = urlparse(full_url) query = parse_qs(parsed_url.query, keep_blank_values=True) method = request.method @@ -1068,14 +1063,12 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): ) def _key_response_get(self, bucket_name, query, key_name, headers): - print("_key_response_get("+str(key_name)+","+str(headers)+")") self._set_action("KEY", "GET", query) self._authenticate_and_authorize_s3_action() response_headers = {} if query.get("uploadId"): upload_id = query["uploadId"][0] - print("UploadID: " + str(upload_id)) parts = self.backend.list_multipart(bucket_name, upload_id) template = self.response_template(S3_MULTIPART_LIST_RESPONSE) return ( @@ -1107,7 +1100,6 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): response_headers.update(key.metadata) response_headers.update(key.response_dict) - print("returning 200, " + str(headers) + ", " + str(len(key.value)) + " ( " + str(key_name) + ")") return 200, response_headers, key.value def _key_response_put(self, request, body, bucket_name, query, key_name, headers): diff --git a/tests/test_s3/test_s3.py b/tests/test_s3/test_s3.py index 2eef9ef82..7b9f2c726 100644 --- a/tests/test_s3/test_s3.py +++ b/tests/test_s3/test_s3.py @@ -4398,24 +4398,17 @@ def test_s3_config_dict(): @mock_s3 def test_delete_downloaded_file(): # SET UP - filename = '...' + filename = 'some_large_file.pdf' file = open(filename, 'rb') uploader = PdfFileUploader(file) boto3.client('s3').create_bucket(Bucket=uploader.bucket_name()) uploader.upload() - print("================\nUPLOADED\n=================") - # DOWNLOAD - # the following two lines are basically - # boto3.client('s3').download_file(bucket_name, file_name, local_path) - # where bucket_name, file_name and local_path are retrieved from PdfFileUploader - # e.g. boto3.client('s3').download_file("bucket_name", "asdf.pdf", "/tmp/asdf.pdf") + downloader = PdfFileDownloader(uploader.full_bucket_file_name()) downloader.download() downloader.delete_downloaded_file() - print("Done!") - from pathlib import Path import re @@ -4431,8 +4424,6 @@ class PdfFileDownloader: return self.local_path() except ClientError as exc: - print("=======") - print(exc) raise exc def local_path(self): From e2434cbf6f4f939c99fd4d81cbad285702251fd1 Mon Sep 17 00:00:00 2001 From: Bert Blommers Date: Tue, 17 Mar 2020 09:18:38 +0000 Subject: [PATCH 3/5] Remove unnecessary lock --- moto/s3/responses.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/moto/s3/responses.py b/moto/s3/responses.py index 4f38e2a9b..b74be9a63 100644 --- a/moto/s3/responses.py +++ b/moto/s3/responses.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re import sys -import threading import six from botocore.awsrequest import AWSPreparedRequest @@ -151,7 +150,6 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): self.path = "" self.data = {} self.headers = {} - self.lock = threading.Lock() @property def should_autoescape(self): @@ -907,12 +905,9 @@ class ResponseObject(_TemplateEnvironmentMixin, ActionAuthenticatorMixin): status_code, response_headers, response_content = response if status_code == 200 and "range" in request.headers: - self.lock.acquire() - r = self._handle_range_header( + return self._handle_range_header( request, response_headers, response_content ) - self.lock.release() - return r return status_code, response_headers, response_content def _control_response(self, request, full_url, headers): From 5e4736e23392079c20bb283a5ceb2c8e8d6bacf4 Mon Sep 17 00:00:00 2001 From: Bert Blommers Date: Tue, 17 Mar 2020 09:19:57 +0000 Subject: [PATCH 4/5] Remove unnecessary print-statements --- moto/s3/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/moto/s3/utils.py b/moto/s3/utils.py index 50ff1cf34..e22b6b860 100644 --- a/moto/s3/utils.py +++ b/moto/s3/utils.py @@ -104,9 +104,7 @@ class _VersionedKeyStore(dict): def get(self, key, default=None): try: return self[key] - except (KeyError, IndexError) as e: - print("Error retrieving " + str(key)) - print(e) + except (KeyError, IndexError): pass return default From 410d9ee90186d5e83c81f97dc93b6a24faf62b39 Mon Sep 17 00:00:00 2001 From: Bert Blommers Date: Tue, 17 Mar 2020 09:21:33 +0000 Subject: [PATCH 5/5] Remove test that only runs locally --- tests/test_s3/test_s3.py | 75 ---------------------------------------- 1 file changed, 75 deletions(-) diff --git a/tests/test_s3/test_s3.py b/tests/test_s3/test_s3.py index 7b9f2c726..48655ee17 100644 --- a/tests/test_s3/test_s3.py +++ b/tests/test_s3/test_s3.py @@ -4393,78 +4393,3 @@ def test_s3_config_dict(): assert not logging_bucket["supplementaryConfiguration"].get( "BucketTaggingConfiguration" ) - - -@mock_s3 -def test_delete_downloaded_file(): - # SET UP - filename = 'some_large_file.pdf' - file = open(filename, 'rb') - uploader = PdfFileUploader(file) - boto3.client('s3').create_bucket(Bucket=uploader.bucket_name()) - uploader.upload() - - downloader = PdfFileDownloader(uploader.full_bucket_file_name()) - downloader.download() - - downloader.delete_downloaded_file() - - -from pathlib import Path -import re -import os -class PdfFileDownloader: - def __init__(self, full_bucket_file_name): - self.bucket_name, self.file_name = self.extract(full_bucket_file_name) - self.s3 = boto3.client('s3') - - def download(self): - try: - self.s3.download_file(self.bucket_name, self.file_name, self.local_path()) - - return self.local_path() - except ClientError as exc: - raise exc - - def local_path(self): - return '/tmp/' + self.file_name.replace('/', '') - - def delete_downloaded_file(self): - if Path(self.local_path()).is_file(): - print("Removing " + str(self.local_path())) - os.remove(self.local_path()) - - def file(self): - return open(self.local_path(), 'rb') - - def extract(self, full_bucket_file_name): - match = re.search(r'([\.a-zA-Z_-]+)\/(.*)', full_bucket_file_name) - - if match and len(match.groups()) == 2: - return (match.groups()[0], match.groups()[1]) - else: - raise RuntimeError(f"Cannot determine bucket and file name for {full_bucket_file_name}") - - -import binascii -class PdfFileUploader: - def __init__(self, file): - self.file = file - date = datetime.datetime.now().strftime('%Y%m%d%H%M%S') - random_hex = binascii.b2a_hex(os.urandom(16)).decode('ascii') - self.bucket_file_name = f"{date}_{random_hex}.pdf" - - def upload(self): - self.file.seek(0) - boto3.client('s3').upload_fileobj(self.file, self.bucket_name(), self.bucket_file_name) - - return (self.original_file_name(), self.full_bucket_file_name()) - - def original_file_name(self): - return os.path.basename(self.file.name) - - def bucket_name(self): - return 'test_bucket' #os.environ['AWS_BUCKET_NAME'] - - def full_bucket_file_name(self): - return f"{self.bucket_name()}/{self.bucket_file_name}"