Don't store duplicates of files.

If a newly uploaded file matches a previously uploaded file, the existing
copy is reused so we don't have to store duplicates. SHA256 hashes are
well-distributed and collisions are unlikely enough to ignore.
This commit is contained in:
Mikael Nordfeldth 2015-02-24 21:11:25 +01:00
parent 0d577584c3
commit 325e784ccd
5 changed files with 160 additions and 37 deletions

View File

@ -28,6 +28,7 @@ class File extends Managed_DataObject
public $id; // int(4) primary_key not_null public $id; // int(4) primary_key not_null
public $urlhash; // varchar(64) unique_key public $urlhash; // varchar(64) unique_key
public $url; // text public $url; // text
public $filehash; // varchar(64) indexed
public $mimetype; // varchar(50) public $mimetype; // varchar(50)
public $size; // int(4) public $size; // int(4)
public $title; // varchar(191) not 255 because utf8mb4 takes more space public $title; // varchar(191) not 255 because utf8mb4 takes more space
@ -39,6 +40,7 @@ class File extends Managed_DataObject
public $modified; // timestamp() not_null default_CURRENT_TIMESTAMP public $modified; // timestamp() not_null default_CURRENT_TIMESTAMP
const URLHASH_ALG = 'sha256'; const URLHASH_ALG = 'sha256';
const FILEHASH_ALG = 'sha256';
public static function schemaDef() public static function schemaDef()
{ {
@ -47,6 +49,7 @@ class File extends Managed_DataObject
'id' => array('type' => 'serial', 'not null' => true), 'id' => array('type' => 'serial', 'not null' => true),
'urlhash' => array('type' => 'varchar', 'length' => 64, 'not null' => true, 'description' => 'sha256 of destination URL (url field)'), 'urlhash' => array('type' => 'varchar', 'length' => 64, 'not null' => true, 'description' => 'sha256 of destination URL (url field)'),
'url' => array('type' => 'text', 'description' => 'destination URL after following possible redirections'), 'url' => array('type' => 'text', 'description' => 'destination URL after following possible redirections'),
'filehash' => array('type' => 'varchar', 'length' => 64, 'not null' => false, 'description' => 'sha256 of the file contents, only for locally stored files of course'),
'mimetype' => array('type' => 'varchar', 'length' => 50, 'description' => 'mime type of resource'), 'mimetype' => array('type' => 'varchar', 'length' => 50, 'description' => 'mime type of resource'),
'size' => array('type' => 'int', 'description' => 'size of resource when available'), 'size' => array('type' => 'int', 'description' => 'size of resource when available'),
'title' => array('type' => 'varchar', 'length' => 191, 'description' => 'title of resource when available'), 'title' => array('type' => 'varchar', 'length' => 191, 'description' => 'title of resource when available'),
@ -62,6 +65,9 @@ class File extends Managed_DataObject
'unique keys' => array( 'unique keys' => array(
'file_urlhash_key' => array('urlhash'), 'file_urlhash_key' => array('urlhash'),
), ),
'indexes' => array(
'file_filehash_idx' => array('filehash'),
),
); );
} }
@ -247,12 +253,7 @@ class File extends Managed_DataObject
static function filename(Profile $profile, $origname, $mimetype) static function filename(Profile $profile, $origname, $mimetype)
{ {
try { $ext = self::guessMimeExtension($mimetype);
$ext = common_supported_mime_to_ext($mimetype);
} catch (Exception $e) {
// We don't support this mimetype, but let's guess the extension
$ext = substr(strrchr($mimetype, '/'), 1);
}
// Normalize and make the original filename more URL friendly. // Normalize and make the original filename more URL friendly.
$origname = basename($origname, ".$ext"); $origname = basename($origname, ".$ext");
@ -273,6 +274,17 @@ class File extends Managed_DataObject
return $filename; return $filename;
} }
/**
 * Pick a lower-case file extension for a MIME type.
 *
 * common_supported_mime_to_ext() is asked first; if it throws, the text
 * following the last '/' in the MIME type string is used as a guess.
 *
 * @param string $mimetype MIME type string
 *
 * @return string lower-cased extension guess
 */
static function guessMimeExtension($mimetype)
{
    try {
        $guess = common_supported_mime_to_ext($mimetype);
    } catch (Exception $unsupported) {
        // Not a supported mimetype, so fall back to its subtype part.
        $subtype = strrchr($mimetype, '/');
        $guess = substr($subtype, 1);
    }

    return strtolower($guess);
}
/** /**
* Validation for as-saved base filenames * Validation for as-saved base filenames
*/ */
@ -464,7 +476,11 @@ class File extends Managed_DataObject
public function getPath() public function getPath()
{ {
return self::path($this->filename); $filepath = self::path($this->filename);
if (!file_exists($filepath)) {
throw new FileNotFoundException($filepath);
}
return $filepath;
} }
public function getUrl() public function getUrl()
@ -494,6 +510,19 @@ class File extends Managed_DataObject
return $file; return $file;
} }
/**
 * Look up a File record by the hash of its contents.
 *
 * @param string $hashstr String of (preferably lower case) hexadecimal characters, same as result of 'hash_file(...)'
 * @param string $alg     Hash algorithm name; not used by the lookup itself, kept for interface compatibility
 *
 * @return File the record whose filehash equals the lower-cased $hashstr
 *
 * @throws NoResultException if $hashstr is empty/not a string, or no File has that filehash
 */
static public function getByHash($hashstr, $alg=File::FILEHASH_ALG)
{
    $file = new File();

    // A failed hash_file() yields false and callers may pass null; neither can
    // match a stored file, so report "no result" without lower-casing it to ''
    // and querying the database for an empty hash.
    if (!is_string($hashstr) || $hashstr === '') {
        throw new NoResultException($file);
    }

    $file->filehash = strtolower($hashstr);
    if (!$file->find(true)) {
        throw new NoResultException($file);
    }
    return $file;
}
public function updateUrl($url) public function updateUrl($url)
{ {
$file = File::getKV('urlhash', self::hashurl($url)); $file = File::getKV('urlhash', self::hashurl($url));

View File

@ -119,7 +119,11 @@ class File_thumbnail extends Managed_DataObject
public function getPath() public function getPath()
{ {
return self::path($this->filename); $filepath = self::path($this->filename);
if (!file_exists($filepath)) {
throw new FileNotFoundException($filepath);
}
return $filepath;
} }
public function getUrl() public function getUrl()

View File

@ -253,6 +253,7 @@ $default =
'user_quota' => 50000000, 'user_quota' => 50000000,
'monthly_quota' => 15000000, 'monthly_quota' => 15000000,
'uploads' => true, 'uploads' => true,
'filename_base' => 'hash', // for new files, choose one: 'upload', 'hash'
'show_html' => false, // show (filtered) text/html attachments (and oEmbed HTML etc.). Doesn't affect AJAX calls. 'show_html' => false, // show (filtered) text/html attachments (and oEmbed HTML etc.). Doesn't affect AJAX calls.
'show_thumbs' => true, // show thumbnails in notice lists for uploaded images, and photos and videos linked remotely that provide oEmbed info 'show_thumbs' => true, // show thumbnails in notice lists for uploaded images, and photos and videos linked remotely that provide oEmbed info
'process_links' => true, // check linked resources for embeddable photos and videos; this will hit referenced external web sites when processing new messages. 'process_links' => true, // check linked resources for embeddable photos and videos; this will hit referenced external web sites when processing new messages.

View File

@ -42,12 +42,13 @@ class MediaFile
var $short_fileurl = null; var $short_fileurl = null;
var $mimetype = null; var $mimetype = null;
function __construct(Profile $scoped, $filename = null, $mimetype = null) function __construct(Profile $scoped, $filename = null, $mimetype = null, $filehash = null)
{ {
$this->scoped = $scoped; $this->scoped = $scoped;
$this->filename = $filename; $this->filename = $filename;
$this->mimetype = $mimetype; $this->mimetype = $mimetype;
$this->filehash = $filehash;
$this->fileRecord = $this->storeFile(); $this->fileRecord = $this->storeFile();
$this->fileurl = common_local_url('attachment', $this->fileurl = common_local_url('attachment',
@ -90,6 +91,24 @@ class MediaFile
protected function storeFile() protected function storeFile()
{ {
$filepath = File::path($this->filename);
if (!empty($this->filename) && $this->filehash === null) {
// Calculate if we have an older upload method somewhere (Qvitter) that
// doesn't do this before calling new MediaFile on its local files...
$this->filehash = hash_file(File::FILEHASH_ALG, $filepath);
if ($this->filehash === false) {
throw new ServerException('Could not read file for hashing');
}
}
try {
$file = File::getByHash($this->filehash);
// We're done here. Yes. Already. We assume sha256 won't collide on us anytime soon.
return $file;
} catch (NoResultException $e) {
// Well, let's just continue below.
}
$fileurl = File::url($this->filename); $fileurl = File::url($this->filename);
$file = new File; $file = new File;
@ -97,11 +116,15 @@ class MediaFile
$file->filename = $this->filename; $file->filename = $this->filename;
$file->urlhash = File::hashurl($fileurl); $file->urlhash = File::hashurl($fileurl);
$file->url = $fileurl; $file->url = $fileurl;
$filepath = File::path($this->filename); $file->filehash = $this->filehash;
$file->size = filesize($filepath); $file->size = filesize($filepath);
if ($file->size === false) {
throw new ServerException('Could not read file to get its size');
}
$file->date = time(); $file->date = time();
$file->mimetype = $this->mimetype; $file->mimetype = $this->mimetype;
$file_id = $file->insert(); $file_id = $file->insert();
if ($file_id===false) { if ($file_id===false) {
@ -206,14 +229,35 @@ class MediaFile
throw new ClientException(_('System error uploading file.')); throw new ClientException(_('System error uploading file.'));
} }
// TODO: Make documentation clearer that this won't work for files >2GiB because
// PHP is stupid in its 32bit head. But no one accepts 2GiB files with PHP
// anyway... I hope.
$filehash = hash_file(File::FILEHASH_ALG, $_FILES[$param]['tmp_name']);
try {
$file = File::getByHash($filehash);
// If no exception is thrown the file exists locally, so we'll use that and just add redirections.
$filename = $file->filename;
$mimetype = $file->mimetype;
} catch (NoResultException $e) {
// We have to save the upload as a new local file. This is the normal course of action.
// Throws exception if additional size does not respect quota // Throws exception if additional size does not respect quota
// This test is only needed, of course, if we're uploading something new.
File::respectsQuota($scoped, $_FILES[$param]['size']); File::respectsQuota($scoped, $_FILES[$param]['size']);
$mimetype = self::getUploadedMimeType($_FILES[$param]['tmp_name'], $mimetype = self::getUploadedMimeType($_FILES[$param]['tmp_name'], $_FILES[$param]['name']);
$_FILES[$param]['name']);
switch (common_config('attachments', 'filename_base')) {
case 'upload':
$basename = basename($_FILES[$param]['name']); $basename = basename($_FILES[$param]['name']);
$filename = File::filename($scoped, $basename, $mimetype); $filename = File::filename($scoped, $basename, $mimetype);
break;
case 'hash':
default:
$filename = strtolower($filehash) . '.' . File::guessMimeExtension($mimetype);
}
$filepath = File::path($filename); $filepath = File::path($filename);
$result = move_uploaded_file($_FILES[$param]['tmp_name'], $filepath); $result = move_uploaded_file($_FILES[$param]['tmp_name'], $filepath);
@ -223,20 +267,35 @@ class MediaFile
// TRANS: not be moved from the temporary folder to the permanent file location. // TRANS: not be moved from the temporary folder to the permanent file location.
throw new ClientException(_('File could not be moved to destination directory.')); throw new ClientException(_('File could not be moved to destination directory.'));
} }
}
return new MediaFile($scoped, $filename, $mimetype); return new MediaFile($scoped, $filename, $mimetype, $filehash);
} }
static function fromFilehandle($fh, Profile $scoped) { static function fromFilehandle($fh, Profile $scoped) {
$stream = stream_get_meta_data($fh); $stream = stream_get_meta_data($fh);
// So far we're only handling filehandles originating from tmpfile(),
// so we can always do hash_file on $stream['uri'] as far as I can tell!
$filehash = hash_file(File::FILEHASH_ALG, $stream['uri']);
try {
$file = File::getByHash($filehash);
// Already have it, so let's reuse the locally stored File
$filename = $file->filename;
$mimetype = $file->mimetype;
} catch (NoResultException $e) {
File::respectsQuota($scoped, filesize($stream['uri'])); File::respectsQuota($scoped, filesize($stream['uri']));
$mimetype = self::getUploadedMimeType($stream['uri']); $mimetype = self::getUploadedMimeType($stream['uri']);
switch (common_config('attachments', 'filename_base')) {
case 'upload':
$filename = File::filename($scoped, "email", $mimetype); $filename = File::filename($scoped, "email", $mimetype);
break;
case 'hash':
default:
$filename = strtolower($filehash) . '.' . File::guessMimeExtension($mimetype);
}
$filepath = File::path($filename); $filepath = File::path($filename);
$result = copy($stream['uri'], $filepath) && chmod($filepath, 0664); $result = copy($stream['uri'], $filepath) && chmod($filepath, 0664);
@ -247,8 +306,9 @@ class MediaFile
throw new ClientException(_('File could not be moved to destination directory.' . throw new ClientException(_('File could not be moved to destination directory.' .
$stream['uri'] . ' ' . $filepath)); $stream['uri'] . ' ' . $filepath));
} }
}
return new MediaFile($scoped, $filename, $mimetype); return new MediaFile($scoped, $filename, $mimetype, $filehash);
} }
/** /**

View File

@ -48,6 +48,7 @@ function main()
fixupFileGeometry(); fixupFileGeometry();
deleteLocalFileThumbnailsWithoutFilename(); deleteLocalFileThumbnailsWithoutFilename();
deleteMissingLocalFileThumbnails(); deleteMissingLocalFileThumbnails();
setFilehashOnLocalFiles();
initGroupProfileId(); initGroupProfileId();
initLocalGroup(); initLocalGroup();
@ -490,7 +491,9 @@ function deleteMissingLocalFileThumbnails()
// Checking if there were any File_thumbnail entries without filename // Checking if there were any File_thumbnail entries without filename
if ($thumbs->find()) { if ($thumbs->find()) {
while ($thumbs->fetch()) { while ($thumbs->fetch()) {
if (!file_exists(File_thumbnail::path($thumbs->filename))) { try {
$thumbs->getPath();
} catch (FileNotFoundException $e) {
$thumbs->delete(); $thumbs->delete();
} }
} }
@ -499,4 +502,30 @@ function deleteMissingLocalFileThumbnails()
printfnq("DONE.\n"); printfnq("DONE.\n");
} }
/*
 * Files are now stored with their hash, so let's generate it for previously uploaded files.
 *
 * Walks every File row that has a filename but no filehash, hashes the file
 * found at its path and saves the result. Rows whose file is missing or
 * cannot be read only produce a warning and are left untouched.
 */
function setFilehashOnLocalFiles()
{
    printfnq('Ensuring all local files have the filehash field set...');

    $file = new File();
    $file->whereAdd('filename IS NOT NULL');        // local files
    $file->whereAdd('filehash IS NULL', 'AND');     // without filehash value

    if ($file->find()) {
        while ($file->fetch()) {
            try {
                $orig = clone($file);
                $filehash = hash_file(File::FILEHASH_ALG, $file->getPath());
                if ($filehash === false) {
                    // hash_file() failed; never store boolean false as the hash.
                    echo "\n WARNING: file ID {$file->id} could not be read for hashing.";
                    continue;
                }
                $file->filehash = $filehash;
                $file->update($orig);
            } catch (FileNotFoundException $e) {
                echo "\n WARNING: file ID {$file->id} does not exist on path '{$e->path}'. Clean up the file table?";
            }
        }
    }

    printfnq("DONE.\n");
}
main(); main();