Don't store duplicates of files.
If a new file is uploaded, it is matched against previously uploaded files so we don't have to store duplicates. SHA-256 digests are well distributed, and collisions are unlikely enough that two files with the same hash are treated as the same file.
This commit is contained in:
parent
0d577584c3
commit
325e784ccd
|
@ -28,6 +28,7 @@ class File extends Managed_DataObject
|
|||
public $id; // int(4) primary_key not_null
|
||||
public $urlhash; // varchar(64) unique_key
|
||||
public $url; // text
|
||||
public $filehash; // varchar(64) indexed
|
||||
public $mimetype; // varchar(50)
|
||||
public $size; // int(4)
|
||||
public $title; // varchar(191) not 255 because utf8mb4 takes more space
|
||||
|
@ -39,6 +40,7 @@ class File extends Managed_DataObject
|
|||
public $modified; // timestamp() not_null default_CURRENT_TIMESTAMP
|
||||
|
||||
const URLHASH_ALG = 'sha256';
|
||||
const FILEHASH_ALG = 'sha256';
|
||||
|
||||
public static function schemaDef()
|
||||
{
|
||||
|
@ -47,6 +49,7 @@ class File extends Managed_DataObject
|
|||
'id' => array('type' => 'serial', 'not null' => true),
|
||||
'urlhash' => array('type' => 'varchar', 'length' => 64, 'not null' => true, 'description' => 'sha256 of destination URL (url field)'),
|
||||
'url' => array('type' => 'text', 'description' => 'destination URL after following possible redirections'),
|
||||
'filehash' => array('type' => 'varchar', 'length' => 64, 'not null' => false, 'description' => 'sha256 of the file contents, only for locally stored files of course'),
|
||||
'mimetype' => array('type' => 'varchar', 'length' => 50, 'description' => 'mime type of resource'),
|
||||
'size' => array('type' => 'int', 'description' => 'size of resource when available'),
|
||||
'title' => array('type' => 'varchar', 'length' => 191, 'description' => 'title of resource when available'),
|
||||
|
@ -62,6 +65,9 @@ class File extends Managed_DataObject
|
|||
'unique keys' => array(
|
||||
'file_urlhash_key' => array('urlhash'),
|
||||
),
|
||||
'indexes' => array(
|
||||
'file_filehash_idx' => array('filehash'),
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -247,12 +253,7 @@ class File extends Managed_DataObject
|
|||
|
||||
static function filename(Profile $profile, $origname, $mimetype)
|
||||
{
|
||||
try {
|
||||
$ext = common_supported_mime_to_ext($mimetype);
|
||||
} catch (Exception $e) {
|
||||
// We don't support this mimetype, but let's guess the extension
|
||||
$ext = substr(strrchr($mimetype, '/'), 1);
|
||||
}
|
||||
$ext = self::guessMimeExtension($mimetype);
|
||||
|
||||
// Normalize and make the original filename more URL friendly.
|
||||
$origname = basename($origname, ".$ext");
|
||||
|
@ -273,6 +274,17 @@ class File extends Managed_DataObject
|
|||
return $filename;
|
||||
}
|
||||
|
||||
/**
 * Work out a lower-case file extension for a MIME type.
 *
 * common_supported_mime_to_ext() is asked first. If it throws, the
 * extension is guessed as whatever follows the last '/' in the MIME type.
 *
 * @param string $mimetype MIME type to find an extension for
 *
 * @return string the extension, lower-cased
 */
static function guessMimeExtension($mimetype)
{
    try {
        $extension = common_supported_mime_to_ext($mimetype);
    } catch (Exception $unsupported) {
        // Not a mimetype we support, so fall back to guessing from the subtype.
        $fromLastSlash = strrchr($mimetype, '/');
        $extension = substr($fromLastSlash, 1);
    }

    return strtolower($extension);
}
|
||||
|
||||
/**
|
||||
* Validation for as-saved base filenames
|
||||
*/
|
||||
|
@ -464,7 +476,11 @@ class File extends Managed_DataObject
|
|||
|
||||
/**
 * Get the filesystem path of this file, verifying that it exists.
 *
 * The path is built by self::path() from $this->filename.
 *
 * @return string path to the existing file
 *
 * @throws FileNotFoundException if nothing exists at the computed path
 */
public function getPath()
{
    $filepath = self::path($this->filename);

    // No early return before this check: callers catch FileNotFoundException
    // to detect rows whose file has gone missing on disk.
    if (!file_exists($filepath)) {
        throw new FileNotFoundException($filepath);
    }

    return $filepath;
}
|
||||
|
||||
public function getUrl()
|
||||
|
@ -494,6 +510,19 @@ class File extends Managed_DataObject
|
|||
return $file;
|
||||
}
|
||||
|
||||
/**
 * Look up a File by the hash stored in its filehash field.
 *
 * @param string $hashstr String of (preferably lower case) hexadecimal characters, same as result of 'hash_file(...)'.
 *                        It is lower-cased before the lookup.
 * @param string $alg     Hash algorithm name, defaulting to File::FILEHASH_ALG.
 *                        NOTE: this method does not reference it when searching.
 *
 * @return File the matching File
 *
 * @throws NoResultException if find(true) reports no match
 */
static public function getByHash($hashstr, $alg=File::FILEHASH_ALG)
{
    $candidate = new File();
    $candidate->filehash = strtolower($hashstr);

    if ($candidate->find(true)) {
        return $candidate;
    }

    throw new NoResultException($candidate);
}
|
||||
|
||||
public function updateUrl($url)
|
||||
{
|
||||
$file = File::getKV('urlhash', self::hashurl($url));
|
||||
|
|
|
@ -119,7 +119,11 @@ class File_thumbnail extends Managed_DataObject
|
|||
|
||||
/**
 * Get the filesystem path of this thumbnail, verifying that it exists.
 *
 * The path is built by self::path() from $this->filename.
 *
 * @return string path to the existing thumbnail file
 *
 * @throws FileNotFoundException if nothing exists at the computed path
 */
public function getPath()
{
    $filepath = self::path($this->filename);

    // No early return before this check: the upgrade script catches
    // FileNotFoundException to delete thumbnails whose file is missing.
    if (!file_exists($filepath)) {
        throw new FileNotFoundException($filepath);
    }

    return $filepath;
}
|
||||
|
||||
public function getUrl()
|
||||
|
|
|
@ -253,6 +253,7 @@ $default =
|
|||
'user_quota' => 50000000,
|
||||
'monthly_quota' => 15000000,
|
||||
'uploads' => true,
|
||||
'filename_base' => 'hash', // for new files, choose one: 'upload', 'hash'
|
||||
'show_html' => false, // show (filtered) text/html attachments (and oEmbed HTML etc.). Doesn't affect AJAX calls.
|
||||
'show_thumbs' => true, // show thumbnails in notice lists for uploaded images, and photos and videos linked remotely that provide oEmbed info
|
||||
'process_links' => true, // check linked resources for embeddable photos and videos; this will hit referenced external web sites when processing new messages.
|
||||
|
|
|
@ -42,12 +42,13 @@ class MediaFile
|
|||
var $short_fileurl = null;
|
||||
var $mimetype = null;
|
||||
|
||||
function __construct(Profile $scoped, $filename = null, $mimetype = null)
|
||||
function __construct(Profile $scoped, $filename = null, $mimetype = null, $filehash = null)
|
||||
{
|
||||
$this->scoped = $scoped;
|
||||
|
||||
$this->filename = $filename;
|
||||
$this->mimetype = $mimetype;
|
||||
$this->filehash = $filehash;
|
||||
$this->fileRecord = $this->storeFile();
|
||||
|
||||
$this->fileurl = common_local_url('attachment',
|
||||
|
@ -90,6 +91,24 @@ class MediaFile
|
|||
|
||||
protected function storeFile()
|
||||
{
|
||||
$filepath = File::path($this->filename);
|
||||
if (!empty($this->filename) && $this->filehash === null) {
|
||||
// Calculate if we have an older upload method somewhere (Qvitter) that
|
||||
// doesn't do this before calling new MediaFile on its local files...
|
||||
$this->filehash = hash_file(File::FILEHASH_ALG, $filepath);
|
||||
if ($this->filehash === false) {
|
||||
throw new ServerException('Could not read file for hashing');
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
$file = File::getByHash($this->filehash);
|
||||
// We're done here. Yes. Already. We assume sha256 won't collide on us anytime soon.
|
||||
return $file;
|
||||
} catch (NoResultException $e) {
|
||||
// Well, let's just continue below.
|
||||
}
|
||||
|
||||
$fileurl = File::url($this->filename);
|
||||
|
||||
$file = new File;
|
||||
|
@ -97,11 +116,15 @@ class MediaFile
|
|||
$file->filename = $this->filename;
|
||||
$file->urlhash = File::hashurl($fileurl);
|
||||
$file->url = $fileurl;
|
||||
$filepath = File::path($this->filename);
|
||||
$file->filehash = $this->filehash;
|
||||
$file->size = filesize($filepath);
|
||||
if ($file->size === false) {
|
||||
throw new ServerException('Could not read file to get its size');
|
||||
}
|
||||
$file->date = time();
|
||||
$file->mimetype = $this->mimetype;
|
||||
|
||||
|
||||
$file_id = $file->insert();
|
||||
|
||||
if ($file_id===false) {
|
||||
|
@ -206,14 +229,35 @@ class MediaFile
|
|||
throw new ClientException(_('System error uploading file.'));
|
||||
}
|
||||
|
||||
// TODO: Make documentation clearer that this won't work for files >2GiB because
|
||||
// PHP is stupid in its 32bit head. But no one accepts 2GiB files with PHP
|
||||
// anyway... I hope.
|
||||
$filehash = hash_file(File::FILEHASH_ALG, $_FILES[$param]['tmp_name']);
|
||||
|
||||
try {
|
||||
$file = File::getByHash($filehash);
|
||||
// If no exception is thrown the file exists locally, so we'll use that and just add redirections.
|
||||
$filename = $file->filename;
|
||||
$mimetype = $file->mimetype;
|
||||
|
||||
} catch (NoResultException $e) {
|
||||
// We have to save the upload as a new local file. This is the normal course of action.
|
||||
|
||||
// Throws exception if additional size does not respect quota
|
||||
// This test is only needed, of course, if we're uploading something new.
|
||||
File::respectsQuota($scoped, $_FILES[$param]['size']);
|
||||
|
||||
$mimetype = self::getUploadedMimeType($_FILES[$param]['tmp_name'],
|
||||
$_FILES[$param]['name']);
|
||||
$mimetype = self::getUploadedMimeType($_FILES[$param]['tmp_name'], $_FILES[$param]['name']);
|
||||
|
||||
switch (common_config('attachments', 'filename_base')) {
|
||||
case 'upload':
|
||||
$basename = basename($_FILES[$param]['name']);
|
||||
$filename = File::filename($scoped, $basename, $mimetype);
|
||||
break;
|
||||
case 'hash':
|
||||
default:
|
||||
$filename = strtolower($filehash) . '.' . File::guessMimeExtension($mimetype);
|
||||
}
|
||||
$filepath = File::path($filename);
|
||||
|
||||
$result = move_uploaded_file($_FILES[$param]['tmp_name'], $filepath);
|
||||
|
@ -223,20 +267,35 @@ class MediaFile
|
|||
// TRANS: not be moved from the temporary folder to the permanent file location.
|
||||
throw new ClientException(_('File could not be moved to destination directory.'));
|
||||
}
|
||||
}
|
||||
|
||||
return new MediaFile($scoped, $filename, $mimetype);
|
||||
return new MediaFile($scoped, $filename, $mimetype, $filehash);
|
||||
}
|
||||
|
||||
static function fromFilehandle($fh, Profile $scoped) {
|
||||
|
||||
$stream = stream_get_meta_data($fh);
|
||||
// So far we're only handling filehandles originating from tmpfile(),
|
||||
// so we can always do hash_file on $stream['uri'] as far as I can tell!
|
||||
$filehash = hash_file(File::FILEHASH_ALG, $stream['uri']);
|
||||
|
||||
try {
|
||||
$file = File::getByHash($filehash);
|
||||
// Already have it, so let's reuse the locally stored File
|
||||
$filename = $file->filename;
|
||||
$mimetype = $file->mimetype;
|
||||
} catch (NoResultException $e) {
|
||||
File::respectsQuota($scoped, filesize($stream['uri']));
|
||||
|
||||
$mimetype = self::getUploadedMimeType($stream['uri']);
|
||||
|
||||
switch (common_config('attachments', 'filename_base')) {
|
||||
case 'upload':
|
||||
$filename = File::filename($scoped, "email", $mimetype);
|
||||
|
||||
break;
|
||||
case 'hash':
|
||||
default:
|
||||
$filename = strtolower($filehash) . '.' . File::guessMimeExtension($mimetype);
|
||||
}
|
||||
$filepath = File::path($filename);
|
||||
|
||||
$result = copy($stream['uri'], $filepath) && chmod($filepath, 0664);
|
||||
|
@ -247,8 +306,9 @@ class MediaFile
|
|||
throw new ClientException(_('File could not be moved to destination directory.' .
|
||||
$stream['uri'] . ' ' . $filepath));
|
||||
}
|
||||
}
|
||||
|
||||
return new MediaFile($scoped, $filename, $mimetype);
|
||||
return new MediaFile($scoped, $filename, $mimetype, $filehash);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -48,6 +48,7 @@ function main()
|
|||
fixupFileGeometry();
|
||||
deleteLocalFileThumbnailsWithoutFilename();
|
||||
deleteMissingLocalFileThumbnails();
|
||||
setFilehashOnLocalFiles();
|
||||
|
||||
initGroupProfileId();
|
||||
initLocalGroup();
|
||||
|
@ -490,7 +491,9 @@ function deleteMissingLocalFileThumbnails()
|
|||
// Checking if there were any File_thumbnail entries without filename
|
||||
if ($thumbs->find()) {
|
||||
while ($thumbs->fetch()) {
|
||||
if (!file_exists(File_thumbnail::path($thumbs->filename))) {
|
||||
try {
|
||||
$thumbs->getPath();
|
||||
} catch (FileNotFoundException $e) {
|
||||
$thumbs->delete();
|
||||
}
|
||||
}
|
||||
|
@ -499,4 +502,30 @@ function deleteMissingLocalFileThumbnails()
|
|||
printfnq("DONE.\n");
|
||||
}
|
||||
|
||||
/**
 * Files are now stored with their hash, so generate it for previously
 * uploaded files.
 *
 * Selects File rows that have a filename but no filehash, hashes each file
 * found via getPath() with File::FILEHASH_ALG and saves the result. Rows
 * whose file is missing or cannot be hashed are reported with a warning and
 * left unchanged.
 */
function setFilehashOnLocalFiles()
{
    printfnq('Ensuring all local files have the filehash field set...');

    $file = new File();
    $file->whereAdd('filename IS NOT NULL');        // local files
    $file->whereAdd('filehash IS NULL', 'AND');     // without filehash value

    if ($file->find()) {
        while ($file->fetch()) {
            try {
                $orig = clone($file);
                $filehash = hash_file(File::FILEHASH_ALG, $file->getPath());
                if ($filehash === false) {
                    // hash_file() returns false on failure; never store that as a hash.
                    echo "\n WARNING: file ID {$file->id} could not be read for hashing.";
                    continue;
                }
                $file->filehash = $filehash;
                $file->update($orig);
            } catch (FileNotFoundException $e) {
                echo "\n WARNING: file ID {$file->id} does not exist on path '{$e->path}'. Clean up the file table?";
            }
        }
    }

    printfnq("DONE.\n");
}
|
||||
|
||||
main();
|
||||
|
|
Loading…
Reference in New Issue
Block a user