2009-05-12 02:45:00 +09:00
< ? php
/*
2009-08-26 07:14:12 +09:00
* StatusNet - the distributed open-source microblogging tool
2009-08-26 07:12:20 +09:00
* Copyright (C) 2008, 2009, StatusNet, Inc.
2009-05-12 02:45:00 +09:00
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
2013-11-10 22:33:45 +09:00
// Abort immediately unless the GNUSOCIAL constant has been defined by the
// including script; this file is not meant to be executed on its own.
if (!defined('GNUSOCIAL')) {
    exit(1);
}
2009-05-14 03:27:32 +09:00
2009-05-12 02:45:00 +09:00
/**
 * Table Definition for file_redirection
 *
 * Each row maps a URL (keyed by the sha256 hash of that URL) to the id of
 * the File row it resolves to, together with the number of redirects that
 * were followed and the final HTTP status code.
 */
class File_redirection extends Managed_DataObject
{
    ###START_AUTOCODE
    /* the code below is auto generated do not remove the above tag */

    public $__table = 'file_redirection';    // table name
    public $urlhash;                         // varchar(64)  primary_key not_null
    public $url;                             // text
    public $file_id;                         // int(4)
    public $redirections;                    // int(4)
    public $httpcode;                        // int(4)
    public $modified;                        // timestamp()   not_null default_CURRENT_TIMESTAMP

    /* the code above is auto generated do not remove the tag below */
    ###END_AUTOCODE

    protected $file; /* Cache the associated file sometimes */

    /**
     * Describe the file_redirection table: its columns, its primary key
     * (urlhash) and the foreign key from file_id to file.id.
     *
     * @return array schema definition
     */
    public static function schemaDef()
    {
        return array(
            'fields' => array(
                'urlhash' => array('type' => 'varchar', 'length' => 64, 'not null' => true, 'description' => 'sha256 hash of the URL'),
                'url' => array('type' => 'text', 'description' => 'short URL (or any other kind of redirect) for file (id)'),
                'file_id' => array('type' => 'int', 'description' => 'short URL for what URL/file'),
                'redirections' => array('type' => 'int', 'description' => 'redirect count'),
                'httpcode' => array('type' => 'int', 'description' => 'HTTP status code (20x, 30x, etc.)'),
                'modified' => array('type' => 'timestamp', 'not null' => true, 'description' => 'date this record was modified'),
            ),
            'primary key' => array('urlhash'),
            'foreign keys' => array(
                'file_redirection_file_id_fkey' => array('file' => array('file_id' => 'id')),
            ),
        );
    }

    /**
     * Look up the redirection row for a URL by the hash of that URL.
     *
     * Callers in this class expect a NoResultException when no row exists
     * (NOTE(review): thrown by getByPK(), which is defined elsewhere).
     *
     * @param string $url
     * @return File_redirection
     */
    static public function getByUrl($url)
    {
        return self::getByPK(array('urlhash' => File::hashurl($url)));
    }

    /**
     * Build an HTTPClient request for $url with the settings shared by the
     * HEAD and GET probes in lookupWhere().
     *
     * @param string $url    URL to request
     * @param int    $redirs value handed to the client as 'max_redirs'
     * @return HTTPClient configured request, not yet sent
     */
    static function _commonHttp($url, $redirs)
    {
        $request = new HTTPClient($url);
        $request->setConfig(array(
            'connect_timeout' => 10, // # seconds to wait
            'max_redirs' => $redirs, // # max number of http redirections to follow
            'follow_redirects' => false, // We follow redirects ourselves in lib/httpclient.php
            'store_body' => false, // We won't need body content here.
        ));
        return $request;
    }

    /**
     * Check if this URL is a redirect and return redir info.
     *
     * Most code should call File_redirection::where instead, to check if we
     * already know that redirection and avoid extra hits to the web.
     *
     * The URL is hit and any redirects are followed, up to $redirs levels or
     * until a protected URL is reached.
     *
     * @param string  $short_url URL to probe
     * @param int     $redirs    maximum number of redirects to follow
     * @param boolean $protected when true, 'protected' => true is added to the result array
     * @return mixed one of:
     *         false  - if $redirs is negative
     *         string - the input URL, if it has no '://' or the request failed
     *         array  - redirect info, an associative array with the following elements:
     *                      code: HTTP status code
     *                      redirects: count of redirects followed
     *                      url: URL string of final target
     *                      type (optional): MIME type from Content-Type header
     *                      protected (optional): true, only when $protected was passed as true
     *                      size (optional): byte size from Content-Length header
     *                      time (optional): timestamp from Last-Modified header
     */
    static function lookupWhere($short_url, $redirs = 10, $protected = false)
    {
        if ($redirs < 0) return false;

        if (strpos($short_url, '://') === false) {
            return $short_url;
        }
        try {
            $request = self::_commonHttp($short_url, $redirs);
            // Don't include body in output
            $request->setMethod(HTTP_Request2::METHOD_HEAD);
            $response = $request->send();

            if (405 == $response->getStatus() || 204 == $response->getStatus()) {
                // HTTP 405 Unsupported Method
                // Server doesn't support HEAD method? Can this really happen?
                // We'll try again as a GET and ignore the response data.
                //
                // HTTP 204 No Content
                // YFrog sends 204 responses back for our HEAD checks, which
                // seems like it may be a logic error in their servers. If
                // we get a 204 back, re-run it as a GET... if there's really
                // no content it'll be cheap. :)
                $request = self::_commonHttp($short_url, $redirs);
                $response = $request->send();
            } elseif (400 == $response->getStatus()) {
                throw new Exception('Got error 400 on HEAD request, will not go further.');
            }
        } catch (Exception $e) {
            // Invalid URL or failure to reach server
            common_log(LOG_ERR, " Error while following redirects for $short_url : " . $e->getMessage());
            return $short_url;
        }

        // if last url after all redirections is protected,
        // use the url before it in the redirection chain
        if ($response->getRedirectCount() && File::isProtected($response->getEffectiveUrl())) {
            $return_url = $response->redirUrls[$response->getRedirectCount() - 1];
        } else {
            $return_url = $response->getEffectiveUrl();
        }

        $ret = array('code' => $response->getStatus()
                , 'redirects' => $response->getRedirectCount()
                , 'url' => $return_url);

        $type = $response->getHeader('Content-Type');
        if ($type) $ret['type'] = $type;
        if ($protected) $ret['protected'] = true;
        $size = $response->getHeader('Content-Length'); // @fixme bytes?
        if ($size) $ret['size'] = $size;
        $time = $response->getHeader('Last-Modified');
        if ($time) $ret['time'] = strtotime($time);
        return $ret;
    }

    /**
     * Check if this URL is a redirect and return redir info.
     * If a File record is present for this URL, it is not considered a redirect.
     * If a File_redirection record is present for this URL, the recorded target is returned.
     *
     * If no File or File_redirect record is present and $discover is true,
     * the URL is hit via lookupWhere() and the resulting File (and, when the
     * final URL differs from the input, a File_redirection) is stored.
     *
     * A File_redirection row whose File can no longer be found is deleted
     * and the lookup is repeated with the same $discover setting.
     *
     * @param string  $in_url
     * @param boolean $discover true to attempt dereferencing the redirect if we don't know it already
     * @return File_redirection
     */
    static function where($in_url, $discover = true)
    {
        $redir = new File_redirection();
        $redir->url = $in_url;
        $redir->urlhash = File::hashurl($redir->url);
        $redir->redirections = 0;

        try {
            $r = File_redirection::getByUrl($in_url);
            if ($r instanceof File_redirection) {
                // Treat both "exception thrown" and "non-File value returned"
                // as a missing File, whichever way getKV() reports a miss.
                try {
                    $f = File::getKV('id', $r->file_id);
                } catch (NoResultException $e) {
                    $f = null;
                }
                if (!($f instanceof File)) {
                    // Invalid entry, delete and run again
                    common_log(LOG_ERR, " Could not find File with id= " . $r->file_id . " referenced in File_redirection, deleting File redirection entry and creating new File and File_redirection entries. ");
                    $r->delete();
                    // Keep the caller's $discover choice: a lookup that asked
                    // not to hit the network must not start doing so here.
                    return self::where($in_url, $discover);
                }
                $r->file = $f;
                $r->redir_url = $f->url;
                return $r;
            }
        } catch (NoResultException $e) {
            // No redirection known; maybe the URL is itself a known File.
            try {
                $f = File::getByUrl($in_url);
                $redir->file_id = $f->id;
                $redir->file = $f;
                return $redir;
            } catch (NoResultException $e) {
                // Oh well, let's keep going
            }
        }

        if ($discover) {
            $redir_info = File_redirection::lookupWhere($in_url);
            if (is_string($redir_info)) {
                $redir_info = array('url' => $redir_info);
            }

            // Save the file if we don't have it already
            $redir->file = File::saveNew($redir_info, $redir_info['url']);

            // If this is a redirection, save it
            // (if it hasn't been saved yet by some other process while we
            // were running lookupWhere())
            if ($redir_info['url'] != $in_url) {
                try {
                    $file_redir = File_redirection::getByUrl($in_url);
                } catch (NoResultException $e) {
                    $file_redir = new File_redirection();
                    $file_redir->urlhash = File::hashurl($in_url);
                    $file_redir->url = $in_url;
                    $file_redir->file_id = $redir->file->getID();
                    $file_redir->insert();
                    $file_redir->redir_url = $redir->file->url;
                }

                $file_redir->file = $redir->file;
                return $file_redir;
            }
        }

        return $redir;
    }

    /**
     * Shorten a URL with the given user's configured shortening
     * options, if applicable.
     *
     * The URL is canonicalized first. If shortening yields nothing, or
     * yields the canonical URL itself, the original $long_url is returned.
     * No length comparison is made here.
     *
     * If the referenced item has not been seen before, File and
     * File_redirection records may be saved.
     *
     * @param string $long_url
     * @param User $user whose shortening options to use; null is passed
     *                   through to common_shorten_url() (per the original
     *                   documentation that means the current web session user)
     * @return string
     */
    static function makeShort($long_url, $user = null)
    {
        $canon = File_redirection::_canonUrl($long_url);

        $short_url = File_redirection::_userMakeShort($canon, $user);

        // Did we get one? Otherwise fall back to the input.
        return !empty($short_url) ? $short_url : $long_url;
    }

    /**
     * Same as makeShort(), but passes the "force" flag to
     * common_shorten_url().
     *
     * If shortening yields nothing, or yields the canonical URL itself, the
     * original $long_url is returned. No length comparison is made here.
     *
     * If the referenced item has not been seen before, File and
     * File_redirection records may be saved.
     *
     * @param string $long_url
     * @param User $user whose shortening options to use
     * @return string
     */
    static function forceShort($long_url, $user)
    {
        $canon = File_redirection::_canonUrl($long_url);

        $short_url = File_redirection::_userMakeShort($canon, $user, true);

        // Did we get one? Otherwise fall back to the input.
        return !empty($short_url) ? $short_url : $long_url;
    }

    /**
     * Shorten $long_url via common_shorten_url() and, when a different
     * non-empty URL comes back, make sure a File_redirection row exists that
     * maps the short URL to the File of the long URL.
     *
     * @param string  $long_url canonicalized URL
     * @param User    $user     passed through to common_shorten_url()
     * @param boolean $force    passed through to common_shorten_url()
     * @return string|null the short URL, or null if nothing was shortened
     */
    static function _userMakeShort($long_url, User $user = null, $force = false)
    {
        $short_url = common_shorten_url($long_url, $user, $force);
        if (!empty($short_url) && $short_url != $long_url) {
            $short_url = (string)$short_url;
            // store it
            try {
                $file = File::getByUrl($long_url);
            } catch (NoResultException $e) {
                // Check if the target URL is itself a redirect...
                // This should already have happened in processNew in common_shorten_url()
                $redir = File_redirection::where($long_url);
                $file = $redir->file;
            }
            // Now we definitely have a File object in $file
            try {
                $file_redir = File_redirection::getByUrl($short_url);
            } catch (NoResultException $e) {
                $file_redir = new File_redirection();
                $file_redir->urlhash = File::hashurl($short_url);
                $file_redir->url = $short_url;
                $file_redir->file_id = $file->getID();
                $file_redir->insert();
            }
            return $short_url;
        }
        return null;
    }

    /**
     * Basic attempt to canonicalize a URL, cleaning up some standard variants
     * such as funny syntax or a missing path. Used internally when cleaning
     * up URLs for storage and following redirect chains.
     *
     * Note that despite being on File_redirect, this function DOES NOT perform
     * any dereferencing of redirects.
     *
     * For fax: and tel: URLs the characters '.', '-', '(' and ')' are
     * removed. Several other non-HTTP schemes are returned untouched.
     * Anything else lacking a scheme or host gets $default_scheme prepended.
     * ftp/ftps/http/https URLs without a path get a trailing '/'.
     *
     * @param string $in_url input URL
     * @param string $default_scheme if given a bare link; defaults to 'http://'
     * @return string|false canonical URL, or false if the input is empty or
     *                      no usable scheme/host could be determined
     */
    static function _canonUrl($in_url, $default_scheme = 'http://')
    {
        if (empty($in_url)) return false;

        $out_url = $in_url;
        $p = parse_url($out_url);
        if (empty($p['host']) || empty($p['scheme'])) {
            list($scheme) = explode(':', $in_url, 2);
            switch (strtolower($scheme)) {
            case 'fax':
            case 'tel':
                // Remove each separator character individually; a plain
                // string search would only match the literal text ".-()".
                $out_url = str_replace(array('.', '-', '(', ')'), '', $out_url);
                break;

            // non-HTTP schemes, so no redirects
            case 'bitcoin':
            case 'mailto':
            case 'aim':
            case 'jabber':
            case 'xmpp':
                // don't touch anything
                break;

            // URLs without domain name, so no redirects
            case 'magnet':
                // don't touch anything
                break;

            // URLs with coordinates, not browsable domain names
            case 'geo':
                // don't touch anything
                break;

            default:
                $out_url = $default_scheme . ltrim($out_url, '/');
                $p = parse_url($out_url);
                if (empty($p['scheme'])) return false;
                break;
            }
        }

        // parse_url() may have failed or found no scheme (e.g. a bare "tel"),
        // so do not index into $p blindly.
        $parsed_scheme = isset($p['scheme']) ? $p['scheme'] : '';
        if (in_array($parsed_scheme, array('ftp', 'ftps', 'http', 'https'), true)) {
            if (empty($p['host'])) return false;
            if (empty($p['path'])) {
                $out_url .= '/';
            }
        }

        return $out_url;
    }

    /**
     * Insert a new file_redirection row.
     *
     * @param array  $data    redirect info; 'redirects' and 'code' are read
     *                        if present and stored as integers, otherwise 0
     * @param int    $file_id id of the File the URL resolves to
     * @param string $url     the redirecting URL
     * @return void
     */
    static function saveNew($data, $file_id, $url)
    {
        $file_redir = new File_redirection;
        $file_redir->urlhash = File::hashurl($url);
        $file_redir->url = $url;
        $file_redir->file_id = $file_id;
        // Redirect info built from a plain URL string carries neither key.
        $file_redir->redirections = isset($data['redirects']) ? intval($data['redirects']) : 0;
        $file_redir->httpcode = isset($data['code']) ? intval($data['code']) : 0;
        $file_redir->insert();
    }

    /**
     * Schema migration hook: if the existing table does not yet have
     * 'urlhash' as a field that is part of its primary key, add the field
     * without a primary key and fill it with the SHA-256 of the url column.
     *
     * Progress messages are echoed.
     *
     * @return void
     */
    static public function beforeSchemaUpdate()
    {
        $table = strtolower(get_called_class());
        $schema = Schema::get();
        $schemadef = $schema->getTableDef($table);

        // 2015-02-19 We have to upgrade our table definitions to have the urlhash field populated
        if (isset($schemadef['fields']['urlhash'])
                && isset($schemadef['primary key'])
                && in_array('urlhash', $schemadef['primary key'], true)) {
            // We already have the urlhash field, so no need to migrate it.
            return;
        }
        echo " \n Found old $table table, upgrading it to contain 'urlhash' field... ";
        // We have to create a urlhash that is _not_ the primary key,
        // transfer data and THEN run checkSchema
        $schemadef['fields']['urlhash'] = array(
            'type' => 'varchar',
            'length' => 64,
            'not null' => true,
            'description' => 'sha256 hash of the URL',
        );
        $schemadef['fields']['url'] = array(
            'type' => 'text',
            'description' => 'short URL (or any other kind of redirect) for file (id)',
        );
        unset($schemadef['primary key']);
        $schema->ensureTable($table, $schemadef);
        echo " DONE. \n ";

        $classname = ucfirst($table);
        $tablefix = new $classname;
        // urlhash is hash('sha256', $url) in the File table
        echo " Updating urlhash fields in $table table... ";
        // Maybe very MySQL specific :(
        $tablefix->query(sprintf('UPDATE %1$s SET %2$s=%3$s;',
                            $schema->quoteIdentifier($table),
                            'urlhash',
                            // The line below is "result of sha256 on column `url`"
                            'SHA2(url, 256)'));
        echo " DONE. \n ";
        echo " Resuming core schema upgrade... ";
    }

    /**
     * Return the File this redirection points to, loading it by file_id
     * and caching it on first use.
     *
     * @return mixed the cached File, or whatever File::getKV() returned;
     *               the unset cache value if file_id is empty
     */
    public function getFile()
    {
        if (empty($this->file) && $this->file_id) {
            $this->file = File::getKV('id', $this->file_id);
        }

        return $this->file;
    }
}