diff options
-rw-r--r-- | libraries/libuchardet/README | 10 | ||||
-rw-r--r-- | libraries/libuchardet/libuchardet.SlackBuild | 15 | ||||
-rw-r--r-- | libraries/libuchardet/libuchardet.info | 6 | ||||
-rw-r--r-- | libraries/libuchardet/slack-desc | 8 | ||||
-rw-r--r-- | libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch | 116 | ||||
-rw-r--r-- | libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch | 30 |
6 files changed, 168 insertions, 17 deletions
diff --git a/libraries/libuchardet/README b/libraries/libuchardet/README index 59b601ae09..9822181500 100644 --- a/libraries/libuchardet/README +++ b/libraries/libuchardet/README @@ -1,8 +1,8 @@ libuchardet (encoding detector library) -uchardet is a C language binding of the original C++ implementation of the -universal charset detection library by Mozilla. +uchardet uchardet is a C language binding of the original C++ +implementation of the universal charset detection library by Mozilla. -uchardet is an encoding detector library, which takes a sequence of bytes -in an unknown character encoding without any additional information, and attempts -to determine the encoding of the text. +uchardet is an encoding detector library, which takes a sequence of +bytes in an unknown character encoding without any additional +information, and attempts to determine the encoding of the text. diff --git a/libraries/libuchardet/libuchardet.SlackBuild b/libraries/libuchardet/libuchardet.SlackBuild index c03264c73a..f398f3b5e4 100644 --- a/libraries/libuchardet/libuchardet.SlackBuild +++ b/libraries/libuchardet/libuchardet.SlackBuild @@ -1,7 +1,7 @@ #!/bin/sh -# + # Slackware build script for libuchardet. -# + # Copyright 2015 Edinaldo P. Silva, Rio de Janeiro, Brazil. # All rights reserved. # @@ -23,7 +23,7 @@ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PRGNAM=libuchardet -VERSION=${VERSION:-0.0.1} +VERSION=${VERSION:-0.0.5} BUILD=${BUILD:-1} TAG=${TAG:-_SBo} @@ -57,12 +57,13 @@ fi set -e SRCNAM="uchardet" +SRCVER="v0.0.5" rm -rf $PKG mkdir -p $TMP $PKG $OUTPUT rm -rf $TMP/$PRGNAM-$VERSION cd $TMP -tar xvf $CWD/$SRCNAM-$VERSION.tar.gz +tar xvf $CWD/$SRCVER.tar.gz mv $SRCNAM-$VERSION $PRGNAM-$VERSION cd $PRGNAM-$VERSION chown -R root:root . @@ -72,12 +73,16 @@ find -L . \ \( -perm 666 -o -perm 664 -o -perm 640 -o -perm 600 -o -perm 444 \ -o -perm 440 -o -perm 400 \) -exec chmod 644 {} \; +patch -Np1 < $CWD/uchardet-0.0.5-fix-ASCII-detection.patch +patch -Np1 < $CWD/uchardet-0.0.5-use-proper-package-name.patch + cmake \ -DCMAKE_CXX_FLAGS:STRING="$SLKCFLAGS" \ -DCMAKE_INSTALL_PREFIX=/usr \ -DCMAKE_INSTALL_LIBDIR=/usr/lib${LIBDIRSUFFIX} \ . make +#make test make install DESTDIR=$PKG find $PKG -print0 | xargs -0 file | grep -e "executable" -e "shared object" | grep ELF \ @@ -89,7 +94,7 @@ for i in $( find $PKG/usr/man -type l ) ; do ln -s $( readlink $i ).gz $i.gz ; r rm -rf $PKG/usr/share mkdir -p $PKG/usr/doc/$PRGNAM-$VERSION -cp -a AUTHORS COPYING INSTALL $PKG/usr/doc/$PRGNAM-$VERSION +cp -a AUTHORS COPYING INSTALL README.md $PKG/usr/doc/$PRGNAM-$VERSION cat $CWD/$PRGNAM.SlackBuild > $PKG/usr/doc/$PRGNAM-$VERSION/$PRGNAM-SlackBuild mkdir -p $PKG/install diff --git a/libraries/libuchardet/libuchardet.info b/libraries/libuchardet/libuchardet.info index ea48c49ae3..f354bf9af7 100644 --- a/libraries/libuchardet/libuchardet.info +++ b/libraries/libuchardet/libuchardet.info @@ -1,8 +1,8 @@ PRGNAM="libuchardet" -VERSION="0.0.1" +VERSION="0.0.5" HOMEPAGE="https://github.com/BYVoid/uchardet" -DOWNLOAD="http://uchardet.googlecode.com/files/uchardet-0.0.1.tar.gz" -MD5SUM="9c17f0aca38c66c95d400691a9160b1b" +DOWNLOAD="https://github.com/BYVoid/uchardet/archive/v0.0.5.tar.gz" +MD5SUM="2421993e7b098366bd008d81385150b6" DOWNLOAD_x86_64="" MD5SUM_x86_64="" REQUIRES="" diff --git a/libraries/libuchardet/slack-desc b/libraries/libuchardet/slack-desc index e94a9a920f..11f882aabd 100644 --- a/libraries/libuchardet/slack-desc +++ b/libraries/libuchardet/slack-desc @@ -8,10 +8,10 @@ |-----handy-ruler------------------------------------------------------| libuchardet: libuchardet (encoding detector library) libuchardet: -libuchardet: uchardet uchardet is a C language binding of the original C++ -libuchardet: implementation of the universal charset detection library by Mozilla. -libuchardet: uchardet is an encoding detector library, which takes a sequence of -libuchardet: bytes in an unknown character encoding without any additional +libuchardet: uchardet uchardet is a C language binding of the original C++ +libuchardet: implementation of the universal charset detection library by Mozilla. +libuchardet: uchardet is an encoding detector library, which takes a sequence of +libuchardet: bytes in an unknown character encoding without any additional libuchardet: information, and attempts to determine the encoding of the text. libuchardet: libuchardet: Home page: https://github.com/BYVoid/uchardet/ diff --git a/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch b/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch new file mode 100644 index 0000000000..c82aee866e --- /dev/null +++ b/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch @@ -0,0 +1,116 @@ +commit 4c8316f9cfda38d75fb015c0eb40e0eebb03d28f +Author: Jehan <jehan@girinstud.io> +Date: Sat Dec 5 21:04:20 2015 +0100 + + Nearly-ASCII text with NBSP is still not ASCII. + + There is no "exception" in encoding. The non-breaking space 0xA0 is not + ASCII, and therefore returning "ASCII" will later create issues (for + instance trying to re-encode with iconv produces an error). + This was obviously an explicit decision in original code (according to + code comments), probably tied to specifity of the original program from + Mozilla. Now we want strict detection. + I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as only + exception" (note that I could have returned any ISO-8859 charsets since + they all have this character in common). + +diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp +index ab8bae0..ff06b9d 100644 +--- a/src/nsUniversalDetector.cpp ++++ b/src/nsUniversalDetector.cpp +@@ -47,6 +47,7 @@ + + nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) + { ++ mNbspFound = PR_FALSE; + mDone = PR_FALSE; + mBestGuess = -1; //illegal value as signal + mInTag = PR_FALSE; +@@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector() + void + nsUniversalDetector::Reset() + { ++ mNbspFound = PR_FALSE; + mDone = PR_FALSE; + mBestGuess = -1; //illegal value as signal + mInTag = PR_FALSE; +@@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) + PRUint32 i; + for (i = 0; i < aLen; i++) + { +- /* Other than 0xA0, if every other character is ASCII, the page is ASCII. ++ /* If every other character is ASCII or 0xA0, we don't run charset ++ * probers. + * 0xA0 (NBSP in a few charset) is apparently a rare exception +- * of non-ASCII character contained in ASCII text. */ ++ * of non-ASCII character often contained in nearly-ASCII text. */ + if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') + { + /* We got a non-ASCII byte (high-byte) */ +@@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) + } + else + { +- //ok, just pure ascii so far +- if ( ePureAscii == mInputState && +- (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) ++ /* Just pure ASCII or NBSP so far. */ ++ if (aBuf[i] == '\xA0') + { +- //found escape character or HZ "~{" ++ /* ASCII with the only exception of NBSP seems quite common. ++ * I doubt it is really necessary to train a model here, so let's ++ * just make an exception. ++ */ ++ mNbspFound = PR_TRUE; ++ } ++ else if (mInputState == ePureAscii && ++ (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) ++ { ++ /* We found an escape character or HZ "~{". */ + mInputState = eEscAscii; + } + mLastChar = aBuf[i]; +@@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) + mDone = PR_TRUE; + mDetectedCharset = mEscCharSetProber->GetCharSetName(); + } ++ else if (mNbspFound) ++ { ++ mDetectedCharset = "ISO-8859-1"; ++ } + else + { + /* ASCII with the ESC character (or the sequence "~{") is still +@@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) + break; + + default: +- /* Pure ASCII */ +- mDetectedCharset = "ASCII"; ++ if (mNbspFound) ++ { ++ /* ISO-8859-1 is a good result candidate for ASCII + NBSP. ++ * (though it could have been any ISO-8859 encoding). */ ++ mDetectedCharset = "ISO-8859-1"; ++ } ++ else ++ { ++ /* Pure ASCII */ ++ mDetectedCharset = "ASCII"; ++ } + break; + } + return NS_OK; +diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h +index 4d9b460..9f0a4b1 100644 +--- a/src/nsUniversalDetector.h ++++ b/src/nsUniversalDetector.h +@@ -72,6 +72,7 @@ protected: + virtual void Report(const char* aCharset) = 0; + virtual void Reset(); + nsInputState mInputState; ++ PRBool mNbspFound; + PRBool mDone; + PRBool mInTag; + PRBool mStart; diff --git a/libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch b/libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch new file mode 100644 index 0000000000..b1ed88991c --- /dev/null +++ b/libraries/libuchardet/uchardet-0.0.5-use-proper-package-name.patch @@ -0,0 +1,30 @@ +commit b6d872bbec3be7abfccbdfd3d90e784cf7281c55 +Author: Jehan <jehan@girinstud.io> +Date: Tue Dec 15 21:40:16 2015 +0100 + + app: package name wrong in CMakeLists.txt. + + Probably coming from a copy-paste error when the build system was + originally created. + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 0b65c49..4f279e1 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1,6 +1,6 @@ + ######## Project settings + cmake_minimum_required(VERSION 2.8) +-set (PACKAGE_NAME opencc) ++set (PACKAGE_NAME uchardet) + project (${PACKAGE_NAME} CXX C) + enable_testing() + +@@ -54,7 +54,7 @@ if (DEFINED SYSCONF_INSTALL_DIR) + set (DIR_ETC ${SYSCONF_INSTALL_DIR}) + endif (DEFINED SYSCONF_INSTALL_DIR) + +-set (DIR_SHARE_UCHARDET ${DIR_SHARE}/opencc) ++set (DIR_SHARE_UCHARDET ${DIR_SHARE}/uchardet) + set (DIR_SHARE_LOCALE ${DIR_SHARE}/locale) + + ######## Configuration |