#眉標=文字編碼 #副標=以.NET實踐資料庫文字欄位壓縮 #大標=資料庫空間收納魔法 #作者=作者/李明儒 ============= 程式1 public static byte[] GZipCompress(byte[] rawData) { MemoryStream ms = new MemoryStream(); GZipStream zipStream = new GZipStream(ms, CompressionMode.Compress, true); zipStream.Write(rawData, 0, rawData.Length); zipStream.Close(); return ms.ToArray(); } public static byte[] GZipDecompress(byte[] compressedData) { MemoryStream ms = new MemoryStream(compressedData); GZipStream zipStream = new GZipStream(ms, CompressionMode.Decompress); MemoryStream outBuffer = new MemoryStream(); byte[] block = new byte[1024]; while (true) { int bytesRead = zipStream.Read(block, 0, block.Length); if (bytesRead <= 0) break; else outBuffer.Write(block, 0, bytesRead); } zipStream.Close(); return outBuffer.ToArray(); }================ ============= 程式2 private void CompressBenchmark() { //逐一開啟所有檔案 foreach (string file in Directory.GetFiles("D:\\Temp\\Benchmark")) { //讀取檔案內容成為Byte Array byte[] bIn = File.ReadAllBytes(file); //記錄開始壓縮時間 DateTime dtZipStart = DateTime.Now; //進行壓縮,取得Byte Array //分別使用GZipCompress及DefalteCompress做比較 byte[] bZip = Compressor.DeflateCompress(bIn); //計算壓縮所耗時間 TimeSpan tsZip = DateTime.Now - dtZipStart; //記錄開始解壓縮時間 DateTime dtUnzipStart = DateTime.Now; //進行解壓縮,結果存入Byte Array byte[] bOut = Compressor.DeflateDecompress(bZip); //計算解壓縮所耗時間 TimeSpan tsUnzip = DateTime.Now - dtUnzipStart; //比對原始內容與壓縮->解壓縮的內容是否完全相符 if (bIn.Length != bOut.Length) throw new ApplicationException("Decomprssion Failed!"); //每一個Byte都必須完全相同 for (int i=0; i============== ============= 程式3 public static string CompressString(string srcString) { //使用Encoding.UTF8將字串轉為Byte Array //再對Byte Array做DeflateCompress byte[] data = DeflateCompress(Encoding.UTF8.GetBytes(srcString)); //以Base64編碼將壓縮後資料轉為字串傳回 return Convert.ToBase64String(data); } public static string DecompressString(string compString) { //對Base64編碼的壓縮字串解碼取得Byte Array //使用DeflateDecompress解壓還原 byte[] data = DeflateDecompress( Convert.FromBase64String(compString) ); //解壓後的Byte Array還原回字串傳回 return Encoding.UTF8.GetString(data); }================ ============= 程式4 private void TestStringCompression() { DateTime dtStart = DateTime.Now; //將檔案內容讀成字串 string srcString = File.ReadAllText(@"D:\Temp\Benchmark\System.Web.xml"); SqlConnection cn = new SqlConnection( "Data Source=(local); Integrated Security=SSPI;" +"Initial Catalog=Lab;"); cn.Open(); SqlCommand cmd = new SqlCommand( "INSERT INTO StringArchive VALUES (@No, @DataString)", cn); cmd.Parameters.Add("@No", SqlDbType.Int).Value = 1; cmd.Parameters.Add("@DataString", SqlDbType.NVarChar) .Value = srcString; cmd.ExecuteNonQuery(); //寫入第二筆原始字串 cmd.Parameters["@No"].Value = 2; cmd.Parameters["@DataString"].Value = Compressor.CompressString(srcString); cmd.ExecuteNonQuery(); //讀取剛才的內容進行檢驗 cmd.CommandText = "SELECT * FROM StringArchive"; SqlDataReader dr = cmd.ExecuteReader(); string s1 = "", s2 = ""; while (dr.Read()) { if ((int)dr["No"] == 1) s1 = dr["DataString"].ToString(); if ((int)dr["No"] == 2) s2 = dr["DataString"].ToString(); } cn.Close(); //比較二者大小 Console.WriteLine( string.Format("原始大小: {0:#,#} 壓縮後大小: {1:#,#}", s1.Length, s2.Length) ); //解壓並比較內容是否一致 if (s1.CompareTo(Compressor.DecompressString(s2)) == 0) Console.WriteLine("內容驗證無誤!"); else Console.WriteLine("內容驗證失敗!"); TimeSpan ts = (DateTime.Now - dtStart); Console.WriteLine( string.Format("測試耗時={0}ms", ts.TotalMilliseconds) ); }================ ============= 程式5 private void TestUTF16Conversion() { byte[] bin = new byte[2]; byte[] binVerify; string ch = ""; for (int i = 0; i <= 0xFF; i++) { //使用以下邏輯跳過D800-DFFF就不會出錯 //if (i >= 0xD8 && i <= 0xDF) continue; for (int j = 0; j <= 0xFF; j++) { //測試65536二進位組合 bin[0] = Convert.ToByte(j); bin[1] = Convert.ToByte(i); ch = Encoding.Unicode.GetString(bin); binVerify = Encoding.Unicode.GetBytes(ch); //驗證二進位資料轉字串後再轉回來有無失真 if (binVerify.Length != 2 || binVerify[0] != j || binVerify[1] != i) throw new ApplicationException( string.Format("Failed [{0:x}]!", i * 0x100 + j)); } } MessageBox.Show("Successed!"); }================ ============= 程式6 /// /// 二進位資料與UTF16字串轉換物件 /// class EnhancedUTF16Coverter { //當轉換時有加Padding Byte時字串要加上註記 private static string paddingFlag = "??"; //00 D8 00 DC   //如果你沒在Wikipedia Copy到這個特殊字元 //可寫為paddingFlag=Encoding.Unicode.GetString( // new byte[] { 0x00, 0xD8, 0x00, 0xDC } ); /// /// 將byte[]視為UTF16編碼資料轉換為字串,重點在於D800-DFFF段的額外轉換 /// /// 要處理的byte[] /// Unicode字串 public static string ToUTF16(byte[] data) { //若byte[]長度為奇數時,要補上一個Padding Byte bool needPadding = (data.Length % 2 == 1); //D800-DFFF要轉換成Surrogate Pair用的四個Byte byte[] b4 = new byte[4] { 0x00, 0xd8, 0x00, 0x00 }; //使用MemoryStream接收轉換結果 using (MemoryStream ms = new MemoryStream()) { for (int i = 0; i < data.Length; i += 2) { if (needPadding && i == (data.Length - 1)) //需要Padding時,最後一個位元另外處理 { //只輸出單一個Byte ms.Write(data, i, 1); //最後再補一個Byte 0 ms.WriteByte(0); } else { //檢查是否為D800-DF00區段,若對映成特定的Surrogate Pair(4 Bytes) if (data[i + 1] >= 0xd8 && data[i + 1] <= 0xdf) { b4[0] = (byte)((data[i + 1] < 0xdc) ? 0x40 : 0x41); b4[2] = data[i]; b4[3] = (byte)(0xdc + (data[i + 1] - 0xd8) % 4); ms.Write(b4, 0, 4); } else ms.Write(data, i, 2); } } return Encoding.Unicode.GetString(ms.ToArray()) + ((needPadding) ? paddingFlag : "") //Padding時,字串尾加上特別註記 ; } } /// /// 將字串以UTF16編輯轉回byte[],重點在於D800-DFFF段的額外轉換 /// /// Unicode字串 /// 轉換回的byte[] public static byte[] FromUTF16(string dataString) { //當初轉換時是否有加上Padding bool withPadding = dataString.EndsWith(paddingFlag); byte[] data = Encoding.Unicode.GetBytes(dataString); //如果有Padding註記,最後的4個Byte不處理 int dataLen = (withPadding) ? data.Length-4 : data.Length; //轉換回D800-DFFF時使用 byte[] b2 = new byte[] { 0x00, 0x00 }; using (MemoryStream ms = new MemoryStream()) { for (int i = 0; i < dataLen; i += 2) { //若為U+20000-U+203FF(40 D8 XX XX - 41 D8 XX XX), 對映回D800-DFFF if (data[i + 1] == 0xd8 && (data[i] == 0x40 || data[i] == 0x41)) { b2[0] = data[i + 2]; b2[1] = (byte) (0xd8 + (data[i + 3] - 0xdc) + ((data[i] == 0x41) ? 4 : 0)); ms.Write(b2, 0, 2); i += 2; } else if (withPadding && i == dataLen - 2) //有Padding時,最後只輸出一個Byte ms.Write(data, i, 1); //只支援40 D8 XX XX - 41 D8 XX XX,若有其他SIP字元出現,則表示異常! else if (data[i + 1] >= 0xd8 && data[i + 1] <= 0xdf) { throw new ApplicationException( string.Format("無效的EnhancedUTF16Coverter字元{0}!", BitConverter.ToString(data, i, 4) ) ); } else ms.Write(data, i, 2); } return ms.ToArray(); } } }================ ============= 程式7 using System; using System.Data; using System.Data.SqlClient; using System.Data.SqlTypes; using Microsoft.SqlServer.Server; public partial class UserDefinedFunctions { [Microsoft.SqlServer.Server.SqlFunction] //加上SqlFacet(MaxSize=-1)以打破8000長度限制 [return: SqlFacet(MaxSize = -1)] public static SqlString DeflateCompressString( [SqlFacet(MaxSize = -1)] SqlString rawString ) { if (rawString.IsNull) return SqlString.Null; else return new SqlString( Compressor.CompressString( rawString.ToString()) ); } [Microsoft.SqlServer.Server.SqlFunction] [return: SqlFacet(MaxSize = -1)] public static SqlString DeflateDecompressString( [SqlFacet(MaxSize = -1)] SqlString compString ) { if (compString.IsNull) return SqlString.Null; else return new SqlString( Compressor.DecompressString( compString.ToString()) ); } [Microsoft.SqlServer.Server.SqlFunction] [return: SqlFacet(MaxSize = -1)] public static SqlString DecompressStringOnDemand( [SqlFacet(MaxSize = -1)] SqlString inpString ) { if (inpString.IsNull) return SqlString.Null; else { string s = inpString.ToString(); //由最前方四個位元判定是否為 //Deflate壓過的字串,若是則解壓縮 string DeflateHeaderMark = System.Text.Encoding.Unicode. GetString(new byte[] { 0xED, 0xBD, 0x07, 0x60 } ); if (s.Substring(0, 2) == DeflateHeaderMark) return new SqlString( Compressor.DecompressString( s) ); else return new SqlString(s); } } };================ ==T-SQL指令 =========== --使用SQLCLR UDF處理先前加入的壓縮字串 --另外新增一筆NO=3 INSERT INTO StringArchive SELECT 3 AS No, dbo.DeflateDecompressString(DataString) FROM StringArchive WHERE No=2 --以SQLCLR UDF將NO=3的字串壓縮後存入NO=4 INSERT INTO StringArchive SELECT 4 AS No, dbo.DeflateCompressString(DataString) FROM StringArchive WHERE No=3 --檢查大小 SELECT No, LEN(DataString) AS 字串長度, DATALENGTH(DataString) AS 佔用Byte數 FROM StringArchive 執行結果: No 字串長度 佔用Byte數 ----------- -------------------- -------------------- 1 5310105 10620210 2 328321 656642 3 5310105 10620210 4 328321 656642 ================ ==參考資料=========== 1. SharpZip Library http://www.icsharpcode.net/OpenSource/SharpZipLib/。 2. 多國語系的資料處理與整合技術探討:http://www.microsoft.com/taiwan/technet/columns/profwin/multilanguage.mspx。 3. Wikipedia中關於Unicode的介紹:http://zh.wikipedia.org/w/index.php?title=Unicode&variant=zh-tw。 ================