diff mde/file/serialize.d @ 82:ac1e3fd07275

New ssi file format. (De)serializer now supports non-ascii wide characters (encoded to UTF-8) and no longer supports non-ascii 8-bit chars which would result in bad UTF-8. Moved/renamed a few things left over from the last commit.
author Diggory Hardy <diggory.hardy@gmail.com>
date Sat, 30 Aug 2008 09:37:35 +0100
parents d8fccaa45d5f
children 79d816b3e2d2
line wrap: on
line diff
--- a/mde/file/serialize.d	Fri Aug 29 11:59:43 2008 +0100
+++ b/mde/file/serialize.d	Sat Aug 30 09:37:35 2008 +0100
@@ -17,7 +17,8 @@
  * Generic serialization templated function.
  *
  * Supports:
- *  Associative arrays, arrays (inc. strings), structs, char types, bool, int types, float types.
+ *  Associative arrays, dynamic arrays (with usual formatting of strings), structs, char types,
+ *  bool, int types, float types.
  *
  * Examples:
  * ------------------------------------------------------------------------------------------------
@@ -51,6 +52,8 @@
  *************************************************************************************************/
 //NOTE: in case of multiple formats, make this a dummy module importing both serialize modules,
 // or put all the code here.
+//FIXME: Optimize by using a slicing buffer. Put everything in a struct containing this buffer to
+// make it thread-safe.
 module mde.file.serialize;
 // Since serialize is never used in a module where deserialize is not used, save an import:
 public import mde.file.deserialize;
@@ -181,8 +184,8 @@
     }
     // Basic types
     else static if (is(U == char)) {            // char (UTF-8 byte)
-        // Note: if (val > 127) "is invalid UTF-8 single char".  However we don't know
-        // what this is for, in particular if it will be recombined with other chars later.
+        if (val > 127)      // outputting invalid UTF-8 could corrupt the output stream
+            throw new IllegalArgumentException ("Not a valid UTF-8 character");
         
         // Can't return reference to static array; so making it dynamic is cheaper than copying.
         char[] ret = new char[4];	// max length for an escaped char
@@ -200,11 +203,16 @@
         }
     } else static if (is(U == wchar) ||
                       is(U == dchar)) {         // wchar or dchar (UTF-16/32 single char)
-        // Note: only ascii can be converted. NOTE: convert to UTF-8 (multibyte) char?
         if (val <= 127u)
             return serialize!(char) (cast(char) val);  // ASCII
-        else throw new UnicodeException (
-            "Error: unicode non-ascii character cannot be converted to a single UTF-8 char", 0);
+        else {  // convert to a multi-byte UTF-8 char
+            // NOTE: suboptimal: the ret.length allocation below is discarded by the concatenation that follows
+            char[] t,ret;
+            t = Utf.toString([val]);
+            ret.length = t.length + 2;
+            ret = '\'' ~ t ~ '\'';
+            return ret;
+        }
     } else static if (is (U == bool)) {         // boolean
         static if (BINARY_AS_WORDS) {
             if (val)
@@ -288,12 +296,12 @@
 
 
 
-debug (UnitTest) {
+debug (mdeUnitTest) {
     import tango.util.log.Log : Log, Logger;
 
     private Logger logger;
     static this() {
-        logger = Log.getLogger ("text.serialize");
+        logger = Log.getLogger ("mde.file.serialize");
     }
 unittest {
     // Utility
@@ -303,7 +311,7 @@
             dg();
         } catch (Exception e) {
             r = true;
-            logger.info ("Exception caught: "~e.msg);
+            logger.trace ("Exception caught: "~e.msg);
         }
         return r;
     }
@@ -351,8 +359,9 @@
     assert (serialize!(char) ('\'') == "\'\\\'\'");
     assert (serialize!(wchar) ('X') == "'X'");
     assert (serialize!(dchar) ('X') == "'X'");
-    assert (throws ({ char[] r = serialize!(wchar) ('£');   /* unicode U+00A3 */ }));
-    assert (throws ({ char[] r = serialize!(dchar) ('£'); }));
+    assert (serialize!(wchar) ('£') == "'£'");  // unicode U+00A3 i.e. a multi-byte UTF-8 char
+    assert (serialize!(dchar) ('£') == "'£'");
+    assert (throws ({ serialize!(char) ('£'); }));      // compiler converts £ to char, but it's not valid UTF-8
     
     // Bool
     static if (BINARY_AS_WORDS)