Frank Heckenbach wrote:
Ich habe nichts in fertiger Form. Mein Ansatz wäre:
- UTF-8 <-> UTF-32, also die Codierung der Zeichensequenzen (reine Rechnerei, unabhängig von der Bedeutung der Zeichen)
Ungefähr wie folgt (nur ein Ansatz, kein GNU Pascal).
Type Int32 = Integer attribute( size = 32); UCS4Code = Int32; EncodedChar = string[ 6]; PackedChar = packed array[ 0..0] of char; PackedCharPtr = ^PackedChar;
procedure UCS4CodeToUTF8 ( theUCS4Code : UCS4Code; var theUTF8Char : EncodedChar); var theIndex : Int32; begin if theUCS4Code <= $0000007F then begin theUTF8Char[ 0] := char( 1); theUTF8Char[ 1] := char( theUCS4Code) end else if theUCS4Code <= $000007FF then begin theUTF8Char[ 0] := char( 2); theUTF8Char[ 1] := char(( theUCS4Code shr 6) or $C0); theUTF8Char[ 2] := char(( theUCS4Code and $3F) or $80) end else if theUCS4Code <= $0000FFFF then begin theUTF8Char[ 0] := char( 3); theUTF8Char[ 1] := char(( theUCS4Code shr 12) or $E0); theUTF8Char[ 2] := char((( theUCS4Code shr 6) and $3F) or $80); theUTF8Char[ 3] := char(( theUCS4Code and $3F) or $80) end else if theUCS4Code <= $001FFFFF then begin theUTF8Char[ 0] := char( 4); theUTF8Char[ 1] := char(( theUCS4Code shr 18) or $F0); theUTF8Char[ 2] := char((( theUCS4Code shr 12) and $3F) or $80); theUTF8Char[ 3] := char((( theUCS4Code shr 6) and $3F) or $80); theUTF8Char[ 4] := char(( theUCS4Code and $3F) or $80) end else if theUCS4Code <= $03FFFFFF then begin theUTF8Char[ 0] := char( 5); theUTF8Char[ 1] := char(( theUCS4Code shr 24) or $F8); theUTF8Char[ 2] := char((( theUCS4Code shr 18) and $3F) or $80); theUTF8Char[ 3] := char((( theUCS4Code shr 12) and $3F) or $80); theUTF8Char[ 4] := char((( theUCS4Code shr 6) and $3F) or $80); theUTF8Char[ 5] := char(( theUCS4Code and $3F) or $80) end else begin theUTF8Char[ 0] := char( 6); theUTF8Char[ 1] := char(( theUCS4Code shr 30) or $FC); theUTF8Char[ 2] := char((( theUCS4Code shr 24) and $3F) or $80); theUTF8Char[ 3] := char((( theUCS4Code shr 18) and $3F) or $80); theUTF8Char[ 4] := char((( theUCS4Code shr 12) and $3F) or $80); theUTF8Char[ 5] := char((( theUCS4Code shr 6) and $3F) or $80); theUTF8Char[ 6] := char(( theUCS4Code and $3F) or $80) end; for theIndex := ORD( theUTF8Char[ 0]) + 1 to 6 do theUTF8Char[ theIndex] := char( 0) end;
und zurück
procedure GetUFT8CountBits ( theLeadingChar : Char; var theLeadingBitCount : Int32; var theLeadingMask : Int32); var theCountMask : Int32; theBitFlag : boolean; begin theLeadingBitCount := 0; theCountMask := $080; theLeadingMask := $07F; repeat theBitFlag := ORD( theLeadingChar) and theCountMask = theCountMask; if theBitFlag then begin theLeadingMask := theLeadingMask shr 1; theCountMask := theCountMask shr 1; theLeadingBitCount := theLeadingBitCount + 1 end until ( not theBitFlag) or ( theLeadingBitCount = 6) end;
procedure UFT8CharToUCS4 ( var theVarCharPtr : PackedCharPtr; var theVarCharCount : Int32; var theUCS4Code : UCS4Code; var theErrorFlag : boolean); var theLeadingMask : Int32; theByteLength : Int32; theBitShift : Int32; theIndex : Int32; theMaskedValue : Int32; theChar : char; begin theUCS4Code := 0; theErrorFlag := False; if ( theVarCharPtr = nil) or ( theVarCharCount = 0) then theErrorFlag := True else begin theChar := theVarCharPtr^[ 0]; if ORD( theChar) <= $07F then begin theUCS4Code := ORD( theChar); IncPtr ( theVarCharPtr, 1); theVarCharCount := theVarCharCount - 1 end else begin GetUFT8CountBits ( theChar, theByteLength, theLeadingMask); if ( theByteLength < 2) or ( theByteLength > theVarCharCount) then theErrorFlag := True else begin theBitShift := ( theByteLength - 1) * 6; theMaskedValue := ORD( theChar) and theLeadingMask; theUCS4Code := theMaskedValue shl theBitShift; IncPtr ( theVarCharPtr, 1); theVarCharCount := theVarCharCount - 1; theIndex := 2; while ( theIndex <= theByteLength) and not theErrorFlag do begin theChar := theVarCharPtr^[ 0]; if ORD( theChar) and $0C0 <> $080 then theErrorFlag := True else begin theMaskedValue := ORD( theChar) and $3F; theBitShift := theBitShift - 6; theUCS4Code := theUCS4Code or ( theMaskedValue shl theBitShift); theIndex := theIndex + 1; IncPtr ( theVarCharPtr, 1); theVarCharCount := theVarCharCount - 1 end end; end end end end;
Adriaan van Os