;;; -*- coding: iso-2022-7bit -*-
;;; utf-8m.el --- modified UTF-8 encoding for Mac OS X hfs plus volume format

;; Copyright (C) 2004-2005  Seiji Zenitani <zenitani@mac.com>

;; Author: Seiji Zenitani <zenitani@mac.com>
;; Version: v20050805
;; Keywords: mac, multilingual, Unicode, UTF-8
;; Created: 2004-02-20
;; Compatibility: Mac OS X (Carbon Emacs)
;; URL(jp): http://home.att.ne.jp/alpha/z123/emacs-mac-j.html
;; URL(en): http://home.att.ne.jp/alpha/z123/elisp-e.html

;; This file is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; This file is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING.  If not, write to
;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;;; Commentary:

;; This package provides a modified UTF-8 encoding (utf-8m) for the
;; Mac OS X HFS Plus volume format.  With utf-8m set as the file-name
;; coding system, Emacs can correctly read the following characters in
;; filenames:
;; 
;;  * Japanese Kana characters with Dakuten/Han-Dakuten signs
;;  * Latin characters with diacritical marks (accents, umlauts, tilde, etc.)
;;
;; Note that utf-8m does not convert these characters back to their
;; decomposed form when it writes filenames.  Fortunately, the filesystem
;; seems to know how to deal with such (precomposed) filenames.
;;
;; To use it, add the following line to your .emacs file:
;; 
;;   (set-file-name-coding-system 'utf-8m)
;;

;;; utf-8m $B$K$D$$$F(B

;; Mac OS X $B$N(B HFS+ $B%U%!%$%k%7%9%F%`$N%U%!%$%kL>$rFI$`$?$a$N(B
;; $B=$@5(B UTF8 $B%(%s%3!<%G%#%s%0(B (utf-8m) $B$rDs6!$7$^$9!#(B
;; $B%U%!%$%kL>$rFI$_9~$`:]$K@55,2=J}<0$rJQ99$9$k$N$G(B
;; $BF|K\8l$NByE@!&H>ByE@J8;z$H%"%/%;%s%HIU$-$N%i%F%sJ8;z$,(B
;; $BJ8;z2=$1$7$J$$$h$&$K$J$j$^$9!#%U%!%$%kL>$r=q$-=P$9:]$NJQ49$O(B
;; $B9MN8$7$F$$$^$;$s$,!"%U%!%$%k%7%9%F%`B&$,$&$^$/=hM}$7$F$/$l$k$h$&$G$9!#(B
;; utf-8m $B$r;HMQ$9$k$?$a$K$O!"$3$N%U%!%$%k$rFI$_9~$s$@$N$A!"(B
;;
;;   (set-file-name-coding-system 'utf-8m)
;;
;; $B$H$7$F2<$5$$!#(B


;;; Code:


;; Japanese Kana characters with Dakuten/Han-Dakuten signs

;; Kana that `utf-8m-post-read-kana-conversion' merges with a following
;; voiced-sound mark (the character it tests as code 302969): the pair is
;; replaced by the character whose code is one higher than the base.
;; Despite the "-alist" suffix this is a flat list of characters that is
;; searched with `memq'.
;; NOTE(review): the last two entries appear to be the kana iteration
;; marks rather than ordinary kana -- confirm against the decoded file.
(defvar utf-8m-fix-kana1-alist
  '( ?$B$+(B ?$B$-(B ?$B$/(B ?$B$1(B ?$B$3(B ?$B$5(B ?$B$7(B ?$B$9(B ?$B$;(B ?$B$=(B
     ?$B$?(B ?$B$A(B ?$B$D(B ?$B$F(B ?$B$H(B ?$B$O(B ?$B$R(B ?$B$U(B ?$B$X(B ?$B$[(B
     ?$B%+(B ?$B%-(B ?$B%/(B ?$B%1(B ?$B%3(B ?$B%5(B ?$B%7(B ?$B%9(B ?$B%;(B ?$B%=(B
     ?$B%?(B ?$B%A(B ?$B%D(B ?$B%F(B ?$B%H(B ?$B%O(B ?$B%R(B ?$B%U(B ?$B%X(B ?$B%[(B
     ?$B!3(B ?$B!5(B ))
;; Kana that `utf-8m-post-read-kana-conversion' merges with a following
;; semi-voiced-sound mark (the character it tests as code 302970): the
;; pair is replaced by the character whose code is two higher than the
;; base.  Like the list above, this is a flat list searched with `memq'.
(defvar utf-8m-fix-kana2-alist
  '( ?$B$O(B ?$B$R(B ?$B$U(B ?$B$X(B ?$B$[(B ?$B%O(B ?$B%R(B ?$B%U(B ?$B%X(B ?$B%[(B ))
;; Disabled: a third list whose entries would be shifted by 1244 when
;; followed by the voiced-sound mark (see the matching disabled `cond'
;; clause in `utf-8m-post-read-kana-conversion').
;; (defvar utf-8m-fix-kana3-alist
;;   '(?$B%o(B ?$B%p(B ?$B%q(B ?$B%r(B ))

(defun utf-8m-post-read-kana-conversion (length)
  "Merge Kana with a following (semi-)voiced sound mark.
Scan the LENGTH characters following point.  When a character listed in
`utf-8m-fix-kana1-alist' is followed by the voiced-sound mark, replace
the pair with the character whose code is one higher; one extra base
character, written as a literal below, is mapped to its own dedicated
replacement.  When a character listed in `utf-8m-fix-kana2-alist' is
followed by the semi-voiced-sound mark, replace the pair with the
character whose code is two higher.

Only text inside the LENGTH-character region is examined or modified,
and point is preserved.  Return LENGTH decreased by the number of pairs
that were merged."
  (save-excursion
    (save-restriction
      ;; Confine the scan to the text being converted, so that buffer
      ;; contents before or after it are never touched.  The end is
      ;; clamped in case LENGTH overshoots the accessible text.
      (narrow-to-region (point) (min (point-max) (+ (point) length)))
      (while (not (eobp))
        (let ((ch1 (char-before))
              (ch2 (char-after)))
          ;; At the start of the region there is no preceding character
          ;; (`char-before' returns nil), hence nothing to merge with.
          (when ch1
            (cond
             ;; 302969: presumably the Emacs-internal code of the
             ;; combining voiced-sound mark (U+3099) -- TODO confirm.
             ((= ch2 302969)
              (cond
               ((memq ch1 utf-8m-fix-kana1-alist)
                (delete-char -1)
                (delete-char 1)
                (insert (+ ch1 1))
                (setq length (- length 1)))
;;             ((memq ch1 utf-8m-fix-kana3-alist)
;;              (delete-char -1)
;;              (delete-char 1)
;;              (insert (+ ch1 1244))
;;              (setq length (- length 1)))
               ;; Special case: the replacement is not adjacent in code
               ;; order, so it is spelled out explicitly.
               ((= ch1 ?$B%&(B)
                (delete-char -1)
                (delete-char 1)
                (insert ?$B%t(B)
                (setq length (- length 1)))))
             ;; 302970: presumably the Emacs-internal code of the
             ;; combining semi-voiced-sound mark (U+309A) -- TODO confirm.
             ((= ch2 302970)
              (when (memq ch1 utf-8m-fix-kana2-alist)
                (delete-char -1)
                (delete-char 1)
                (insert (+ ch1 2))
                (setq length (- length 1))))))
          (if (not (eobp)) (forward-char))))))
  length)


;;  Latin characters with diacritical marks

;; Two-level table used by `utf-8m-post-read-latin-conversion'.
;; Each element is (ACCENT . PAIRS): ACCENT is the character code of a
;; combining mark and PAIRS is an alist mapping a base letter to the
;; precomposed character that replaces "base letter + ACCENT".
;; NOTE(review): the numeric keys are presumably the Emacs-internal codes
;; of the combining marks U+0300, U+0301, U+0302, U+0303, U+0308, U+030A
;; and U+0327 (their offsets from the first key match) -- confirm.
(defvar utf-8m-fix-latin-alist
  '(
    (332480 . ( ;; grave
               (?A . ?,A@(B) (?E . ?,AH(B) (?I . ?,AL(B) (?O . ?,AR(B) (?U . ?,AY(B)
               (?a . ?,A`(B) (?e . ?,Ah(B) (?i . ?,Al(B) (?o . ?,Ar(B) (?u . ?,Ay(B)
               ))
    (332481 . ( ;; acute
               (?A . ?,AA(B) (?E . ?,AI(B) (?I . ?,AM(B) (?O . ?,AS(B) (?U . ?,AZ(B)
               (?a . ?,Aa(B) (?e . ?,Ai(B) (?i . ?,Am(B) (?o . ?,As(B) (?u . ?,Az(B)
               ))
    (332482 . ( ;; circumflex
               (?A . ?,AB(B) (?E . ?,AJ(B) (?I . ?,AN(B) (?O . ?,AT(B) (?U . ?,A[(B)
               (?a . ?,Ab(B) (?e . ?,Aj(B) (?i . ?,An(B) (?o . ?,At(B) (?u . ?,A{(B)
               ))
    (332483 . ( ;; tilde
               (?A . ?,bC(B) (?N . ?,bQ(B) (?O . ?,bU(B)
               (?a . ?,bc(B) (?n . ?,bq(B) (?o . ?,bu(B)
               ))
    (332488 . ( ;; umlaut (diaeresis)
               (?A . ?,AD(B) (?E . ?,AK(B) (?I . ?,AO(B) (?O . ?,AV(B) (?U . ?,A\(B)
               (?a . ?,Ad(B) (?e . ?,Ak(B) (?i . ?,Ao(B) (?o . ?,Av(B) (?u . ?,A|(B) (?y . ?,A(B)
               ))
    (332490 . ( ;; ring above (as in angstrom)
               (?A . ?,AE(B)
               (?a . ?,Ae(B)
               ))
    (332519 . ( ;; cedilla
               (?C . ?,bG(B)
               (?c . ?,bg(B)
               ))
    ))

(defun utf-8m-post-read-latin-conversion (length)
  "Merge Latin base letters with a following combining accent.
Scan the LENGTH characters following point, from the end of that region
toward its beginning.  Whenever a base letter is immediately followed by
exactly one accent, and `utf-8m-fix-latin-alist' lists that letter under
that accent, replace the two characters with the precomposed character
from the table.  A run of two or more consecutive accents is left
untouched.

Only text inside the LENGTH-character region is examined or modified,
and point is preserved.  Return LENGTH decreased by the number of pairs
that were merged."
  (save-excursion
    (save-restriction
      ;; Confine the scan to the text being converted, so that buffer
      ;; contents before or after it are never touched.  The end is
      ;; clamped in case LENGTH overshoots the accessible text.
      (narrow-to-region (point) (min (point-max) (+ (point) length)))
      (goto-char (point-max))
      (let ((accent-char nil)           ; most recently seen accent
            (accent-count 0))           ; consecutive accents just after point
        (while (not (bobp))
          (let* ((ch (char-before))
                 ;; Precomposed replacement for CH + ACCENT-CHAR, or nil.
                 ;; Only looked up when exactly one accent follows CH.
                 (composed
                  (and (= accent-count 1)
                       (cdr (assoc ch
                                   (cdr (assoc accent-char
                                               utf-8m-fix-latin-alist)))))))
            (cond
             (composed
              ;; Point sits between the base letter and its accent:
              ;; drop both and insert the single precomposed character.
              (delete-char -1)
              (delete-char 1)
              (insert composed)
              (setq length (- length 1)
                    accent-count 0))
             ((assoc ch utf-8m-fix-latin-alist)
              ;; CH is itself an accent; remember it and keep counting.
              (setq accent-char ch
                    accent-count (+ accent-count 1)))
             (t
              (setq accent-count 0))))
          (if (not (bobp)) (backward-char))))))
  length)


;; ;; Korean Hangul characters
;; ;; ref. http://www.unicode.org/reports/tr15/#Hangul

;; (defun utf-8m-post-read-hangul-conversion (length)
;;   "Document forthcoming..."
;;   (save-excursion
;;     (let* ((ch1 nil)
;;            (ch2 nil)
;;            (sbase #xac00)
;;            (lbase #x1100)
;;            (vbase #x1161)
;;            (tbase #x11a7)
;;            (lcount 19)
;;            (vcount 21)
;;            (tcount 28)
;;            (ncount (* vcount tcount)) ; 588
;;            (scount (* lcount ncount)) ; 11172
;;            (lindex nil)
;;            (vindex nil)
;;            (sindex nil)
;;            (tindex nil))
;;       (setq ch1 (char-to-ucs (char-after)))
;; ;      (setq ch1 (decode-char (char-after) 'ucs))
;; ;      (setq ch1 (encode-char (char-after) 'ucs))
;; ;      (setq ch1 (mule-unicode-xxxx-to-ucs (char-after)))
;;       (if (not (eobp))(forward-char))
;;       (while (not (eobp))
;;         (setq ch2 (char-to-ucs (char-after)))
;; ;	(setq ch2 (encode-char (char-after) 'ucs))
;; ;	(setq ch2 (decode-char (char-after) 'ucs))
;; ;	(setq ch2 (mule-unicode-xxxx-to-ucs (char-after)))
;; ;       (message "ch1:%X ch2:%X" ch1 ch2)
;;         (setq lindex (- ch1 lbase))
;;         (setq vindex (- ch2 vbase))
;;         (setq sindex (- ch1 sbase))
;;         (setq tindex (- ch2 tbase))
;;         (if (and (>= lindex 0)(< lindex lcount)
;;                  (>= vindex 0)(< vindex vcount))
;;             (progn
;; ;             (message "first loop")
;;               (setq ch1 (+ sbase (* (+ (* lindex vcount) vindex) tcount)))
;;               (delete-char -1)
;;               (delete-char 1)
;; ;              (insert-ucs-character ch1)
;;               (ucs-insert ch1)
;;               (setq length (- length 1))
;;               )
;;           (if (and (>= sindex 0)(< sindex scount)
;;                    (= (% sindex tcount) 0)
;;                    (>= tindex 0)(< tindex tcount))
;;               (progn
;; ;               (message "second loop")
;;                 (setq ch1 (+ ch1 tindex))
;;                 (delete-char -1)
;;                 (delete-char 1)
;; ;                (insert-ucs-character ch1)
;; 		(ucs-insert ch1)
;;                 (setq length (- length 1))
;;                 )
;;             (progn
;;               (setq ch1 ch2)
;;               (if (not (eobp))(forward-char))
;;               )
;;             ))
;;         )))
;;   length)


;; use the above functions as post-read-converter
(defun utf-8m-post-read-conversion (length)
  "Run the utf-8m post-read converters over the just-decoded text.
LENGTH is the number of characters to convert, starting at point.
The stock `utf-8-post-read-conversion' runs first, followed by
`utf-8m-post-read-kana-conversion' and
`utf-8m-post-read-latin-conversion'.  Each converter is called at the
same starting point and receives the length returned by the previous
one.  Return the length reported by the last converter."
  (dolist (converter '(utf-8-post-read-conversion
                       utf-8m-post-read-kana-conversion
;;                     utf-8m-post-read-hangul-conversion
                       utf-8m-post-read-latin-conversion))
    ;; Restore point after every pass so the next one starts at the
    ;; beginning of the text again.
    (save-excursion
      (setq length (funcall converter length))))
  length)


;; Define the coding system `utf-8m'.
;; It reuses the CCL decoder/encoder pair and the pre-write conversion
;; named below unchanged; the only utf-8m-specific part is the
;; post-read conversion, `utf-8m-post-read-conversion'.
(make-coding-system
 ;; Name, coding-system type and mnemonic character.
 ;; NOTE(review): type 4 is presumably the CCL-based type, matching the
 ;; (DECODER . ENCODER) pair of CCL programs given below -- confirm
 ;; against the `make-coding-system' documentation.
 'utf-8m 4 ?u
 "modified UTF-8 encoding for Mac OS X hfs plus volume format."
 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
 ;; Property list.  It is backquoted so that the CJK charsets are
 ;; spliced into `safe-charsets' only when `utf-translate-cjk-mode' is
 ;; non-nil at the time this form is evaluated.
 `((safe-charsets
    ascii
    eight-bit-control
    eight-bit-graphic
    latin-iso8859-1
    mule-unicode-0100-24ff
    mule-unicode-2500-33ff
    mule-unicode-e000-ffff
    ,@(if utf-translate-cjk-mode
          utf-translate-cjk-charsets))
   (mime-charset . utf-8)
   (coding-category . coding-category-utf-8)
   (valid-codes (0 . 255))
   (pre-write-conversion . utf-8-pre-write-conversion)
   ;; Merges decomposed Kana and accented Latin letters after decoding.
   (post-read-conversion . utf-8m-post-read-conversion)
   (translation-table-for-encode . utf-translation-table-for-encode)
   (dependency unify-8859-on-encoding-mode
               unify-8859-on-decoding-mode
               utf-fragment-on-decoding
               utf-translate-cjk-mode)))

;; Not enabled automatically; users opt in from their init file with:
;; (set-file-name-coding-system 'utf-8m)
(provide 'utf-8m)

;; utf-8m.el ends here.