在Scheme中优化CSV解析器

ijnw1ujt  于 2023-09-28  发布在  其他
关注(0)|答案(1)|浏览(123)

我目前正在尝试学习Scheme(特别是小鸡Scheme),并希望更好地了解该语言的性能缺陷。我写了一个CSV解析器,并在下面分享。我尝试使用的测试文件是130 MB,需要大约7分钟来解析。简单地阅读所有行只需要几毫秒,所以问题在于解析器。我试图保持我的代码尽可能的“Lispy”,并希望避免引入太多的低级结构,我知道在Chicken Scheme中可用。我希望从改进这段代码中获得的主要东西是对Scheme的性能有更好的直觉。
编辑:我从评论中得到了一些建议,现在有了两个独立的解析器。第一个代码块使用基于列表的解决方案,第二个代码块使用string-refs。我原以为字符串引用会更快,因为线性记忆,但实验只让我感到困惑。但这是我试图解决的核心问题。我真的不明白为什么一个解决方案比另一个更好。任何洞察力都是伟大的。
我还回去做了一些性能测试:
列表解决方案编译:37.658s CPU time, 1.2s GC time (major), 258575354/244106 mutations (total/tracked), 1160/317431 GCs (major/minor), maximum live heap: 306.33 KiB
字符串-参考-解决方案-编译:644.939s CPU time, 1.308s GC time (major), 167854392/243430 mutations (total/tracked), 1281/1023116 GCs (major/minor), maximum live heap: 305.51 KiB

; List based solution
(import (chicken string))
(import utf8)
(import list-utils)

(define (lookahead-list ahead lst) 
  (cond [(null? ahead) #t]
    [(and (not (null? lst)) (not (null? ahead))) 
     (if (eq? (car ahead) (car lst)) (lookahead-list (cdr ahead) (cdr lst)) #f)]
    [else #f]
  )
)

(define (null-blist? blist) (null? (car blist)))

(define (lookahead-blist ahead blst) (lookahead-list ahead (car blst)))

(define (read-blist blist)
    (if (null? blist) #!eof
        (let ([head (caar blist)]) (set-car! blist (cdar blist)) head))
)

; Csv parsing

(define (csv/read-atom blist)
    (let loop ([res '()] [cmode #t]) 
      (cond [(lookahead-blist '(#\" #\") blist) (read-blist blist) (loop (cons (read-blist blist) res) cmode)]
        [(lookahead-blist '(#\") blist) (read-blist blist) (loop res (not cmode))]
        [(and cmode (lookahead-blist '(#\,) blist)) (reverse-list->string res)]
        [(null-blist? blist) (reverse-list->string res)]
        [else (loop (cons (read-blist blist) res) cmode)]
      ))
)

(define (csv/parse-non-blank-line blist)
    (reverse 
    (let loop ([res '()])
        (let ([nres (cons (csv/read-atom blist) res)])
             (if (lookahead-blist '(#\,) blist) 
               (begin (read-blist blist) (loop nres)) nres)
        )
    ))
)

(define (csv/parse-line str)
    (if (equal? str "") '() (csv/parse-non-blank-line (list (string->list str))))
)

(define (csv/parse func init port)
    (let loop ([acc init] [line (read-line port)])
        (if (eq? line #!eof) acc
            (loop (func acc (csv/parse-line line)) (read-line port))
        )
    )
)

(define (csv/parse-table func init port) 
    (let ([format (csv/parse-line (read-line port))]) 
        (define (shim acc elem) 
            (func acc (zip-alist format elem))
        )
        (csv/parse shim init port)
    )
)

(define (csv/parse-table-file func init fname) (call-with-input-file (lambda (p) (csv/parse-table func init p))))
; String-ref based solution
(import (chicken io))
(import (chicken string))
(import utf8)
(import list-utils)

; Cursor string
(define (string->cstring str)
    (cons 0 str)
)

(define (cstring/forward cstr) 
    (cons (+ 1 (car cstr)) (cdr cstr))
)

(define (cstring/peek cstr)
    (if (cstring/null? cstr)
        #!eof
        (string-ref (cdr cstr) (car cstr))
    )
)

(define (cstring/forward! cstr)
    (let ([ret (cstring/peek cstr)]) 
        (set-car! cstr (+ 1 (car cstr)))
        ret
    )
)

(define (cstring/null? cstr)
    (>= (car cstr) (string-length (cdr cstr)))
)

(define (lookahead-cstring ahead cstr) 
  (cond [(null? ahead) #t]
    [(and (not (cstring/null? cstr)) (not (null? ahead))) 
     (if (eq? (car ahead) (cstring/peek cstr)) (lookahead-cstring (cdr ahead) (cstring/forward cstr)) #f)]
    [else #f]
  )
)


; Csv parsing

(define (csv/read-atom cstr)
    (let loop ([res '()] [cmode #t]) 
      (cond [(lookahead-cstring '(#\" #\") cstr) (cstring/forward! cstr) (loop (cons (cstring/forward! cstr) res) cmode)]
        [(lookahead-cstring '(#\") cstr) (cstring/forward! cstr) (loop res (not cmode))]
        [(and cmode (lookahead-cstring '(#\,) cstr)) (reverse-list->string res)]
        [(cstring/null? cstr) (reverse-list->string res)]
        [else (loop (cons (cstring/forward! cstr) res) cmode)]
      ))
)


(define (csv/parse-non-blank-line cstr)
    (reverse 
    (let loop ([res '()])
        (let ([nres (cons (csv/read-atom cstr) res)])
             (if (lookahead-cstring '(#\,) cstr) 
               (begin (cstring/forward! cstr) (loop nres)) nres)
        )
    ))
)

(define (csv/parse-line str)
    (if (equal? str "") '() (csv/parse-non-blank-line (string->cstring str)))
)

(define (csv/parse func init port)
    (let loop ([acc init] [line (read-line port)])
        (if (eq? line #!eof) acc
            (loop (func acc (csv/parse-line line)) (read-line port))
        )
    )
)

(define (csv/parse-table func init port) 
    (let ([format (csv/parse-line (read-line port))]) 
        (define (shim acc elem) 
            (func acc (zip-alist format elem))
        )
        (csv/parse shim init port)
    )
)

(define (csv/parse-table-file func init fname) (call-with-input-file fname (lambda (p) (csv/parse-table func init p))))
3z6pesqy

3z6pesqy1#

如果代码对垃圾收集器的压力太大,则CHICKEN方案可能会表现出病态的不良性能行为。您可以尝试调优GC参数(参见-:?标志的输出,以获得可以传递给程序的运行时选项的完整列表),也可以重写程序以避免产生这么多垃圾。例如,来自评论的建议阅读带有read-char/peek-char的字符和阅读整行都将有助于这一点。
就我个人而言,我确实相信使用特定于实现的特性没有什么错。你不必全押在他们身上。例如,您可以使用cond-expand来引入特定于实现的代码,并定义一个csv/read-line辅助程序过程,该过程使用(chicken io)中的read-line,或者使用read-charpeek-char编写的可移植Scheme变体。Scheme非常适合语言实验,所以不要害怕尝试各种策略,看看它们在可读性/可移植性/性能/性能方面有多大的不同。

相关问题