converter xdxf to csv

This commit is contained in:
Морозов Андрей 2022-05-02 20:05:07 +04:00
parent 8cf21b5e62
commit 07cda3e10b
1 changed file with 92 additions and 33 deletions

123
main.go
View File

@ -2,37 +2,47 @@ package main
import ( import (
"bufio" "bufio"
"encoding/csv"
"fmt" "fmt"
"log" "log"
"os" "os"
"strings" "strings"
"sync"
"time"
) )
type Words struct {
word string
translation string
transcription string
}
var ( var (
full_name string full_name string
lang_from string lang_from string
lang_to string lang_to string
) )
const (
FIRST_LINES = 5
)
func main() { func main() {
f, err := os.OpenFile("dict.xdxf", os.O_RDONLY, os.ModePerm) fcsv, err := os.Create("./dict.csv")
if err != nil {
fmt.Println(err)
}
defer fcsv.Close()
writer := csv.NewWriter(fcsv)
writer.Comma = '|'
fxdfx, err := os.OpenFile("dict.xdxf", os.O_RDONLY, os.ModePerm)
if err != nil { if err != nil {
log.Fatalf("open file error: %v", err) log.Fatalf("open file error: %v", err)
return return
} }
defer f.Close() defer fxdfx.Close()
fullFile(f) fullFile(fxdfx, writer)
} }
func fullFile(file *os.File) { func fullFile(fxdfx *os.File, fcsv *csv.Writer) {
sc := bufio.NewScanner(file) sc := bufio.NewScanner(fxdfx)
/*for sc.Scan() { /*for sc.Scan() {
fmt.Println(sc.Text()) fmt.Println(sc.Text())
}*/ }*/
@ -42,7 +52,7 @@ func fullFile(file *os.File) {
} }
//skip first lines with some unnecessary data and save full name //skip first lines with some unnecessary data and save full name
first:="" first := ""
for sc.Scan() { for sc.Scan() {
line := sc.Text() line := sc.Text()
if strings.Contains(line, "<full_name>") { if strings.Contains(line, "<full_name>") {
@ -57,41 +67,90 @@ func fullFile(file *os.File) {
lang_to = strings.Split(tmp[2], " ")[0] lang_to = strings.Split(tmp[2], " ")[0]
} }
if strings.Contains(line, "<ar><k>") { if strings.Contains(line, "<ar><k>") {
first= line first = line
break break
} }
} }
fmt.Printf("Dict name: %s.\n", full_name) fmt.Printf("Dict name: %s.\n", full_name)
fmt.Printf("From %s to %s. \n", lang_from, lang_to) fmt.Printf("From %s to %s. \n", lang_from, lang_to)
fmt.Println(first) //fmt.Println(first)
// read file line by line
var (
word string
translation string
transcription string
)
for sc.Scan() { // read file line by line
line := sc.Text() neww := new(Words)
if strings.Contains(line, "<ar>") { for {
word = "" first = strings.TrimSpace(first)
translation = "" if first == "" {
transcription = "" if sc.Scan() {
fmt.Println(word, translation, transcription) first += sc.Text()
} else {
break
}
} }
if strings.Contains(line, "<k>") { if strings.Contains(first, "<ar>") {
_, word, _ = strings.Cut(line, "<k>") neww.word = ""
neww.translation = ""
neww.transcription = ""
//fmt.Println(neww)
}
if strings.Contains(first, "<k>") {
_, neww.word, _ = strings.Cut(first, "<k>")
if strings.Contains(neww.word, "</k>") {
neww.word, first, _ = strings.Cut(neww.word, "</k>")
first = strings.TrimSpace(first)
if first == "" {
if sc.Scan() {
first += sc.Text()
} else {
break
}
}
}
} else if strings.Contains(first, "<tr>") {
_, neww.transcription, _ = strings.Cut(first, "<tr>")
if strings.Contains(neww.transcription, "</tr>") {
neww.transcription, first, _ = strings.Cut(neww.transcription, "</tr>")
first = strings.TrimSpace(first)
if first == "" {
if sc.Scan() {
first += sc.Text()
} else {
break
}
}
}
} else if strings.Contains(first, "</ar>") {
neww.translation, first, _ = strings.Cut(first, "</ar>")
first = strings.TrimSpace(first)
first = strings.TrimSpace(first)
if first == "" {
if sc.Scan() {
//fmt.Printf("Word: %s. Transcription: %s. Translation: %s.\n", neww.word, neww.transcription, neww.translation)
err := fcsv.Write([]string{neww.word, neww.transcription, neww.translation})
if err != nil {
fmt.Println(err)
}
first += sc.Text()
} else {
break
}
}
} else {
if sc.Scan() {
//fmt.Printf("Word: %s. Transcription: %s. Translation: %s.\n", neww.word, neww.transcription, neww.translation)
first += sc.Text()
} else {
break
}
} }
fmt.Println(sc.Text())
} }
} }
func processChunk(chunk []byte, linesPool *sync.Pool, stringPool *sync.Pool, start time.Time, end time.Time) { /*func processChunk(chunk []byte, linesPool *sync.Pool, stringPool *sync.Pool, start time.Time, end time.Time) {
} }
//readFile opens the file in chunks and passes each chunk to another func for processing. https://medium.com/swlh/processing-16gb-file-in-seconds-go-lang-3982c235dfa2 //readFile opens the file in chunks and passes each chunk to another func for processing. https://medium.com/swlh/processing-16gb-file-in-seconds-go-lang-3982c235dfa2
func readFile() { func readFile() {
} }*/