Pour les codes de César et de Vigenère que nous allons utiliser, il est
important que le texte ne comporte que des caractères alphabétiques (a
- z), sans ponctuation, sans accents, sans caractères particuliers
(@ç&€$œ...)
ni espaces.
On pourra, à titre d'exercice, récupérer un texte intégral, par exemple
sur https://fr.wikisource.org,
et le convertir au format texte (le copier dans le Bloc-notes de Windows
et l'enregistrer au format ANSI pour éviter des encodages particuliers).
Dans l'exemple ci-dessous, le fichier PDF natif (ici, L'Étranger,
d'Albert Camus) était paginé et contenait dans le pied de page une
chaîne de caractères commençant par "Albert Camus" et suivie du numéro de
page.
Il fallait supprimer cette ligne, qui apparaît alors de façon récurrente
dans mon fichier texte.
# Step 1 - remove the recurring page-footer line from the raw text export.
#
# Reads "letranger01.txt", drops every line that starts with the footer
# marker, and writes the remaining lines unchanged to "etranger02.txt".
#
# The input is opened with the platform default encoding; the original note
# says the file should be saved as ANSI from Notepad. An earlier variant
# opened "letranger_camus.txt" with encoding="utf8" instead.
entete = "Albert Camus"  # text that begins each footer line
longueur = len(entete)
print(longueur)
print(entete)

kept_lines = []
with open("letranger01.txt", "r") as source:
    for i, line in enumerate(source):
        # Echo the first 17 lines (before and after filtering) to check
        # visually that the text was decoded correctly.
        preview = i < 17
        if preview:
            print(line)
            print(line[0:longueur])
        if line.startswith(entete):
            line = ""
        kept_lines.append(line)
        if preview:
            print(line)

# Join once instead of growing a string with += inside the loop.
txt = "".join(kept_lines)

with open("etranger02.txt", "w") as destination:
    destination.write(txt)
# Step 2 - turn the line-oriented text into one continuous line.
#
# Reads "etranger02.txt", re-joins words hyphenated across a line break,
# replaces the remaining line breaks with a space, collapses runs of spaces
# and writes the result to "etranger03.txt".
#
# Fixes over the previous version:
#   * txt is rebuilt from the file instead of being appended to whatever an
#     earlier cell left in it (which duplicated the whole text in the output);
#   * the results of the space-collapsing replace() calls are now kept
#     (str.replace returns a new string, it never modifies in place).
with open("etranger02.txt", "r") as source:
    txt = source.read()

txt = txt.replace("-\n", "")    # word split by a hyphen at end of line
txt = txt.replace("- \n", "")   # same, with a trailing space before the break
txt = txt.replace("\n", " ")    # every other line break becomes a space

# Collapse doubled spaces until none remain.
while "  " in txt:
    txt = txt.replace("  ", " ")

with open("etranger03.txt", "w") as destination:
    destination.write(txt)
# Step 3 - reduce the text to unaccented lowercase letters and single spaces.
#
# Reads "etranger03.txt", lowercases it, applies the substitution table
# below, collapses runs of spaces and writes the result to "etranger04.txt".
with open("etranger03.txt", "r") as source:
    texte = source.read()

texte = texte.lower()
print(len(texte))

# (character, replacement) pairs. Accented letters lose their accent,
# punctuation and most digits are dropped or turned into a space.
# NOTE(review): "5", "9" and "!" are mapped to "e" and "4", "8" to a space;
# this is kept as found - confirm it is intentional.
car = (
    ("é", "e"), ("[", ""), ("-", " "), ("'", " "), ("è", "e"), ("ê", "e"),
    ("ë", "e"), ("î", "i"), ("ï", "i"), ("ô", "o"), ("ö", "o"), ("ù", "u"),
    ("û", "u"), ("]", ""), ("(", ""), (")", ""), ("0", ""), ("1", ""),
    ("2", ""), ("3", ""), ("4", " "), ("5", "e"), ("6", ""), ("7", ""),
    ("8", " "), ("9", "e"), ("\"", ""), ("à", "a"), ("â", "a"), ("ä", "a"),
    ("«", ""), ("»", ""), (".", ""), ("!", "e"), ("?", ""), (",", ""),
    (";", ""), (":", ""), ("’", " "), ("œ", "oe"), ("ç", "c"),
)
l = len(car)
print(l)

# One pass over the text with a translation table instead of running every
# replace() on every single character. The result is the same because no
# replacement text is itself a key of the table.
txt = texte.translate(str.maketrans(dict(car)))

# Collapse doubled spaces until none remain.
while "  " in txt:
    txt = txt.replace("  ", " ")
print(len(txt))

with open("etranger04.txt", "w") as destination:
    destination.write(txt)
# Step 3 (variant) - same normalisation as above, but with every space removed.
#
# Reads "etranger03.txt", lowercases it, applies the substitution table
# below, removes all spaces and writes the result to
# "etranger04_sans_espace.txt".
with open("etranger03.txt", "r") as source:
    texte = source.read()

texte = texte.lower()
print(len(texte))

# (character, replacement) pairs. Accented letters lose their accent,
# punctuation and most digits are dropped or turned into a space.
# NOTE(review): "5", "9" and "!" are mapped to "e" and "4", "8" to a space;
# this is kept as found - confirm it is intentional.
car = (
    ("é", "e"), ("[", ""), ("-", " "), ("'", " "), ("è", "e"), ("ê", "e"),
    ("ë", "e"), ("î", "i"), ("ï", "i"), ("ô", "o"), ("ö", "o"), ("ù", "u"),
    ("û", "u"), ("]", ""), ("(", ""), (")", ""), ("0", ""), ("1", ""),
    ("2", ""), ("3", ""), ("4", " "), ("5", "e"), ("6", ""), ("7", ""),
    ("8", " "), ("9", "e"), ("\"", ""), ("à", "a"), ("â", "a"), ("ä", "a"),
    ("«", ""), ("»", ""), (".", ""), ("!", "e"), ("?", ""), (",", ""),
    (";", ""), (":", ""), ("’", " "), ("œ", "oe"), ("ç", "c"),
)
l = len(car)
print(l)

# One pass over the text with a translation table instead of running every
# replace() on every single character. The result is the same because no
# replacement text is itself a key of the table.
txt = texte.translate(str.maketrans(dict(car)))

# Every space is removed, so collapsing doubled spaces first is unnecessary.
txt = txt.replace(" ", "")
print(len(txt))

with open("etranger04_sans_espace.txt", "w") as destination:
    destination.write(txt)
# Step 4 - sanity check: list what is left once letters and spaces are removed.
#
# Reads "etranger04.txt", lowercases it, deletes the letters a-z and the
# space, and writes whatever remains to "etranger05.txt". The printed final
# length is the number of characters that were neither a-z nor a space.
with open("etranger04.txt", "r") as source:
    texte = source.read()

texte = texte.lower()
print(len(texte))

# Characters to delete: the 26 lowercase letters plus the space.
removed = "abcdefghijklmnopqrstuvwxyz "
car = tuple((ch, "") for ch in removed)
l = len(car)
print(l)

# Delete them in a single pass rather than 27 replace() calls per character.
txt = texte.translate(str.maketrans("", "", removed))
print(len(txt))

with open("etranger05.txt", "w") as destination:
    destination.write(txt)
# Complete clean-up pipeline for "disparition01.txt", in a single cell.
#
# Output files, all prefixed with nomfich:
#   <nomfich>02.txt           header/footer lines removed
#   <nomfich>03_avec_esp.txt  line breaks removed, doubled spaces collapsed
#   <nomfich>04_avec_esp.txt  lowercase, accents and punctuation substituted
#   <nomfich>05_sans_esp.txt  same, with every space removed
#
# The input is opened with the platform default encoding; the original note
# says the file should be saved as ANSI from Notepad.
nomfich = "disp"

# Every line starting with this text (page header or footer) is dropped.
# NOTE(review): the marker is still "Albert Camus" although the input file is
# disparition01.txt - confirm it matches that file's header/footer.
entete = "Albert Camus"
longueur = len(entete)
print(longueur)

# To inspect the beginning of the file, read it and print the first 1500
# characters, e.g. print(source.read()[0:1500]).

# --- remove whole header/footer lines ---------------------------------------
with open("disparition01.txt", "r") as source:
    txt = "".join(line for line in source if not line.startswith(entete))

with open(nomfich + "02.txt", "w") as destination:
    destination.write(txt)

# --- remove line breaks and doubled spaces ----------------------------------
txt = txt.replace("-\n", "")    # word split by a hyphen at end of line
txt = txt.replace("- \n", "")   # same, with a trailing space before the break
txt = txt.replace("\n", " ")    # every other line break becomes a space
while "  " in txt:
    txt = txt.replace("  ", " ")

with open(nomfich + "03_avec_esp.txt", "w") as destination:
    destination.write(txt)

# --- replace accented characters, punctuation and digits --------------------
texte = txt.lower()
print(len(texte))

# (character, replacement) pairs.
# NOTE(review): "5", "9" and "!" are mapped to "e" and "4", "8" to a space;
# this is kept as found - confirm it is intentional.
car = (
    ("é", "e"), ("[", ""), ("-", " "), ("—", " "), ("'", " "), ("è", "e"),
    ("ê", "e"), ("ë", "e"), ("î", "i"), ("ï", "i"), ("ô", "o"), ("ö", "o"),
    ("ù", "u"), ("û", "u"), ("]", ""), ("(", ""), (")", ""), ("0", ""),
    ("1", ""), ("2", ""), ("3", ""), ("4", " "), ("5", "e"), ("6", ""),
    ("7", ""), ("8", " "), ("9", "e"), ("\"", ""), ("à", "a"), ("â", "a"),
    ("ä", "a"), ("«", ""), ("»", ""), (".", ""), ("!", "e"), ("?", ""),
    (",", ""), (";", ""), (":", ""), ("’", " "), ("œ", "oe"), ("ç", "c"),
    ("•", ""),
)
l = len(car)
print(l)

# One pass over the text with a translation table instead of running every
# replace() on every single character. The result is the same because no
# replacement text is itself a key of the table.
txt = texte.translate(str.maketrans(dict(car)))
while "  " in txt:
    txt = txt.replace("  ", " ")
print(len(txt))

with open(nomfich + "04_avec_esp.txt", "w") as destination:
    destination.write(txt)

# --- remove every space ------------------------------------------------------
txt = txt.replace(" ", "")

with open(nomfich + "05_sans_esp.txt", "w") as destination:
    destination.write(txt)