import React from 'react'

import { Text } from '@primer/components'

import Code from '../../components/Code'
import Snippet from '../../components/Snippet'

import { CATEGORY_INTRODUCE_BUGS as category, IMPACT_MEDIUM as impact } from '../../constants'


// File name attached to the example snippets below.
export const exampleTitle = 'catalogue.py'

// Python snippet that opens a file without passing an encoding.
export const exampleBefore = [
  "with open('some/path.txt') as f:",
  '    line_one = f.read()',
].join('\n')

// The same snippet with an explicit encoding passed to open().
export const exampleAfter = [
  "with open('some/path.txt', encoding='utf_8') as f:",
  '    line_one = f.read()',
].join('\n')

// Rule identifier; also used to build the og image path below.
export const code = 'UseFileEncodingRead'

export const ogImage = `/og-image/${code}.png`

export const title = 'Specify text encoding when reading files'

export const label = 'Use file encoding read'

// Hyphenated, lower-case form of the rule name.
export const wordCode = 'use-file-encoding-read'

// External links listed for this rule; each entry is an { href, text } pair.
export const furtherReading = [
  {
    href: 'https://en.wikipedia.org/wiki/ISO/IEC_8859-1',
    // The standard's name is "ISO/IEC 8859-1", matching the linked article.
    text: 'Wikipedia page for the ISO/IEC 8859-1 text encoding.',
  },
]

export function Summary(props) {
  return (
    <Text as={'p'} className={props.className}>
      Not specifying <Code>encoding</Code> when reading a file can cause <Code>UnicodeDecodeError</Code> because Python assumes the file is encoded with the OS's default text encoding, but that's often an invalid assumption.
    </Text>
  )
}


export const explanation = (
  <>
    <Text as='p'>Files are stored as bytes. Therefore before we can save a Python string to disk the string must be serialising to bytes, and conversely it's necessary to decode those bytes back to string in order to read the file from disk. There are a <a href="https://docs.python.org/3/library/codecs.html#standard-encodings" target="_blank">variety of different text serialisation codecs</a> that handle this encoding and decoding, which are collectively referred to as text encoding. In order to make sense of bytes and decode them correctly it's necessary to know what text encoding was used when it was saved to disk.</Text>
    <Text as='p'>By default Python assumes the file is encoded with the OS's default text encoding, and according to <a href="https://www.python.org/dev/peps/pep-0597/" target="_blank">PEP 0597</a>, 12% of the most popular packages on PyPI fail during installation on Windows because of this assumption. Those packages have setup.py files that do: </Text>
    <Snippet value={`
setup(
    ...
    long_description=open("README.md").read()
    ...
)
`} />
    <Text as='p'>That may look OK. On Mac and Linux it will <i>probably</i> work fine, but it's actually a common mistake that introduces a bug that will primarily effect Windows (aka <a href="https://www.jetbrains.com/lp/python-developers-survey-2020/">50% of all Python developers</a>): for Python running on Windows README.md will be opened using the ASCII-based ISO-8859 text encoding. What if README.md contains Unicode characters like ś? Then Python tries to decode the bytes representing ś to ASCII and a <Code>UnicodeDecodeError</Code> exception will occur because there is no way to fit ś into <a href="https://en.wikipedia.org/wiki/ISO/IEC_8859-1" target="_blank">ASCII-ish character range</a>. This problem is less likely to happen on Mac and Linux as the default text encoding is usually utf-8 for those systems, which can handle Unicode characters like ś.</Text>
    <Text as='p'>The encoding problem can be solved by changing setup.py to instead do:</Text> 
    <Snippet value={`
setup(
    ...
    long_description=open("README.md", encoding="utf-8").read()
    ...
)
`} />
  <Text as='p'>This problem has been recognised by the Python community and <a href="https://www.python.org/dev/peps/pep-0597/" target="_blank">PEP 0597</a> highlights the issue, as a result if <Code>encoding</Code> is not used Python 3.10 then a <Code>EncodingWarning</Code> can be raised. You can view the raw stats for the 12% failures <a href="https://github.com/methane/pep597-pypi-ascii" target="_blank">here</a>, and see an example of an affected library being fixed <a href="https://github.com/pypa/packaging.python.org/pull/682" target="_blank">here</a>. </Text>
  <Text as='p'>Our code best practice checker infers the encoding of the file and suggests that encoding is used during <Code>open</Code>.</Text> 
  </>
)


// Re-export the constants imported from '../../constants' under their
// aliased names `category` and `impact`.
export {category, impact}