# HG changeset patch # User Nina Engelhardt # Date 1348581333 -7200 # Node ID 897f711a7157c133b3bbdd2d9bb2e924baa9de8b # Parent 11d15c47beaf309fb4478842555b5448df024fbf rearrange to work with autoconf diff -r 11d15c47beaf -r 897f711a7157 COPYING.GPLv3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/COPYING.GPLv3 Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. 
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. 
+ + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. 
If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. 
For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. 
Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. 
This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>. 
diff -r 11d15c47beaf -r 897f711a7157 README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,79 @@ +App: h264dec + +This application decodes H.264 raw videos. + +Build Sequential/Pthreads: + +autoreconf -i -f +mkdir build +cd build +../configure --enable-ssse3 --enable-sdl2 +make + +Build OmpSs: + +autoreconf -i -f +mkdir build-ss +cd build-ss +../configure CC=sscc --enable-ssse3 --enable-sdl2 +make + +ssse3 enables assembler optimizations up to ssse3 (optional) +sdl2 enables a rudimentary viewing capability (optional) + +Usage Sequential/Pthreads: +./h264dec -i $(INPUT_VIDEO) -s +./h264dec -i $(INPUT_VIDEO) -t $(THREADS) + +Usage OmpSs: +NX_PES=$(THREADS) ./h264dec -i $(INPUT_VIDEO) -e $(ENTROPY_BUFFERS) -z $(MB_WIDTH) $(MB_HEIGHT) --static-3d + +-e specifies the number of entropy decode pipeline buffers, which should ideally be +the same as the number of threads. + +-z sets the MB reconstruction grouped block size. A size between 6 by 6 and 10 by 10 +was found to strike a good balance between overhead and parallelism, but is machine and input +dependent. + +--static-3d performs overlapping wavefront decoding. + +General usage: +-d displays output +-f fullscreen +-o $(OUT_FILE) write raw YUV +-v show framerate + + +The INPUT_VIDEOs are in "inputs_encore", but the decoder should be able to decode any raw H.264 stream using +one slice per frame, non-interlaced, and CABAC, YUV420. + + +Integrated OmpSs player demo +---------------------------- +NOTE: for the player demo SDL2 must be installed. + +1. Go to the OmpSs build directory (/home/cchi/Projects/ffmpeg_smp/build-ss) + +2. 
Launch the H.264 decoder with the desired options: + +NX_PES=$(THREADS) ./h264dec -i $(INPUT_VIDEO) -v (verbose) -e $(ENTROPY_BUFFERS) -z $(MB_WIDTH) $(MB_HEIGHT) -d (display) -f (fullscreen) + +note that $(ENTROPY_BUFFERS) should be equal to or higher than $(THREADS) for optimal performance + +Examples: + +NX_PES=7 ./h264dec -i ../../h264_movies/park_joy_2160px5.h264 -v -z 8 8 -df -e 9 +NX_PES=7 ./h264dec -i ../../h264_movies/big_buck_bunny_1080p24.h264 -v -d -z 6 6 -e 9 + +Interacting with the program +---------------------------- + Fullscreen mode + Window mode + Pause/resume + Show/hide macroblock borders + When macroblock borders are shown resizes the macroblocks + Close + +Force close in case of lockup +----------------------------- +On a terminal: killall -9 h264dec diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/benchmark.sh --- a/ffmpeg_smp/benchmark.sh Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ -#! /bin/bash - -workers=(1 4 8 12 16 20 24 28 32) -cpus=(0 3 7 15 15 23 23 31 31) -nodes=(0 0 0 1 1 2 2 3 3) - -confs=( "1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 8" "3 6 10" "4 7 12" "4 8 15" "5 8 17" #small - "1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 7" "3 6 9" "4 7 12" "4 8 13" "5 10 15") #large - - - -#confsmall=("1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 8" "3 6 10" "4 7 12" "4 8 15" "5 8 17") -# "7 10 21" "8 12 25" "10 15 29" "11 17 32") -#conflarge=("1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 7" "3 6 9" "4 7 12" "4 8 13" "5 10 15") -#"5 12 21" "6 15 25" "7 17 30" "8 19 36") - - -configs=9 - -average_ompss_2d=0 -average_ompss_3d=0 -average_pthread=0 -average_serial=0 - -iterations_low=4 -iterations_high=8 - -nframes=10000 # max frames limit for debug purpose -inputs=("14" "10") -inputs_vebose=("Big Bug Bunny 1920x1080 10000 frames" "Park Joy 3840x2160 2500 frames") -osargs=("-z 8 8" "-z 12 12 --static-3d") - -time_stamp=`date +%Y.%m.%d_%H.%M.%S` -outputdir="/home/stefan.hauser/ffmpeg_smp/ppopp_results/rx600s5-1t/$time_stamp" -ompss_2d="$outputdir/ompss_2d.txt" -ompss_3d="$outputdir/ompss_3d.txt" -pthread="$outputdir/pthread.txt" 
-serial="$outputdir/serial.txt" - -#executes the experiments for a single conf $1=confnum $2 iterations $3 input_idx -function execute_single_conf { - conf=$1 - iter=$2 - iidx=$3 - - average_ompss_2d=0 - average_ompss_3d=0 - average_pthread=0 - - echo "Workers: " ${workers[$conf]} | tee -a $ompss_2d $ompss_3d $pthread $serial - - cd build-ss - for ((i=1;i<=$iter;i+=1)); do - # OMPSS - #export CSS_NUM_CPUS=$worker - NX_PES=${workers[$conf]} numactl --interleave=0-${nodes[$conf]} time -p ./ffmpeg -i ${inputs[$iidx]} -n $nframes -e $((${workers[$conf]}+1)) ${osargs[0]} 2> output - runtime=$(cat output | grep real | sed s/^.*l.//g) - average_ompss_2d=$(echo "$average_ompss_2d + $runtime"|bc) - echo -n $runtime " " >> $ompss_2d - done - - for ((i=1;i<=$iter;i+=1)); do - NX_PES=${workers[$conf]} numactl --interleave=0-${nodes[$conf]} time -p ./ffmpeg -i ${inputs[$iidx]} -n $nframes -e $((${workers[$conf]}+1)) ${osargs[1]} 2> output - runtime=$(cat output | grep real | sed s/^.*l.//g) - average_ompss_3d=$(echo "$average_ompss_3d + $runtime"|bc) - echo -n $runtime " " >> $ompss_3d - done - cd .. - - cd build - for ((i=1;i<=$iter;i+=1)); do - # Pthreads - numactl --physcpubind=0-$((${cpus[$conf]})) time -p ./ffmpeg -i ${inputs[$iidx]} -n $nframes -t ${confs[$(($conf + $iidx * $configs))]} 2> output - runtime=$(cat output | grep real | sed s/^.*l.//g) - average_pthread=$(echo "$average_pthread + $runtime"|bc) - echo -n $runtime " " >> $pthread - done - cd .. - - echo "" | tee -a $pthread $ompss_2d $ompss_3d - average_ompss_2d=$(echo "scale=5;$average_ompss_2d/$iter"|bc) - average_ompss_3d=$(echo "scale=5;$average_ompss_3d/$iter"|bc) - average_pthread=$(echo "scale=5;$average_pthread/$iter"|bc) - - echo "time: " $average_ompss_2d >> $ompss_2d - echo "time: " $average_ompss_3d >> $ompss_3d - echo "time: " $average_pthread >> $pthread - echo "time: " $average_serial >> $serial -} - - -mkdir $outputdir - -echo "Processing inputs ..." 
- -echo "h264dec Benchmark" | tee $ompss_2d $ompss_3d $pthread $serial - -for n in 0 1; do - echo "Input: ${inputs_vebose[$n]}" | tee -a $ompss_2d $ompss_3d $pthread $serial - echo "" | tee -a $ompss_2d $ompss_3d $pthread $serial - - # Serial - cd build - numactl --physcpubind=0 time -p ./ffmpeg -i ${inputs[$n]} -n $nframes -s 2> output - runtime=$(cat output | grep real | sed s/^.*l.//g) - average_serial=$runtime - cd .. - - execute_single_conf 0 1 $n - - #Parallel - for ((confidx=1;confidx<=4;confidx+=1)); do - execute_single_conf $confidx $iterations_low $n - done - - for ((confidx=5;confidx<=$(($configs-1));confidx+=1)); do - execute_single_conf $confidx $iterations_high $n - done - - echo "-------------------" | tee -a $ompss_2d $ompss_3d $pthread $serial -done - -echo "FINISHED" - -rm build/output build-ss/output - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/COPYING.GPLv3 --- a/ffmpeg_smp/h264dec/COPYING.GPLv3 Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,674 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. 
Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. 
If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. 
Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. 
- - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. 
Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. 
- - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. 
- - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. 
If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. 
- - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. 
- - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the 
material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. 
- - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. 
If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. 
- - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - <program> Copyright (C) <year> <name of author> - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<http://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/README.txt --- a/ffmpeg_smp/h264dec/README.txt Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -App: h264dec - -This application decodes H.264 raw videos. - -Build Sequential/Pthreads: - -autoreconf -i -f -mkdir build -cd build -../configure --enable-ssse3 --enable-sdl2 -make - -Build OmpSs: - -autoreconf -i -f -mkdir build -cd build-ss -../configure CC=sscc --enable-ssse3 --enable-sdl2 -make - -ssse3 enables assembler optimizations up to ssse3 (optional) -sdl enables a rudimentary viewing capability (optional) - -Usage Sequential/Pthreads: -./h264dec -i $(INPUT_VIDEO) -s -./h264dec -i $(INPUT_VIDEO) -t $(THREADS) - -Usage OmpSs: -NX_PES= ./h264dec -i -e -z --static-3d - --e specify the number of entropy decode pipeline buffers and should be ideally -the same as the number of threads. - --z allows to set the MB reconstruction grouped block size. A size between 6 by 6 to 10 by 10 -was found to strike a good balance between overhead and parallelism, but is machine and input -dependent. - ---static-3d performs overlapping wavefront decoding. - -General usage: --d displays output --f fullscreen --o $(OUT_FILE) write raw YUV --v show framerate - - -The INPUT_VIDEOs are in "inputs_encore", but should be able to decode any raw H.264 stream using -one slice per frame, non-interlaced, and CABAC, YUV420. - - -Integrated OmpSs player demo ----------------------------- -NOTE: for the player demo SDL2 must be installed. - -1. Go to the OmpSs build directory (/home/cchi/Projects/ffmpeg_smp/build-ss) - -2. 
Launch the H.264 decoder with the desired options: - -NX_PES= ./h264dec -v (verbose) -e -z -d (display) -f (fullscreen) - -note that should be equal or higher than for optimal performance - -Examples: - -NX_PES=7 ./h264dec -i ../../h264_movies/park_joy_2160px5.h264 -v -z 8 8 -df -e 9 -NX_PES=7 ./h264dec -i ../../h264_movies/big_buck_bunny_1080p24.h264 -v -d -z 6 6 -e 9 - -Interacting with the program ----------------------------- - Fullscreen mode - Window mode - Pause/resume - Show/hide macroblock borders - When macroblock borders are shown resizes the macroblocks - Close - -Force close in case of lockup ------------------------------ -On a terminal: killall -9 h264dec diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/configure.ac --- a/ffmpeg_smp/h264dec/configure.ac Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,171 +0,0 @@ -# -*- Autoconf -*- -# Process this file with autoconf to produce a configure script. - -AC_PREREQ(2.61) -AC_INIT([h264_mt], [0.1], [cchi@cs.tu-berlin.de]) -#AM_INIT_AUTOMAKE(AC_PACKAGE_NAME, AC_PACKAGE_VERSION) -AM_INIT_AUTOMAKE([-Wall -Werror foreign]) - -AC_CONFIG_SRCDIR([h264dec.c]) -AC_PROG_RANLIB - -# Checks for programs. -AC_GNU_SOURCE -AC_PROG_CC -AM_CONDITIONAL([HAVE_OMPSS], [test $CC = "sscc"]) -AC_DEFINE([OMPSS], [0], [Define to 1 on when using the OmpSs compiler sscc]) -if test $CC = "sscc";then -AC_DEFINE([OMPSS], [1], [Define to 1 on when using the OmpSs compiler sscc]) -fi - -#if [ test -n "${CFLAGS+x}" ] ; then -# CFLAGS="-O3 -g" -#fi - -# Checks for libraries. -AC_CHECK_LIB([pthread], [pthread_yield]) -AC_CHECK_LIB([spe2], [spe_image_open]) -AC_CHECK_LIB([sync], [mutex_init]) -AC_CHECK_LIB([rt], [clock_gettime]) - -AC_ARG_ENABLE([sdl2], AS_HELP_STRING([--enable-sdl2], [Enable SDL2 playback])) -if test "$enable_sdl2" = "yes"; then - AC_CHECK_LIB([SDL2], [SDL_CreateWindow], [], [echo "Error! libSDL2 required for playback." 
exit -1]) -fi - -if test "$enable_sdl2" = "yes"; then - AC_CHECK_LIB([X11], [XInitThreads], [], [echo "Error! libX11 currently required for SDL2 workaround." exit -1]) -fi - -AC_ARG_ENABLE([sdl_ttf], AS_HELP_STRING([--enable-sdl_ttf], [Enable SDL_ttf for overlaying fonts])) -if test "$enable_sdl_ttf" = "yes"; then - AC_CHECK_LIB([SDL_ttf], [TTF_Init], [], [echo "Error! libSDL_ttf required for font rendering." exit -1]) -fi - - - -AC_ARG_ENABLE([opencl], AS_HELP_STRING([--enable-opencl], [Enable GPU decoder])) -if test "$enable_opencl" = "yes"; then - AC_CHECK_LIB([OpenCL], [clGetPlatformIDs], [], [echo "Error! libOpenCL required for GPU functionality." exit -1]) -fi -AM_CONDITIONAL([HAVE_OPENCL], [test "$enable_opencl" = "yes"]) - - -# Checks for header files. -AC_HEADER_STDC -AC_CHECK_HEADERS([stdint.h stdlib.h string.h unistd.h]) - -# Checks for typedefs, structures, and compiler characteristics. -AC_C_CONST -AC_TYPE_UINT32_T -AC_TYPE_UINT64_T -AC_TYPE_UINT8_T -AC_C_VOLATILE -AC_C_BIGENDIAN - -# Checks for library functions. 
-AC_CHECK_FUNCS([malloc realloc memalign posix_memalign memmove memset]) - -AC_CANONICAL_HOST -AC_CANONICAL_BUILD - -AC_MSG_CHECKING([for architecture]) - -AC_DEFINE([ARCH_ARM], [0], [Define to 1 on arm architectures.]) -AC_DEFINE([ARCH_X86_32], [0], [Define to 1 on x86 architectures.]) -AC_DEFINE([ARCH_X86_64], [0], [Define to 1 on x86_64 architectures.]) -AC_DEFINE([ARCH_X86], [ARCH_X86_32 ||ARCH_X86_64], [True on x86]) -AC_DEFINE([ARCH_PPC], [0], [Define to 1 on ppc architectures.]) -AC_DEFINE([ARCH_PPC64], [0], [Define to 1 on ppc64 architectures.]) -AC_DEFINE([ARCH_CELL], [0], [Define to 1 on cell architectures.]) - -if test "$enable_optimizations" != "no"; then - case $build_cpu in - arm ) - arch="arm" - AC_MSG_RESULT([arm]) - AC_DEFINE([ARCH_ARM], [1], [Define to 1 on arm architectures.]) - ;; - i686 ) - arch="x86" - AC_MSG_RESULT([x86]) - AC_DEFINE([ARCH_X86_32], [1], [Define to 1 on x86 architectures.]) - ;; - x86_64 ) - arch="x86_64" - AC_MSG_RESULT([x86_64]) - AC_DEFINE([ARCH_X86_64], [1], [Define to 1 on x86 architectures.]) - ;; - powerpc64 ) - AC_DEFINE([HAVE_BIGENDIAN], [1], [Define to 1 on bigendian architectures.]) - if grep -E ^cpu /proc/cpuinfo | grep -q Cell ; then - arch="cell" - AC_MSG_RESULT([cell]) - AC_DEFINE([ARCH_CELL], [1], [Define to 1 on cell architectures.]) - else - arch="powerpc64" - AC_MSG_RESULT([ppc64]) - AC_DEFINE([ARCH_PPC64], [1], [Define to 1 on ppc64 architectures.]) - fi - ;; - * ) - AC_MSG_RESULT([default (little endian).]) - ;; - esac -fi - -AM_CONDITIONAL([HAVE_CELL], [test $arch = "cell"]) - -# Additional options -AC_ARG_ENABLE([optimizations], AS_HELP_STRING([--disable-optimizations], [Disable all architecture specific optimizations. 
Compiler optimizations are not disabled.])) - -AC_DEFINE([HAVE_SSE], [0], [Define to 1 to enable sse optimizations.]) -AC_DEFINE([HAVE_MMX], [0], [Define to 1 to enable mmx optimizations.]) -AC_DEFINE([HAVE_MMX2], [0], [Define to 1 to enable mmx2 optimizations.]) -AC_DEFINE([HAVE_SSSE3], [0], [Define to 1 to enable ssse3 optimizations.]) -AC_DEFINE([HAVE_ALTIVEC], [0], [Define to 1 to enable altivec optimizations.]) -AC_DEFINE([HAVE_NEON], [0], [Define to 1 to enable neon optimizations.]) - -AC_ARG_ENABLE([ssse3], AS_HELP_STRING([--enable-ssse3], [Enable ssse3 optimizations])) -if test "$enable_ssse3" = "yes"; then - AC_DEFINE([HAVE_SSSE3], [1], [Define to 1 to enable ssse3 optimizations.]) - AC_DEFINE([HAVE_SSE], [1], [Define to 1 to enable sse optimizations.]) - AC_DEFINE([HAVE_MMX], [1], [Define to 1 to enable mmx optimizations.]) - AC_DEFINE([HAVE_MMX2], [1], [Define to 1 to enable mmx2 optimizations.]) - ARCH_SUBDIR=x86 -fi - -AC_ARG_ENABLE([sse], AS_HELP_STRING([--enable-sse], [Enable sse optimizations])) -if test "$enable_sse" = "yes"; then - AC_DEFINE([HAVE_SSE], [1], [Define to 1 to enable sse optimizations.]) - AC_DEFINE([HAVE_MMX], [1], [Define to 1 to enable mmx optimizations.]) - AC_DEFINE([HAVE_MMX2], [1], [Define to 1 to enable mmx2 optimizations.]) - ARCH_SUBDIR=x86 -fi - -AC_ARG_ENABLE([altivec], AS_HELP_STRING([--enable-altivec], [Enable altivec optimizations])) -if test "$enable_altivec" = "yes"; then - AC_DEFINE([HAVE_ALTIVEC], [1], [Define to 1 to enable altivec optimizations.]) - ARCH_SUBDIR="$ARCH_SUBDIR ppc" - TMPCLAGS=$CFLAGS - CFLAGS="$CFLAGS -maltivec" - AC_CHECK_HEADERS(altivec.h) - CFLAGS=$TMPCLAGS -fi - -AC_ARG_ENABLE([neon], AS_HELP_STRING([--enable-neon], [Enable neon optimizations])) -if test "$enable_neon" = "yes"; then - AC_DEFINE([HAVE_NEON], [1], [Define to 1 to enable neon optimizations.]) - ARCH_SUBDIR=arm -fi - -AM_CONDITIONAL([HAVE_ARCH_SUBDIR], [test "$ARCH_SUBDIR" != ""]) -AC_SUBST([ARCH_SUBDIR]) - -AC_DEFINE([HAVE_NEON], 
[0], [Define to 1 to enable neon optimizations.]) - -AC_CONFIG_HEADER([config.h]) - -AC_CONFIG_FILES([Makefile libavutil/Makefile libavcodec/Makefile libavcodec/x86/Makefile libavcodec/ppc/Makefile libavcodec/cell/Makefile]) - -AC_OUTPUT diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/h264dec.c --- a/ffmpeg_smp/h264dec/h264dec.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,288 +0,0 @@ -/* -* H264 decoder main -*/ - -#include "config.h" -#include "libavcodec/h264.h" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - - -static const char program_name[] = "h264dec"; -static const int program_birth_year = 2010; - -static const char *file_name; -static int ifile, ofile; -static int no_arch =0; -static int parallel = 1; -static int frame_width = 0; -static int frame_height = 0; - -static void av_exit(int ret) -{ - //do some free calls -#undef exit - exit(ret); -} - -static void opt_input_file(const char *filename) -{ - /* open the input file */ - ifile = open(filename, O_RDONLY, 0666); - if (ifile < 0){ - fprintf(stderr, "Failed to open %s\n", filename); - av_exit(-1); - } - - //parse first frame to get resolution (other information available but not used) - H264Slice slice; - PictureInfo pi; - GetBitContext gb = {0,}; - ParserContext *pc; - NalContext *nc; - - pc = get_parse_context(ifile); - nc = get_nal_context(0, 0); - - memset(&slice, 0, sizeof(H264Slice)); - slice.current_picture_info=π - - av_read_frame_internal(pc, &gb); - decode_nal_units(nc, &slice, &gb); - - frame_width = nc->width; - frame_height= nc->height; - - //clean up - av_freep(&gb.raw); - if (gb.rbsp) - av_freep(&gb.rbsp); - free_parse_context(pc); - free_nal_context(nc); - - //rewind file - int offset; - if ( (offset=lseek(ifile, 0, SEEK_SET)) ){ - fprintf(stderr, "Rewind input file %s failed at offset %d\n", filename, offset); - } - -} - -static void opt_output_file(const char 
*filename) -{ - if (filename){ - if (!strcmp(filename, "-")) - filename = "pipe:"; - - ofile = open(filename, O_CREAT | O_TRUNC | O_WRONLY, 0666); - }else{ - ofile =0; - } -} - -static void show_usage(void) -{ - printf("usage: ffmpeg [options] -i infile }...\n"); - printf("\n"); -} - -static struct option long_options[] = { - {"static-sched", 0, 0, 0}, - {"static-mbd", 0, 0, 0}, - {"numamap", 0, 0, 0}, - {"no-mbd", 0, 0, 0}, - {"static-3d", 0, 0, 0}, - {"slice-bufs", 1, 0, 0}, - {"smt", 0, 0, 0}, - {"noarch", 0, 0, 'a'}, - {"display", 0, 0, 'd'}, - {"fullscreen", 0, 0, 'f'}, - {"numframes", 1, 0, 'n'}, - {"use-ppe-ed", 1, 0, 'p'}, - {"sequential", 0, 0, 's'}, - {"threads", 1, 0, 't'}, - {"verbose", 1, 0, 'v'}, - {"wave-order", 1, 0, 'w'}, - {"smb-size", 1, 0, 'z'}, - {"pipe-bufs", 1, 0, 'e'}, - {0, 0, 0, 0} -}; - -static h264_options cli_opts; -static void parse_cmd(int argc, char **argv) -{ - int c; - int digit_optind = 0; - int option_index = 0; - char ofile_name[1024]; - extern char *optarg; - extern int optind, optopt; - - cli_opts.statsched =0; - cli_opts.numamap =0; - cli_opts.statmbd =0; - cli_opts.no_mbd= 0; - cli_opts.numframes = INT_MAX; - cli_opts.display=0; - cli_opts.fullscreen=0; - cli_opts.verbose=0; - cli_opts.ppe_ed=0; - cli_opts.profile=0; - cli_opts.threads = 1; - cli_opts.smb_size[0] = cli_opts.smb_size[1] = 1; - cli_opts.wave_order=0; - cli_opts.static_3d=0; - cli_opts.pipe_bufs=8; - cli_opts.slice_bufs=1; - cli_opts.smt= 0; - while ((c = getopt_long(argc, argv, "ade:fi:n:o:p:st:vwz:", long_options, &option_index)) != -1 ){ - int this_option_optind = optind ? 
optind : 1; - - switch (c){ - case 0: - if (option_index==0){ - cli_opts.statsched=1; - }else if (option_index==1){ - cli_opts.statmbd= 1; - }else if (option_index==2){ - cli_opts.numamap= 1; - }else if (option_index==3){ - cli_opts.no_mbd= 1; - }else if (option_index==4){ - cli_opts.static_3d= 1; - }else if (option_index==5){ - cli_opts.slice_bufs= (unsigned) atoi(optarg); - }else if (option_index==6){ - cli_opts.smt= 1; - } - break; - case '0': - case '1': - case '2': - if (digit_optind != 0 && digit_optind != this_option_optind) - printf("digits occur in two different argv-elements.\n"); - digit_optind = this_option_optind; - printf("option %c\n", c); - break; - case 'a': - no_arch=1; - break; - case 'd': - cli_opts.display=1; - break; - case 'f': - cli_opts.fullscreen=1; - break; - case 'i': - file_name = (const char *)optarg; - opt_input_file(file_name); - break; - case 'n': - cli_opts.numframes = (unsigned) atoi(optarg); - break; - case 'o': - strcpy(ofile_name, optarg); - opt_output_file(ofile_name); - break; - case 'p': - cli_opts.profile = (unsigned) atoi(optarg); - break; - case 's': - cli_opts.threads = 0; - parallel = 0; - break; - case 't': - cli_opts.threads = atoi(optarg); - if (cli_opts.threads<=0){ - fprintf(stderr, "Option -%c requires thread numbers > 0\n", c); - av_exit(-1); - } - break; - case 'v': - cli_opts.verbose = 1; - break; - case 'w': - cli_opts.wave_order = 1; - break; - case 'z': // only useful in ompss - if (argc < optind +1){ - fprintf(stderr, "Option -%c (--smb-size) requires 2 arguments\n", c); - av_exit(-1); - } - optind--; - for (int i=0; i<2; i++){ - cli_opts.smb_size[i] = atoi(argv[optind++]); - if (!(cli_opts.smb_size > 0)){ - fprintf(stderr, "Option -%c (--smb-size) requires dimensions > 0\n", c); - av_exit(-1); - } - } - break; - case 'e': - cli_opts.pipe_bufs = atoi(optarg); - break; - case ':': - fprintf(stderr, "Option -%c requires an operand\n", optopt); - av_exit(-1); - break; - case '?': - fprintf(stderr, 
"Unrecognized option: -%c\n", optopt); - av_exit(-1); - break; - } - } - -} - -int main(int argc, char **argv) -{ - /* parse options */ - parse_cmd(argc, argv); - - if(!ifile ) { - show_usage(); - av_exit(1); - } - - H264Context *h = get_h264dec_context(file_name, ifile, ofile, frame_width, frame_height, &cli_opts); -#if OMPSS - if (h264_decode_ompss( h ) < 0) - av_exit(-1); -#else - if (parallel){ - if (ARCH_CELL && !no_arch){ - if (h264_decode_cell( h ) < 0) - av_exit(-1); - }else{ - if (h264_decode_pthread( h ) < 0) - av_exit(1); - } - }else{ - if (ARCH_CELL && !no_arch){ - if (h264_decode_cell_seq( h ) < 0) - av_exit(1); - }else{ - if (h264_decode_seq( h ) < 0) - av_exit(1); - } - } -#endif - free_h264dec_context(h); - close(ifile); - close(ofile); - - return 0; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/aac.h --- a/ffmpeg_smp/h264dec/libavcodec/arm/aac.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2010 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARM_AAC_H -#define AVCODEC_ARM_AAC_H - -#include "config.h" - -#if HAVE_NEON && HAVE_INLINE_ASM - -#define VMUL2 VMUL2 -static inline float *VMUL2(float *dst, const float *v, unsigned idx, - const float *scale) -{ - unsigned v0, v1; - __asm__ volatile ("ubfx %0, %4, #0, #4 \n\t" - "ubfx %1, %4, #4, #4 \n\t" - "ldr %0, [%3, %0, lsl #2] \n\t" - "ldr %1, [%3, %1, lsl #2] \n\t" - "vld1.32 {d1[]}, [%5,:32] \n\t" - "vmov d0, %0, %1 \n\t" - "vmul.f32 d0, d0, d1 \n\t" - "vst1.32 {d0}, [%2,:64]! \n\t" - : "=&r"(v0), "=&r"(v1), "+r"(dst) - : "r"(v), "r"(idx), "r"(scale) - : "d0", "d1"); - return dst; -} - -#define VMUL4 VMUL4 -static inline float *VMUL4(float *dst, const float *v, unsigned idx, - const float *scale) -{ - unsigned v0, v1, v2, v3; - __asm__ volatile ("ubfx %0, %6, #0, #2 \n\t" - "ubfx %1, %6, #2, #2 \n\t" - "ldr %0, [%5, %0, lsl #2] \n\t" - "ubfx %2, %6, #4, #2 \n\t" - "ldr %1, [%5, %1, lsl #2] \n\t" - "ubfx %3, %6, #6, #2 \n\t" - "ldr %2, [%5, %2, lsl #2] \n\t" - "vmov d0, %0, %1 \n\t" - "ldr %3, [%5, %3, lsl #2] \n\t" - "vld1.32 {d2[],d3[]},[%7,:32] \n\t" - "vmov d1, %2, %3 \n\t" - "vmul.f32 q0, q0, q1 \n\t" - "vst1.32 {q0}, [%4,:128]! 
\n\t" - : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst) - : "r"(v), "r"(idx), "r"(scale) - : "d0", "d1", "d2", "d3"); - return dst; -} - -#define VMUL2S VMUL2S -static inline float *VMUL2S(float *dst, const float *v, unsigned idx, - unsigned sign, const float *scale) -{ - unsigned v0, v1, v2, v3; - __asm__ volatile ("ubfx %0, %6, #0, #4 \n\t" - "ubfx %1, %6, #4, #4 \n\t" - "ldr %0, [%5, %0, lsl #2] \n\t" - "lsl %2, %8, #30 \n\t" - "ldr %1, [%5, %1, lsl #2] \n\t" - "lsl %3, %8, #31 \n\t" - "vmov d0, %0, %1 \n\t" - "bic %2, %2, #1<<30 \n\t" - "vld1.32 {d1[]}, [%7,:32] \n\t" - "vmov d2, %2, %3 \n\t" - "veor d0, d0, d2 \n\t" - "vmul.f32 d0, d0, d1 \n\t" - "vst1.32 {d0}, [%4,:64]! \n\t" - : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst) - : "r"(v), "r"(idx), "r"(scale), "r"(sign) - : "d0", "d1", "d2"); - return dst; -} - -#define VMUL4S VMUL4S -static inline float *VMUL4S(float *dst, const float *v, unsigned idx, - unsigned sign, const float *scale) -{ - unsigned v0, v1, v2, v3, nz; - __asm__ volatile ("vld1.32 {d2[],d3[]},[%9,:32] \n\t" - "ubfx %0, %8, #0, #2 \n\t" - "ubfx %1, %8, #2, #2 \n\t" - "ldr %0, [%7, %0, lsl #2] \n\t" - "ubfx %2, %8, #4, #2 \n\t" - "ldr %1, [%7, %1, lsl #2] \n\t" - "ubfx %3, %8, #6, #2 \n\t" - "ldr %2, [%7, %2, lsl #2] \n\t" - "vmov d0, %0, %1 \n\t" - "ldr %3, [%7, %3, lsl #2] \n\t" - "lsr %6, %8, #12 \n\t" - "rbit %6, %6 \n\t" - "vmov d1, %2, %3 \n\t" - "lsls %6, %6, #1 \n\t" - "and %0, %5, #1<<31 \n\t" - "lslcs %5, %5, #1 \n\t" - "lsls %6, %6, #1 \n\t" - "and %1, %5, #1<<31 \n\t" - "lslcs %5, %5, #1 \n\t" - "lsls %6, %6, #1 \n\t" - "and %2, %5, #1<<31 \n\t" - "lslcs %5, %5, #1 \n\t" - "vmov d4, %0, %1 \n\t" - "and %3, %5, #1<<31 \n\t" - "vmov d5, %2, %3 \n\t" - "veor q0, q0, q2 \n\t" - "vmul.f32 q0, q0, q1 \n\t" - "vst1.32 {q0}, [%4,:128]! 
\n\t" - : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst), - "+r"(sign), "=r"(nz) - : "r"(v), "r"(idx), "r"(scale) - : "d0", "d1", "d2", "d3", "d4", "d5"); - return dst; -} - -#endif /* HAVE_NEON && HAVE_INLINE_ASM */ - -#endif /* AVCODEC_ARM_AAC_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/asm.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/asm.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" - -#ifdef __ELF__ -# define ELF -#else -# define ELF @ -#endif - - .macro require8, val=1 -ELF .eabi_attribute 24, \val - .endm - - .macro preserve8, val=1 -ELF .eabi_attribute 25, \val - .endm - - .macro function name, export=0 - .macro endfunc -ELF .size \name, . 
- \name - .endfunc - .purgem endfunc - .endm -.if \export - .global EXTERN_ASM\name -EXTERN_ASM\name: -.endif -ELF .type \name, %function - .func \name -\name: - .endm - - .macro movrel rd, val -#if HAVE_ARMV6T2 && !CONFIG_PIC - movw \rd, #:lower16:\val - movt \rd, #:upper16:\val -#else - ldr \rd, =\val -#endif - .endm - -#if HAVE_VFP_ARGS - .eabi_attribute 28, 1 -# define VFP -# define NOVFP @ -#else -# define VFP @ -# define NOVFP -#endif - -#define GLUE(a, b) a ## b -#define JOIN(a, b) GLUE(a, b) -#define X(s) JOIN(EXTERN_ASM, s) diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_init_arm.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_init_arm.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2010 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavcodec/dcadsp.h" - -void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs, - int decifactor, float scale, float bias); - -void av_cold ff_dcadsp_init_arm(DCADSPContext *s) -{ - if (HAVE_NEON) - s->lfe_fir = ff_dca_lfe_fir_neon; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_neon.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_neon.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2010 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - -function ff_dca_lfe_fir_neon, export=1 - push {r4-r6,lr} - - add r4, r0, r3, lsl #2 @ out2 - add r5, r2, #256*4-16 @ cf1 - sub r1, r1, #12 - cmp r3, #32 - moveq r6, #256/32 - movne r6, #256/64 -NOVFP vldr d0, [sp, #16] @ scale, bias - mov lr, #-16 -1: - vmov.f32 q2, #0.0 @ v0 - vmov.f32 q3, #0.0 @ v1 - mov r12, r6 -2: - vld1.32 {q8}, [r2,:128]! 
@ cf0 - vld1.32 {q9}, [r5,:128], lr @ cf1 - vld1.32 {q1}, [r1], lr @ in - subs r12, r12, #4 - vrev64.32 q10, q8 - vmla.f32 q3, q1, q9 - vmla.f32 d4, d2, d21 - vmla.f32 d5, d3, d20 - bne 2b - - add r1, r1, r6, lsl #2 - subs r3, r3, #1 - vadd.f32 d4, d4, d5 - vadd.f32 d6, d6, d7 - vpadd.f32 d4, d4, d6 - vdup.32 d5, d0[1] - vmla.f32 d5, d4, d0[0] - vst1.32 {d5[0]}, [r0,:32]! - vst1.32 {d5[1]}, [r4,:32]! - bne 1b - - pop {r4-r6,pc} -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,712 +0,0 @@ -@ -@ ARMv4 optimized DSP utils -@ Copyright (c) 2004 AGAWA Koji -@ -@ This file is part of FFmpeg. -@ -@ FFmpeg is free software; you can redistribute it and/or -@ modify it under the terms of the GNU Lesser General Public -@ License as published by the Free Software Foundation; either -@ version 2.1 of the License, or (at your option) any later version. -@ -@ FFmpeg is distributed in the hope that it will be useful, -@ but WITHOUT ANY WARRANTY; without even the implied warranty of -@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -@ Lesser General Public License for more details. 
-@ -@ You should have received a copy of the GNU Lesser General Public -@ License along with FFmpeg; if not, write to the Free Software -@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -@ - -#include "config.h" -#include "asm.S" - - preserve8 - -#if !HAVE_PLD -.macro pld reg -.endm -#endif - -#if HAVE_ARMV5TE -function ff_prefetch_arm, export=1 - subs r2, r2, #1 - pld [r0] - add r0, r0, r1 - bne ff_prefetch_arm - bx lr -endfunc -#endif - -.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 - mov \Rd0, \Rn0, lsr #(\shift * 8) - mov \Rd1, \Rn1, lsr #(\shift * 8) - mov \Rd2, \Rn2, lsr #(\shift * 8) - mov \Rd3, \Rn3, lsr #(\shift * 8) - orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) - orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) - orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) - orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) -.endm -.macro ALIGN_DWORD shift, R0, R1, R2 - mov \R0, \R0, lsr #(\shift * 8) - orr \R0, \R0, \R1, lsl #(32 - \shift * 8) - mov \R1, \R1, lsr #(\shift * 8) - orr \R1, \R1, \R2, lsl #(32 - \shift * 8) -.endm -.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 - mov \Rdst0, \Rsrc0, lsr #(\shift * 8) - mov \Rdst1, \Rsrc1, lsr #(\shift * 8) - orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) - orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) -.endm - -.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - orr \Rn0, \Rn0, \Rm0 - orr \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, \Rmask - sub \Rd0, \Rn0, \Rd0, lsr #1 - sub \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - and \Rn0, \Rn0, \Rm0 - and \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, 
\Rmask - add \Rd0, \Rn0, \Rd0, lsr #1 - add \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -.macro JMP_ALIGN tmp, reg - ands \tmp, \reg, #3 - bic \reg, \reg, #3 - beq 1f - subs \tmp, \tmp, #1 - beq 2f - subs \tmp, \tmp, #1 - beq 3f - b 4f -.endm - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels16_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11, lr} - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r7} - add r1, r1, r2 - stm r0, {r4-r7} - pld [r1] - subs r3, r3, #1 - add r0, r0, r2 - bne 1b - pop {r4-r11, pc} - .align 5 -2: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stm r0, {r9-r12} - add r0, r0, r2 - bne 2b - pop {r4-r11, pc} - .align 5 -3: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stm r0, {r9-r12} - add r0, r0, r2 - bne 3b - pop {r4-r11, pc} - .align 5 -4: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stm r0, {r9-r12} - add r0, r0, r2 - bne 4b - pop {r4-r11,pc} -endfunc - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels8_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r5,lr} - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 - subs r3, r3, #1 - pld [r1] - stm r0, {r4-r5} - add r0, r0, r2 - bne 1b - pop {r4-r5,pc} - .align 5 -2: - ldm r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 1, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r5,pc} - .align 5 -3: - ldm r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 2, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, 
r2 - bne 3b - pop {r4-r5,pc} - .align 5 -4: - ldm r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 3, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 4b - pop {r4-r5,pc} -endfunc - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels8_x2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r10,lr} - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 1b - pop {r4-r10,pc} - .align 5 -2: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r10,pc} - .align 5 -3: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 3b - pop {r4-r10,pc} - .align 5 -4: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 4b - pop {r4-r10,pc} -endfunc - - .align 5 -function ff_put_no_rnd_pixels8_x2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r10,lr} - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 1b - pop {r4-r10,pc} - .align 5 -2: - ldm r1, 
{r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r10,pc} - .align 5 -3: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 3b - pop {r4-r10,pc} - .align 5 -4: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 4b - pop {r4-r10,pc} -endfunc - - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels8_y2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - mov r3, r3, lsr #1 - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 -6: ldm r1, {r6-r7} - add r1, r1, r2 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldm r1, {r4-r5} - add r1, r1, r2 - stm r0, {r8-r9} - add r0, r0, r2 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -2: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -3: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - 
ALIGN_DWORD 2, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -4: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} -endfunc - - .align 5 -function ff_put_no_rnd_pixels8_y2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - mov r3, r3, lsr #1 - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 -6: ldm r1, {r6-r7} - add r1, r1, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldm r1, {r4-r5} - add r1, r1, r2 - stm r0, {r8-r9} - add r0, r0, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -2: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -3: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r7, r8, r9 - NO_RND_AVG32 r10, 
r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -4: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} -endfunc - - .ltorg - -@ ---------------------------------------------------------------- -.macro RND_XY2_IT align, rnd - @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) - @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) -.if \align == 0 - ldm r1, {r6-r8} -.elseif \align == 3 - ldm r1, {r5-r7} -.else - ldm r1, {r8-r10} -.endif - add r1, r1, r2 - pld [r1] -.if \align == 0 - ALIGN_DWORD_D 1, r4, r5, r6, r7, r8 -.elseif \align == 1 - ALIGN_DWORD_D 1, r4, r5, r8, r9, r10 - ALIGN_DWORD_D 2, r6, r7, r8, r9, r10 -.elseif \align == 2 - ALIGN_DWORD_D 2, r4, r5, r8, r9, r10 - ALIGN_DWORD_D 3, r6, r7, r8, r9, r10 -.elseif \align == 3 - ALIGN_DWORD_D 3, r4, r5, r5, r6, r7 -.endif - ldr r14, =0x03030303 - tst r3, #1 - and r8, r4, r14 - and r9, r5, r14 - and r10, r6, r14 - and r11, r7, r14 - andeq r14, r14, r14, \rnd #1 - add r8, r8, r10 - add r9, r9, r11 - ldr r12, =0xfcfcfcfc >> 2 - addeq r8, r8, r14 - addeq r9, r9, r14 - and r4, r12, r4, lsr #2 - and r5, r12, r5, lsr #2 - and r6, r12, r6, lsr #2 - and r7, r12, r7, lsr #2 - add r10, r4, r6 - add r11, r5, r7 - subs r3, r3, #1 -.endm - -.macro RND_XY2_EXPAND align, rnd - RND_XY2_IT \align, \rnd -6: push {r8-r11} - RND_XY2_IT \align, \rnd - pop {r4-r7} - add r4, r4, r8 - add r5, r5, r9 - ldr r14, =0x0f0f0f0f - 
add r6, r6, r10 - add r7, r7, r11 - and r4, r14, r4, lsr #2 - and r5, r14, r5, lsr #2 - add r4, r4, r6 - add r5, r5, r7 - stm r0, {r4-r5} - add r0, r0, r2 - bge 6b - pop {r4-r11,pc} -.endm - - .align 5 -function ff_put_pixels8_xy2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} @ R14 is also called LR - JMP_ALIGN r5, r1 -1: RND_XY2_EXPAND 0, lsl - .align 5 -2: RND_XY2_EXPAND 1, lsl - .align 5 -3: RND_XY2_EXPAND 2, lsl - .align 5 -4: RND_XY2_EXPAND 3, lsl -endfunc - - .align 5 -function ff_put_no_rnd_pixels8_xy2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - JMP_ALIGN r5, r1 -1: RND_XY2_EXPAND 0, lsr - .align 5 -2: RND_XY2_EXPAND 1, lsr - .align 5 -3: RND_XY2_EXPAND 2, lsr - .align 5 -4: RND_XY2_EXPAND 3, lsr -endfunc - - .align 5 -@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride) -function ff_add_pixels_clamped_arm, export=1 - push {r4-r10} - mov r10, #8 -1: - ldr r4, [r1] /* load dest */ - /* block[0] and block[1]*/ - ldrsh r5, [r0] - ldrsh r7, [r0, #2] - and r6, r4, #0xFF - and r8, r4, #0xFF00 - add r6, r5, r6 - add r8, r7, r8, lsr #8 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - mov r9, r6 - ldrsh r5, [r0, #4] /* moved form [A] */ - orr r9, r9, r8, lsl #8 - /* block[2] and block[3] */ - /* [A] */ - ldrsh r7, [r0, #6] - and r6, r4, #0xFF0000 - and r8, r4, #0xFF000000 - add r6, r5, r6, lsr #16 - add r8, r7, r8, lsr #24 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - orr r9, r9, r6, lsl #16 - ldr r4, [r1, #4] /* moved form [B] */ - orr r9, r9, r8, lsl #24 - /* store dest */ - ldrsh r5, [r0, #8] /* moved form [C] */ - str r9, [r1] - - /* load dest */ - /* [B] */ - /* block[4] and 
block[5] */ - /* [C] */ - ldrsh r7, [r0, #10] - and r6, r4, #0xFF - and r8, r4, #0xFF00 - add r6, r5, r6 - add r8, r7, r8, lsr #8 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - mov r9, r6 - ldrsh r5, [r0, #12] /* moved from [D] */ - orr r9, r9, r8, lsl #8 - /* block[6] and block[7] */ - /* [D] */ - ldrsh r7, [r0, #14] - and r6, r4, #0xFF0000 - and r8, r4, #0xFF000000 - add r6, r5, r6, lsr #16 - add r8, r7, r8, lsr #24 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - movne r6, r5, lsr #24 - tst r8, #0x100 - movne r8, r7, lsr #24 - orr r9, r9, r6, lsl #16 - add r0, r0, #16 /* moved from [E] */ - orr r9, r9, r8, lsl #24 - subs r10, r10, #1 /* moved from [F] */ - /* store dest */ - str r9, [r1, #4] - - /* [E] */ - /* [F] */ - add r1, r1, r2 - bne 1b - - pop {r4-r10} - bx lr -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.h --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARM_DSPUTIL_H -#define AVCODEC_ARM_DSPUTIL_H - -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" - -void ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx); -void ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx); -void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx); -void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx); -void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_armv6.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_armv6.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,623 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - - preserve8 - - .text - -.macro call_2x_pixels type, subp -function ff_\type\()_pixels16\subp\()_armv6, export=1 - push {r0-r3, lr} - bl ff_\type\()_pixels8\subp\()_armv6 - pop {r0-r3, lr} - add r0, r0, #8 - add r1, r1, #8 - b ff_\type\()_pixels8\subp\()_armv6 -endfunc -.endm - -call_2x_pixels avg -call_2x_pixels put, _x2 -call_2x_pixels put, _y2 -call_2x_pixels put, _x2_no_rnd -call_2x_pixels put, _y2_no_rnd - -function ff_put_pixels16_armv6, export=1 - push {r4-r11} -1: - ldr r5, [r1, #4] - ldr r6, [r1, #8] - ldr r7, [r1, #12] - ldr r4, [r1], r2 - strd r6, r7, [r0, #8] - ldr r9, [r1, #4] - strd r4, r5, [r0], r2 - ldr r10, [r1, #8] - ldr r11, [r1, #12] - ldr r8, [r1], r2 - strd r10, r11, [r0, #8] - subs r3, r3, #2 - strd r8, r9, [r0], r2 - bne 1b - - pop {r4-r11} - bx lr -endfunc - -function ff_put_pixels8_armv6, export=1 - push {r4-r7} -1: - ldr r5, [r1, #4] - ldr r4, [r1], r2 - ldr r7, [r1, #4] - strd r4, r5, [r0], r2 - ldr r6, [r1], r2 - subs r3, r3, #2 - strd r6, r7, [r0], r2 - bne 1b - - pop {r4-r7} - bx lr -endfunc - -function ff_put_pixels8_x2_armv6, export=1 - push {r4-r11, lr} - mov r12, #1 - orr r12, r12, r12, lsl #8 - orr r12, r12, r12, lsl #16 -1: - ldr r4, [r1] - subs r3, r3, #2 - ldr r5, [r1, #4] - ldr r7, [r1, #5] - lsr r6, r4, #8 - ldr r8, [r1, r2]! 
- orr r6, r6, r5, lsl #24 - ldr r9, [r1, #4] - ldr r11, [r1, #5] - lsr r10, r8, #8 - add r1, r1, r2 - orr r10, r10, r9, lsl #24 - eor r14, r4, r6 - uhadd8 r4, r4, r6 - eor r6, r5, r7 - uhadd8 r5, r5, r7 - and r14, r14, r12 - and r6, r6, r12 - uadd8 r4, r4, r14 - eor r14, r8, r10 - uadd8 r5, r5, r6 - eor r6, r9, r11 - uhadd8 r8, r8, r10 - and r14, r14, r12 - uhadd8 r9, r9, r11 - and r6, r6, r12 - uadd8 r8, r8, r14 - strd r4, r5, [r0], r2 - uadd8 r9, r9, r6 - strd r8, r9, [r0], r2 - bne 1b - - pop {r4-r11, pc} -endfunc - -function ff_put_pixels8_y2_armv6, export=1 - push {r4-r11} - mov r12, #1 - orr r12, r12, r12, lsl #8 - orr r12, r12, r12, lsl #16 - ldr r4, [r1] - ldr r5, [r1, #4] - ldr r6, [r1, r2]! - ldr r7, [r1, #4] -1: - subs r3, r3, #2 - uhadd8 r8, r4, r6 - eor r10, r4, r6 - uhadd8 r9, r5, r7 - eor r11, r5, r7 - and r10, r10, r12 - ldr r4, [r1, r2]! - uadd8 r8, r8, r10 - and r11, r11, r12 - uadd8 r9, r9, r11 - ldr r5, [r1, #4] - uhadd8 r10, r4, r6 - eor r6, r4, r6 - uhadd8 r11, r5, r7 - and r6, r6, r12 - eor r7, r5, r7 - uadd8 r10, r10, r6 - and r7, r7, r12 - ldr r6, [r1, r2]! - uadd8 r11, r11, r7 - strd r8, r9, [r0], r2 - ldr r7, [r1, #4] - strd r10, r11, [r0], r2 - bne 1b - - pop {r4-r11} - bx lr -endfunc - -function ff_put_pixels8_x2_no_rnd_armv6, export=1 - push {r4-r9, lr} -1: - subs r3, r3, #2 - ldr r4, [r1] - ldr r5, [r1, #4] - ldr r7, [r1, #5] - ldr r8, [r1, r2]! - ldr r9, [r1, #4] - ldr r14, [r1, #5] - add r1, r1, r2 - lsr r6, r4, #8 - orr r6, r6, r5, lsl #24 - lsr r12, r8, #8 - orr r12, r12, r9, lsl #24 - uhadd8 r4, r4, r6 - uhadd8 r5, r5, r7 - uhadd8 r8, r8, r12 - uhadd8 r9, r9, r14 - stm r0, {r4,r5} - add r0, r0, r2 - stm r0, {r8,r9} - add r0, r0, r2 - bne 1b - - pop {r4-r9, pc} -endfunc - -function ff_put_pixels8_y2_no_rnd_armv6, export=1 - push {r4-r9, lr} - ldr r4, [r1] - ldr r5, [r1, #4] - ldr r6, [r1, r2]! - ldr r7, [r1, #4] -1: - subs r3, r3, #2 - uhadd8 r8, r4, r6 - ldr r4, [r1, r2]! 
- uhadd8 r9, r5, r7 - ldr r5, [r1, #4] - uhadd8 r12, r4, r6 - ldr r6, [r1, r2]! - uhadd8 r14, r5, r7 - ldr r7, [r1, #4] - stm r0, {r8,r9} - add r0, r0, r2 - stm r0, {r12,r14} - add r0, r0, r2 - bne 1b - - pop {r4-r9, pc} -endfunc - -function ff_avg_pixels8_armv6, export=1 - pld [r1, r2] - push {r4-r10, lr} - mov lr, #1 - orr lr, lr, lr, lsl #8 - orr lr, lr, lr, lsl #16 - ldrd r4, r5, [r0] - ldr r10, [r1, #4] - ldr r9, [r1], r2 - subs r3, r3, #2 -1: - pld [r1, r2] - eor r8, r4, r9 - uhadd8 r4, r4, r9 - eor r12, r5, r10 - ldrd r6, r7, [r0, r2] - uhadd8 r5, r5, r10 - and r8, r8, lr - ldr r10, [r1, #4] - and r12, r12, lr - uadd8 r4, r4, r8 - ldr r9, [r1], r2 - eor r8, r6, r9 - uadd8 r5, r5, r12 - pld [r1, r2, lsl #1] - eor r12, r7, r10 - uhadd8 r6, r6, r9 - strd r4, r5, [r0], r2 - uhadd8 r7, r7, r10 - beq 2f - and r8, r8, lr - ldrd r4, r5, [r0, r2] - uadd8 r6, r6, r8 - ldr r10, [r1, #4] - and r12, r12, lr - subs r3, r3, #2 - uadd8 r7, r7, r12 - ldr r9, [r1], r2 - strd r6, r7, [r0], r2 - b 1b -2: - and r8, r8, lr - and r12, r12, lr - uadd8 r6, r6, r8 - uadd8 r7, r7, r12 - strd r6, r7, [r0], r2 - - pop {r4-r10, pc} -endfunc - -function ff_add_pixels_clamped_armv6, export=1 - push {r4-r8,lr} - mov r3, #8 -1: - ldm r0!, {r4,r5,r12,lr} - ldrd r6, r7, [r1] - pkhbt r8, r4, r5, lsl #16 - pkhtb r5, r5, r4, asr #16 - pkhbt r4, r12, lr, lsl #16 - pkhtb lr, lr, r12, asr #16 - pld [r1, r2] - uxtab16 r8, r8, r6 - uxtab16 r5, r5, r6, ror #8 - uxtab16 r4, r4, r7 - uxtab16 lr, lr, r7, ror #8 - usat16 r8, #8, r8 - usat16 r5, #8, r5 - usat16 r4, #8, r4 - usat16 lr, #8, lr - orr r6, r8, r5, lsl #8 - orr r7, r4, lr, lsl #8 - subs r3, r3, #1 - strd r6, r7, [r1], r2 - bgt 1b - pop {r4-r8,pc} -endfunc - -function ff_get_pixels_armv6, export=1 - pld [r1, r2] - push {r4-r8, lr} - mov lr, #8 -1: - ldrd r4, r5, [r1], r2 - subs lr, lr, #1 - uxtb16 r6, r4 - uxtb16 r4, r4, ror #8 - uxtb16 r12, r5 - uxtb16 r8, r5, ror #8 - pld [r1, r2] - pkhbt r5, r6, r4, lsl #16 - pkhtb r6, r4, r6, asr #16 - pkhbt 
r7, r12, r8, lsl #16 - pkhtb r12, r8, r12, asr #16 - stm r0!, {r5,r6,r7,r12} - bgt 1b - - pop {r4-r8, pc} -endfunc - -function ff_diff_pixels_armv6, export=1 - pld [r1, r3] - pld [r2, r3] - push {r4-r9, lr} - mov lr, #8 -1: - ldrd r4, r5, [r1], r3 - ldrd r6, r7, [r2], r3 - uxtb16 r8, r4 - uxtb16 r4, r4, ror #8 - uxtb16 r9, r6 - uxtb16 r6, r6, ror #8 - pld [r1, r3] - ssub16 r9, r8, r9 - ssub16 r6, r4, r6 - uxtb16 r8, r5 - uxtb16 r5, r5, ror #8 - pld [r2, r3] - pkhbt r4, r9, r6, lsl #16 - pkhtb r6, r6, r9, asr #16 - uxtb16 r9, r7 - uxtb16 r7, r7, ror #8 - ssub16 r9, r8, r9 - ssub16 r5, r5, r7 - subs lr, lr, #1 - pkhbt r8, r9, r5, lsl #16 - pkhtb r9, r5, r9, asr #16 - stm r0!, {r4,r6,r8,r9} - bgt 1b - - pop {r4-r9, pc} -endfunc - -function ff_pix_abs16_armv6, export=1 - ldr r0, [sp] - push {r4-r9, lr} - mov r12, #0 - mov lr, #0 - ldm r1, {r4-r7} - ldr r8, [r2] -1: - ldr r9, [r2, #4] - pld [r1, r3] - usada8 r12, r4, r8, r12 - ldr r8, [r2, #8] - pld [r2, r3] - usada8 lr, r5, r9, lr - ldr r9, [r2, #12] - usada8 r12, r6, r8, r12 - subs r0, r0, #1 - usada8 lr, r7, r9, lr - beq 2f - add r1, r1, r3 - ldm r1, {r4-r7} - add r2, r2, r3 - ldr r8, [r2] - b 1b -2: - add r0, r12, lr - pop {r4-r9, pc} -endfunc - -function ff_pix_abs16_x2_armv6, export=1 - ldr r12, [sp] - push {r4-r11, lr} - mov r0, #0 - mov lr, #1 - orr lr, lr, lr, lsl #8 - orr lr, lr, lr, lsl #16 -1: - ldr r8, [r2] - ldr r9, [r2, #4] - lsr r10, r8, #8 - ldr r4, [r1] - lsr r6, r9, #8 - orr r10, r10, r9, lsl #24 - ldr r5, [r2, #8] - eor r11, r8, r10 - uhadd8 r7, r8, r10 - orr r6, r6, r5, lsl #24 - and r11, r11, lr - uadd8 r7, r7, r11 - ldr r8, [r1, #4] - usada8 r0, r4, r7, r0 - eor r7, r9, r6 - lsr r10, r5, #8 - and r7, r7, lr - uhadd8 r4, r9, r6 - ldr r6, [r2, #12] - uadd8 r4, r4, r7 - pld [r1, r3] - orr r10, r10, r6, lsl #24 - usada8 r0, r8, r4, r0 - ldr r4, [r1, #8] - eor r11, r5, r10 - ldrb r7, [r2, #16] - and r11, r11, lr - uhadd8 r8, r5, r10 - ldr r5, [r1, #12] - uadd8 r8, r8, r11 - pld [r2, r3] - lsr r10, r6, 
#8 - usada8 r0, r4, r8, r0 - orr r10, r10, r7, lsl #24 - subs r12, r12, #1 - eor r11, r6, r10 - add r1, r1, r3 - uhadd8 r9, r6, r10 - and r11, r11, lr - uadd8 r9, r9, r11 - add r2, r2, r3 - usada8 r0, r5, r9, r0 - bgt 1b - - pop {r4-r11, pc} -endfunc - -.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3 - ldr \n0, [r2] - eor \n1, \p0, \n0 - uhadd8 \p0, \p0, \n0 - and \n1, \n1, lr - ldr \n2, [r1] - uadd8 \p0, \p0, \n1 - ldr \n1, [r2, #4] - usada8 r0, \p0, \n2, r0 - pld [r1, r3] - eor \n3, \p1, \n1 - uhadd8 \p1, \p1, \n1 - and \n3, \n3, lr - ldr \p0, [r1, #4] - uadd8 \p1, \p1, \n3 - ldr \n2, [r2, #8] - usada8 r0, \p1, \p0, r0 - pld [r2, r3] - eor \p0, \p2, \n2 - uhadd8 \p2, \p2, \n2 - and \p0, \p0, lr - ldr \p1, [r1, #8] - uadd8 \p2, \p2, \p0 - ldr \n3, [r2, #12] - usada8 r0, \p2, \p1, r0 - eor \p1, \p3, \n3 - uhadd8 \p3, \p3, \n3 - and \p1, \p1, lr - ldr \p0, [r1, #12] - uadd8 \p3, \p3, \p1 - add r1, r1, r3 - usada8 r0, \p3, \p0, r0 - add r2, r2, r3 -.endm - -function ff_pix_abs16_y2_armv6, export=1 - pld [r1] - pld [r2] - ldr r12, [sp] - push {r4-r11, lr} - mov r0, #0 - mov lr, #1 - orr lr, lr, lr, lsl #8 - orr lr, lr, lr, lsl #16 - ldr r4, [r2] - ldr r5, [r2, #4] - ldr r6, [r2, #8] - ldr r7, [r2, #12] - add r2, r2, r3 -1: - usad_y2 r4, r5, r6, r7, r8, r9, r10, r11 - subs r12, r12, #2 - usad_y2 r8, r9, r10, r11, r4, r5, r6, r7 - bgt 1b - - pop {r4-r11, pc} -endfunc - -function ff_pix_abs8_armv6, export=1 - pld [r2, r3] - ldr r12, [sp] - push {r4-r9, lr} - mov r0, #0 - mov lr, #0 - ldrd r4, r5, [r1], r3 -1: - subs r12, r12, #2 - ldr r7, [r2, #4] - ldr r6, [r2], r3 - ldrd r8, r9, [r1], r3 - usada8 r0, r4, r6, r0 - pld [r2, r3] - usada8 lr, r5, r7, lr - ldr r7, [r2, #4] - ldr r6, [r2], r3 - beq 2f - ldrd r4, r5, [r1], r3 - usada8 r0, r8, r6, r0 - pld [r2, r3] - usada8 lr, r9, r7, lr - b 1b -2: - usada8 r0, r8, r6, r0 - usada8 lr, r9, r7, lr - add r0, r0, lr - pop {r4-r9, pc} -endfunc - -function ff_sse16_armv6, export=1 - ldr r12, [sp] - push {r4-r9, lr} - mov r0, #0 -1: - 
ldrd r4, r5, [r1] - ldr r8, [r2] - uxtb16 lr, r4 - uxtb16 r4, r4, ror #8 - uxtb16 r9, r8 - uxtb16 r8, r8, ror #8 - ldr r7, [r2, #4] - usub16 lr, lr, r9 - usub16 r4, r4, r8 - smlad r0, lr, lr, r0 - uxtb16 r6, r5 - uxtb16 lr, r5, ror #8 - uxtb16 r8, r7 - uxtb16 r9, r7, ror #8 - smlad r0, r4, r4, r0 - ldrd r4, r5, [r1, #8] - usub16 r6, r6, r8 - usub16 r8, lr, r9 - ldr r7, [r2, #8] - smlad r0, r6, r6, r0 - uxtb16 lr, r4 - uxtb16 r4, r4, ror #8 - uxtb16 r9, r7 - uxtb16 r7, r7, ror #8 - smlad r0, r8, r8, r0 - ldr r8, [r2, #12] - usub16 lr, lr, r9 - usub16 r4, r4, r7 - smlad r0, lr, lr, r0 - uxtb16 r6, r5 - uxtb16 r5, r5, ror #8 - uxtb16 r9, r8 - uxtb16 r8, r8, ror #8 - smlad r0, r4, r4, r0 - usub16 r6, r6, r9 - usub16 r5, r5, r8 - smlad r0, r6, r6, r0 - add r1, r1, r3 - add r2, r2, r3 - subs r12, r12, #1 - smlad r0, r5, r5, r0 - bgt 1b - - pop {r4-r9, pc} -endfunc - -function ff_pix_norm1_armv6, export=1 - push {r4-r6, lr} - mov r12, #16 - mov lr, #0 -1: - ldm r0, {r2-r5} - uxtb16 r6, r2 - uxtb16 r2, r2, ror #8 - smlad lr, r6, r6, lr - uxtb16 r6, r3 - smlad lr, r2, r2, lr - uxtb16 r3, r3, ror #8 - smlad lr, r6, r6, lr - uxtb16 r6, r4 - smlad lr, r3, r3, lr - uxtb16 r4, r4, ror #8 - smlad lr, r6, r6, lr - uxtb16 r6, r5 - smlad lr, r4, r4, lr - uxtb16 r5, r5, ror #8 - smlad lr, r6, r6, lr - subs r12, r12, #1 - add r0, r0, r1 - smlad lr, r5, r5, lr - bgt 1b - - mov r0, lr - pop {r4-r6, pc} -endfunc - -function ff_pix_sum_armv6, export=1 - push {r4-r7, lr} - mov r12, #16 - mov r2, #0 - mov r3, #0 - mov lr, #0 - ldr r4, [r0] -1: - subs r12, r12, #1 - ldr r5, [r0, #4] - usada8 r2, r4, lr, r2 - ldr r6, [r0, #8] - usada8 r3, r5, lr, r3 - ldr r7, [r0, #12] - usada8 r2, r6, lr, r2 - beq 2f - ldr r4, [r0, r1]! 
- usada8 r3, r7, lr, r3 - bgt 1b -2: - usada8 r3, r7, lr, r3 - add r0, r2, r3 - pop {r4-r7, pc} -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_arm.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_arm.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ -/* - * ARM optimized DSP utils - * Copyright (c) 2001 Lionel Ulmer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" -#include "dsputil_arm.h" - -void ff_j_rev_dct_arm(DCTELEM *data); -void ff_simple_idct_arm(DCTELEM *data); - -/* XXX: local hack */ -static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); -static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); - -void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, 
int h); -void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); -void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8) -CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8) -CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8) -CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8) -CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8) -CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8) - -void ff_add_pixels_clamped_arm(const DCTELEM *block, uint8_t *dest, - int line_size); - -/* XXX: those functions should be suppressed ASAP when all IDCTs are - converted */ -static void j_rev_dct_arm_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_j_rev_dct_arm (block); - ff_put_pixels_clamped(block, dest, line_size); -} -static void j_rev_dct_arm_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_j_rev_dct_arm (block); - ff_add_pixels_clamped(block, dest, line_size); -} -static void simple_idct_arm_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_simple_idct_arm (block); - ff_put_pixels_clamped(block, dest, line_size); -} -static void simple_idct_arm_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_simple_idct_arm (block); - ff_add_pixels_clamped(block, dest, line_size); -} - -int mm_support(void) -{ - return HAVE_IWMMXT * FF_MM_IWMMXT; -} - -void dsputil_init_arm(DSPContext* c) -{ - ff_put_pixels_clamped = c->put_pixels_clamped; - ff_add_pixels_clamped = c->add_pixels_clamped; - - c->idct_put = simple_idct_arm_put; - c->idct_add = simple_idct_arm_add; - c->idct = ff_simple_idct_arm; - c->idct_permutation_type = FF_NO_IDCT_PERM; - - c->add_pixels_clamped = ff_add_pixels_clamped_arm; - - 
c->put_pixels_tab[0][0] = ff_put_pixels16_arm; - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm; - c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm; - c->put_pixels_tab[1][0] = ff_put_pixels8_arm; - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm; - c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm; - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm; - c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm; - c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm; - c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm; - c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm; - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm; - c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm; - - if (HAVE_NEON) ff_dsputil_init_neon(c); -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv5te.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv5te.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" -#include "dsputil_arm.h" - -void ff_simple_idct_armv5te(DCTELEM *data); -void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data); -void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data); - -void ff_prefetch_arm(void *mem, int stride, int h); - -void av_cold ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx) -{ - if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO || - avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) { - c->idct_put = ff_simple_idct_put_armv5te; - c->idct_add = ff_simple_idct_add_armv5te; - c->idct = ff_simple_idct_armv5te; - c->idct_permutation_type = FF_NO_IDCT_PERM; - } - - c->prefetch = ff_prefetch_arm; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv6.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv6.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "dsputil_arm.h" - -void ff_simple_idct_armv6(DCTELEM *data); -void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); -void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); - -void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, int, int); - -void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); - -void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, int, int); - -void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, int, int); - -void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); - -void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, int, int); - -void ff_add_pixels_clamped_armv6(const DCTELEM *block, - uint8_t *restrict pixels, - int line_size); - -void ff_get_pixels_armv6(DCTELEM *block, const uint8_t *pixels, int stride); -void ff_diff_pixels_armv6(DCTELEM *block, const uint8_t *s1, - const uint8_t *s2, int stride); - -int ff_pix_abs16_armv6(void *s, uint8_t *blk1, uint8_t *blk2, - int line_size, int h); -int ff_pix_abs16_x2_armv6(void *s, uint8_t *blk1, uint8_t *blk2, - int line_size, int h); -int ff_pix_abs16_y2_armv6(void *s, uint8_t *blk1, uint8_t *blk2, - int line_size, int h); - -int ff_pix_abs8_armv6(void *s, uint8_t *blk1, 
uint8_t *blk2, - int line_size, int h); - -int ff_sse16_armv6(void *s, uint8_t *blk1, uint8_t *blk2, - int line_size, int h); - -int ff_pix_norm1_armv6(uint8_t *pix, int line_size); -int ff_pix_sum_armv6(uint8_t *pix, int line_size); - -void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx) -{ - if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO || - avctx->idct_algo == FF_IDCT_SIMPLEARMV6)) { - c->idct_put = ff_simple_idct_put_armv6; - c->idct_add = ff_simple_idct_add_armv6; - c->idct = ff_simple_idct_armv6; - c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; - } - - c->put_pixels_tab[0][0] = ff_put_pixels16_armv6; - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6; -/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */ - c->put_pixels_tab[1][0] = ff_put_pixels8_armv6; - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6; -/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */ - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6; - c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6; - c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6; -/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */ - c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6; - c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6; - c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6; -/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */ - - c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6; - c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6; - - c->add_pixels_clamped = ff_add_pixels_clamped_armv6; - c->get_pixels = ff_get_pixels_armv6; - c->diff_pixels = ff_diff_pixels_armv6; - - c->pix_abs[0][0] = ff_pix_abs16_armv6; - c->pix_abs[0][1] = ff_pix_abs16_x2_armv6; - c->pix_abs[0][2] = ff_pix_abs16_y2_armv6; - - c->pix_abs[1][0] = ff_pix_abs8_armv6; - - c->sad[0] = 
ff_pix_abs16_armv6; - c->sad[1] = ff_pix_abs8_armv6; - - c->sse[0] = ff_sse16_armv6; - - c->pix_norm1 = ff_pix_norm1_armv6; - c->pix_sum = ff_pix_sum_armv6; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_neon.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_neon.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,308 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "dsputil_arm.h" - -void ff_simple_idct_neon(DCTELEM *data); -void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); -void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); - -void ff_vp3_idct_neon(DCTELEM *data); -void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); -void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); -void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data); - -void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); -void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); - -void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int); -void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, int, int); - -void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); 
-void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); -void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); - -void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int); - -void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int); -void 
ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int); -void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int); - -void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int); - -void ff_avg_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int); 
-void ff_avg_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int); -void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int); - -void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); - -void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); - -void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); -void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); - -void ff_vector_fmul_neon(float *dst, const float *src, int len); -void ff_vector_fmul_window_neon(float *dst, const float *src0, - const float *src1, const float *win, - float add_bias, int len); -void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, - int len); -void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src, - const float **vp, float mul, int len); -void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src, - const float **vp, float mul, int len); -void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul, - int len); -void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, - int len); -void ff_butterflies_float_neon(float *v1, float *v2, int len); -float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); -void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, - float mul, int len); -void ff_vector_fmul_reverse_neon(float *dst, const float *src0, - const float *src1, int len); -void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, - const float *src2, int len); - -void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, - int len); -void 
ff_float_to_int16_neon(int16_t *, const float *, long); -void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); - -void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); - -int32_t ff_scalarproduct_int16_neon(int16_t *v1, int16_t *v2, int len, - int shift); -int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, int16_t *v2, - int16_t *v3, int len, int mul); - -void ff_dsputil_init_neon(DSPContext *c) -{ - - { - c->idct_put = ff_simple_idct_put_neon; - c->idct_add = ff_simple_idct_add_neon; - c->idct = ff_simple_idct_neon; - c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; - - } - - c->put_pixels_tab[0][0] = ff_put_pixels16_neon; - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; - c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; - c->put_pixels_tab[1][0] = ff_put_pixels8_neon; - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon; - c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; - c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; - c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; - c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; - c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; - - c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; - c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon; - - c->add_pixels_clamped = ff_add_pixels_clamped_neon; - c->put_pixels_clamped = ff_put_pixels_clamped_neon; - c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; - - - c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; - c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; - 
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon; - - c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; - c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; - c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon; - - c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; - c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon; - c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon; - c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon; - c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon; - c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon; - c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon; - c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon; - c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon; - c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon; - c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon; - c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon; - c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon; - c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon; - c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon; - c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon; - - c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; - c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; - c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; - c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; - c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; - c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; - c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon; - c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon; - c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon; - 
c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon; - c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon; - c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon; - c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon; - c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon; - c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon; - c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon; - - c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; - c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon; - c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon; - c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon; - c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon; - c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon; - c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon; - c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon; - c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon; - c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon; - c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon; - c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon; - c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon; - c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon; - c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon; - c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon; - - c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon; - c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon; - c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon; - c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon; - c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon; - c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon; - 
c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon; - c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon; - c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon; - c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon; - c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon; - c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon; - c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon; - c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon; - c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon; - c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon; - - c->vector_fmul = ff_vector_fmul_neon; - c->vector_fmul_window = ff_vector_fmul_window_neon; - c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; - c->butterflies_float = ff_butterflies_float_neon; - c->scalarproduct_float = ff_scalarproduct_float_neon; - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; - c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; - c->vector_fmul_add = ff_vector_fmul_add_neon; - c->vector_clipf = ff_vector_clipf_neon; - - c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon; - c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon; - - c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; - c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; - - - c->float_to_int16 = ff_float_to_int16_neon; - c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; - - c->scalarproduct_int16 = ff_scalarproduct_int16_neon; - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_vfp.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_vfp.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2008 Siarhei Siamashka - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" -#include "dsputil_arm.h" - -void ff_vector_fmul_vfp(float *dst, const float *src, int len); -void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, - const float *src1, int len); -void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); - -void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) -{ - c->vector_fmul = ff_vector_fmul_vfp; - c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; -#if HAVE_ARMV6 - c->float_to_int16 = ff_float_to_int16_vfp; -#endif -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,205 +0,0 @@ -/* - * iWMMXt optimized DSP utils - * Copyright (c) 2004 AGAWA Koji - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" - -#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt -#define SET_RND(regd) __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12"); -#define WAVG2B "wavg2b" -#include "dsputil_iwmmxt_rnd_template.c" -#undef DEF -#undef SET_RND -#undef WAVG2B - -#define DEF(x, y) x ## _ ## y ##_iwmmxt -#define SET_RND(regd) __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12"); -#define WAVG2B "wavg2br" -#include "dsputil_iwmmxt_rnd_template.c" -#undef DEF -#undef SET_RND -#undef WAVG2BR - -// need scheduling -#define OP(AVG) \ - __asm__ volatile ( \ - /* alignment */ \ - "and r12, %[pixels], #7 \n\t" \ - "bic %[pixels], %[pixels], #7 \n\t" \ - "tmcr wcgr1, r12 \n\t" \ - \ - "wldrd wr0, [%[pixels]] \n\t" \ - "wldrd wr1, [%[pixels], #8] \n\t" \ - "add %[pixels], %[pixels], %[line_size] \n\t" \ - "walignr1 wr4, wr0, wr1 \n\t" \ - \ - "1: \n\t" \ - \ - "wldrd wr2, [%[pixels]] \n\t" \ - "wldrd wr3, [%[pixels], #8] \n\t" \ - "add %[pixels], %[pixels], %[line_size] \n\t" \ - "pld [%[pixels]] \n\t" \ - "walignr1 wr5, wr2, wr3 \n\t" \ - AVG " wr6, wr4, wr5 \n\t" \ - "wstrd wr6, [%[block]] \n\t" \ - "add %[block], %[block], %[line_size] \n\t" \ - \ - "wldrd wr0, [%[pixels]] \n\t" \ - "wldrd wr1, [%[pixels], #8] \n\t" \ - "add %[pixels], %[pixels], %[line_size] \n\t" \ - "walignr1 wr4, wr0, wr1 \n\t" \ - "pld [%[pixels]] \n\t" \ - AVG " wr6, wr4, wr5 \n\t" \ - "wstrd wr6, [%[block]] \n\t" \ - "add %[block], %[block], %[line_size] \n\t" \ - \ - "subs %[h], %[h], #2 
\n\t" \ - "bne 1b \n\t" \ - : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \ - : [line_size]"r"(line_size) \ - : "memory", "r12"); -void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - OP("wavg2br"); -} -void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - OP("wavg2b"); -} -#undef OP - -void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size) -{ - uint8_t *pixels2 = pixels + line_size; - - __asm__ volatile ( - "mov r12, #4 \n\t" - "1: \n\t" - "pld [%[pixels], %[line_size2]] \n\t" - "pld [%[pixels2], %[line_size2]] \n\t" - "wldrd wr4, [%[pixels]] \n\t" - "wldrd wr5, [%[pixels2]] \n\t" - "pld [%[block], #32] \n\t" - "wunpckelub wr6, wr4 \n\t" - "wldrd wr0, [%[block]] \n\t" - "wunpckehub wr7, wr4 \n\t" - "wldrd wr1, [%[block], #8] \n\t" - "wunpckelub wr8, wr5 \n\t" - "wldrd wr2, [%[block], #16] \n\t" - "wunpckehub wr9, wr5 \n\t" - "wldrd wr3, [%[block], #24] \n\t" - "add %[block], %[block], #32 \n\t" - "waddhss wr10, wr0, wr6 \n\t" - "waddhss wr11, wr1, wr7 \n\t" - "waddhss wr12, wr2, wr8 \n\t" - "waddhss wr13, wr3, wr9 \n\t" - "wpackhus wr14, wr10, wr11 \n\t" - "wpackhus wr15, wr12, wr13 \n\t" - "wstrd wr14, [%[pixels]] \n\t" - "add %[pixels], %[pixels], %[line_size2] \n\t" - "subs r12, r12, #1 \n\t" - "wstrd wr15, [%[pixels2]] \n\t" - "add %[pixels2], %[pixels2], %[line_size2] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2) - : [line_size2]"r"(line_size << 1) - : "cc", "memory", "r12"); -} - -static void clear_blocks_iwmmxt(DCTELEM *blocks) -{ - __asm__ volatile( - "wzero wr0 \n\t" - "mov r1, #(128 * 6 / 32) \n\t" - "1: \n\t" - "wstrd wr0, [%0] \n\t" - "wstrd wr0, [%0, #8] \n\t" - "wstrd wr0, [%0, #16] \n\t" - "wstrd wr0, [%0, #24] \n\t" - "subs r1, r1, #1 \n\t" - "add %0, %0, #32 \n\t" - "bne 1b \n\t" - : "+r"(blocks) - : - : "r1" - ); -} - -static void nop(uint8_t *block, const uint8_t 
*pixels, int line_size, int h) -{ - return; -} - -/* A run time test is not simple. If this file is compiled in - * then we should install the functions - */ -int mm_flags = FF_MM_IWMMXT; /* multimedia extension flags */ - -void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) -{ - if (avctx->dsp_mask) { - if (avctx->dsp_mask & FF_MM_FORCE) - mm_flags |= (avctx->dsp_mask & 0xffff); - else - mm_flags &= ~(avctx->dsp_mask & 0xffff); - } - - if (!(mm_flags & FF_MM_IWMMXT)) return; - - c->add_pixels_clamped = add_pixels_clamped_iwmmxt; - - c->clear_blocks = clear_blocks_iwmmxt; - - c->put_pixels_tab[0][0] = put_pixels16_iwmmxt; - c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt; - c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt; - c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt; - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt; - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt; - - c->put_pixels_tab[1][0] = put_pixels8_iwmmxt; - c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt; - c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt; - c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt; - c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt; - - c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt; - c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt; - c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt; - c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt; - c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt; - c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt; - c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt; - - 
c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt; - c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt; - c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt; - c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt; - c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt; - c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt; - c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt_rnd_template.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt_rnd_template.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1114 +0,0 @@ -/* - * iWMMXt optimized DSP utils - * copyright (c) 2004 AGAWA Koji - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ volatile ( - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size] \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr4, [r4, #8] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ volatile ( - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size] \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr4, [r4, #8] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "wldrd wr0, [%[block]] 
\n\t" - "wldrd wr2, [r5] \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - WAVG2B" wr8, wr8, wr0 \n\t" - WAVG2B" wr10, wr10, wr2 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ volatile ( - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size] \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr2, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "wldrd wr4, [r4, #8] \n\t" - "walignr1 wr9, wr1, wr2 \n\t" - "wldrd wr5, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - "wstrd wr8, [%[block]] \n\t" - "walignr1 wr11, wr4, wr5 \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "wstrd wr11, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ volatile ( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "pld 
[%[block]] \n\t" - "pld [%[block], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr2, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "wldrd wr4, [r4, #8] \n\t" - "walignr1 wr9, wr1, wr2 \n\t" - "wldrd wr5, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "wldrd wr0, [%[block]] \n\t" - "pld [r4] \n\t" - "wldrd wr1, [%[block], #8] \n\t" - "pld [r4, #32] \n\t" - "wldrd wr2, [r5] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - "wldrd wr3, [r5, #8] \n\t" - WAVG2B" wr8, wr8, wr0 \n\t" - WAVG2B" wr9, wr9, wr1 \n\t" - WAVG2B" wr10, wr10, wr2 \n\t" - "wstrd wr8, [%[block]] \n\t" - "walignr1 wr11, wr4, wr5 \n\t" - WAVG2B" wr11, wr11, wr3 \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "wstrd wr11, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, 
r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr2, wr13, wr14 \n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr6, wr14 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - "wstrd wr0, [%[block]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr2, [r5] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr15, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, 
wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - "walignr1 wr2, wr13, wr14 \n\t" - "walignr1 wr3, wr14, wr15 \n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr5, wr12 \n\t" - "wmoveq wr6, wr14 \n\t" - "wmoveq wr7, wr15 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "walignr2ne wr5, wr11, wr12 \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - "walignr2ne wr7, wr14, wr15 \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - WAVG2B" wr1, wr1, wr5 \n\t" - "wstrd wr0, [%[block]] \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - "wstrd wr1, [%[block], #8] \n\t" - WAVG2B" wr3, wr3, wr7 \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr2, [r5] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr3, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr2, wr13, wr14 
\n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr6, wr14 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "wldrd wr10, [%[block]] \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - "wldrd wr12, [r5] \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - WAVG2B" wr0, wr0, wr10 \n\t" - WAVG2B" wr2, wr2, wr12 \n\t" - "wstrd wr0, [%[block]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr2, [r5] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr15, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - "walignr1 wr2, wr13, wr14 \n\t" - "walignr1 wr3, wr14, wr15 \n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr5, 
wr12 \n\t" - "wmoveq wr6, wr14 \n\t" - "wmoveq wr7, wr15 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "walignr2ne wr5, wr11, wr12 \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - "walignr2ne wr7, wr14, wr15 \n\t" - "wldrd wr10, [%[block]] \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - "wldrd wr11, [%[block], #8] \n\t" - WAVG2B" wr1, wr1, wr5 \n\t" - "wldrd wr12, [r5] \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - "wldrd wr13, [r5, #8] \n\t" - WAVG2B" wr3, wr3, wr7 \n\t" - WAVG2B" wr0, wr0, wr10 \n\t" - WAVG2B" wr1, wr1, wr11 \n\t" - WAVG2B" wr2, wr2, wr12 \n\t" - WAVG2B" wr3, wr3, wr13 \n\t" - "wstrd wr0, [%[block]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr1, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr2, [r5] \n\t" - "pld [%[block]] \n\t" - "wstrd wr3, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [%[block], #32] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - :"r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr4, wr10, wr11 \n\t" - "wldrd wr10, [%[block]] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add 
%[block], %[block], %[line_size] \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "wldrd wr10, [%[block]] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "pld [%[block]] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "cc", "memory", "r12"); -} - -void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr4, wr10, wr11 \n\t" - "walignr1 wr5, wr11, wr12 \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" 
- "walignr1 wr1, wr11, wr12 \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - __asm__ volatile( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr4, wr10, wr11 \n\t" - "walignr1 wr5, wr11, wr12 \n\t" - "wldrd wr10, [%[block]] \n\t" - "wldrd wr11, [%[block], #8] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - WAVG2B" wr9, wr9, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - "wldrd wr10, [%[block]] \n\t" - "wldrd 
wr11, [%[block], #8] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - WAVG2B" wr9, wr9, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "pld [%[block]] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "add r12, r12, #1 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "tmcr wcgr2, r12 \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "cmp r12, #8 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr4, 
wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} - -void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - /* alignment */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "tmcr wcgr2, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - 
"wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr7, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr6, wr7 \n\t" - "wunpckehub wr7, wr7 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr4, wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr6, wr6, wr10 \n\t" - "waddhus wr7, wr7, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, 
wr10, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, wr10, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} - -void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "pld 
[%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "add r12, r12, #1 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "tmcr wcgr2, r12 \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "cmp r12, #8 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr4, wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wldrd wr12, [%[pixels]] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" 
- "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "subs %[h], %[h], #2 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} - -void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "pld [%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - /* alignment */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "tmcr wcgr2, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub 
wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr7, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr6, wr7 \n\t" - "wunpckehub wr7, wr7 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr4, wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr6, wr6, wr10 \n\t" - "waddhus wr7, wr7, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wldrd wr13, [%[block], #8] \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, wr10, wr11 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - WAVG2B" wr9, wr9, wr13 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "pld [%[block]] \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "pld [%[block], #32] \n\t" - 
"wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wldrd wr13, [%[block], #8] \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, wr10, wr11 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - WAVG2B" wr9, wr9, wr13 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "subs %[h], %[h], #2 \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_neon.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_neon.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1146 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "asm.S" - - preserve8 - .text - - .macro pixels16 avg=0 -.if \avg - mov ip, r0 -.endif -1: vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d2, d3}, [r1], r2 - vld1.64 {d4, d5}, [r1], r2 - pld [r1, r2, lsl #2] - vld1.64 {d6, d7}, [r1], r2 - pld [r1] - pld [r1, r2] - pld [r1, r2, lsl #1] -.if \avg - vld1.64 {d16,d17}, [ip,:128], r2 - vrhadd.u8 q0, q0, q8 - vld1.64 {d18,d19}, [ip,:128], r2 - vrhadd.u8 q1, q1, q9 - vld1.64 {d20,d21}, [ip,:128], r2 - vrhadd.u8 q2, q2, q10 - vld1.64 {d22,d23}, [ip,:128], r2 - vrhadd.u8 q3, q3, q11 -.endif - subs r3, r3, #4 - vst1.64 {d0, d1}, [r0,:128], r2 - vst1.64 {d2, d3}, [r0,:128], r2 - vst1.64 {d4, d5}, [r0,:128], r2 - vst1.64 {d6, d7}, [r0,:128], r2 - bne 1b - bx lr - .endm - - .macro pixels16_x2 vhadd=vrhadd.u8 -1: vld1.64 {d0-d2}, [r1], r2 - vld1.64 {d4-d6}, [r1], r2 - pld [r1] - pld [r1, r2] - subs r3, r3, #2 - vext.8 q1, q0, q1, #1 - \vhadd q0, q0, q1 - vext.8 q3, q2, q3, #1 - \vhadd q2, q2, q3 - vst1.64 {d0, d1}, [r0,:128], r2 - vst1.64 {d4, d5}, [r0,:128], r2 - bne 1b - bx lr - .endm - - .macro pixels16_y2 vhadd=vrhadd.u8 - vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d2, d3}, [r1], r2 -1: subs r3, r3, #2 - \vhadd q2, q0, q1 - vld1.64 {d0, d1}, [r1], r2 - \vhadd q3, q0, q1 - 
vld1.64 {d2, d3}, [r1], r2 - pld [r1] - pld [r1, r2] - vst1.64 {d4, d5}, [r0,:128], r2 - vst1.64 {d6, d7}, [r0,:128], r2 - bne 1b - bx lr - .endm - - .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 - vld1.64 {d0-d2}, [r1], r2 - vld1.64 {d4-d6}, [r1], r2 -.if \no_rnd - vmov.i16 q13, #1 -.endif - pld [r1] - pld [r1, r2] - vext.8 q1, q0, q1, #1 - vext.8 q3, q2, q3, #1 - vaddl.u8 q8, d0, d2 - vaddl.u8 q10, d1, d3 - vaddl.u8 q9, d4, d6 - vaddl.u8 q11, d5, d7 -1: subs r3, r3, #2 - vld1.64 {d0-d2}, [r1], r2 - vadd.u16 q12, q8, q9 - pld [r1] -.if \no_rnd - vadd.u16 q12, q12, q13 -.endif - vext.8 q15, q0, q1, #1 - vadd.u16 q1 , q10, q11 - \vshrn d28, q12, #2 -.if \no_rnd - vadd.u16 q1, q1, q13 -.endif - \vshrn d29, q1, #2 - vaddl.u8 q8, d0, d30 - vld1.64 {d2-d4}, [r1], r2 - vaddl.u8 q10, d1, d31 - vst1.64 {d28,d29}, [r0,:128], r2 - vadd.u16 q12, q8, q9 - pld [r1, r2] -.if \no_rnd - vadd.u16 q12, q12, q13 -.endif - vext.8 q2, q1, q2, #1 - vadd.u16 q0, q10, q11 - \vshrn d30, q12, #2 -.if \no_rnd - vadd.u16 q0, q0, q13 -.endif - \vshrn d31, q0, #2 - vaddl.u8 q9, d2, d4 - vaddl.u8 q11, d3, d5 - vst1.64 {d30,d31}, [r0,:128], r2 - bgt 1b - bx lr - .endm - - .macro pixels8 avg=0 -1: vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - pld [r1, r2, lsl #2] - vld1.64 {d3}, [r1], r2 - pld [r1] - pld [r1, r2] - pld [r1, r2, lsl #1] -.if \avg - vld1.64 {d4}, [r0,:64], r2 - vrhadd.u8 d0, d0, d4 - vld1.64 {d5}, [r0,:64], r2 - vrhadd.u8 d1, d1, d5 - vld1.64 {d6}, [r0,:64], r2 - vrhadd.u8 d2, d2, d6 - vld1.64 {d7}, [r0,:64], r2 - vrhadd.u8 d3, d3, d7 - sub r0, r0, r2, lsl #2 -.endif - subs r3, r3, #4 - vst1.64 {d0}, [r0,:64], r2 - vst1.64 {d1}, [r0,:64], r2 - vst1.64 {d2}, [r0,:64], r2 - vst1.64 {d3}, [r0,:64], r2 - bne 1b - bx lr - .endm - - .macro pixels8_x2 vhadd=vrhadd.u8 -1: vld1.64 {d0, d1}, [r1], r2 - vext.8 d1, d0, d1, #1 - vld1.64 {d2, d3}, [r1], r2 - vext.8 d3, d2, d3, #1 - pld [r1] - pld [r1, r2] - subs r3, r3, #2 - vswp d1, d2 - \vhadd q0, q0, q1 - vst1.64 
{d0}, [r0,:64], r2 - vst1.64 {d1}, [r0,:64], r2 - bne 1b - bx lr - .endm - - .macro pixels8_y2 vhadd=vrhadd.u8 - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 -1: subs r3, r3, #2 - \vhadd d4, d0, d1 - vld1.64 {d0}, [r1], r2 - \vhadd d5, d0, d1 - vld1.64 {d1}, [r1], r2 - pld [r1] - pld [r1, r2] - vst1.64 {d4}, [r0,:64], r2 - vst1.64 {d5}, [r0,:64], r2 - bne 1b - bx lr - .endm - - .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 - vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d2, d3}, [r1], r2 -.if \no_rnd - vmov.i16 q11, #1 -.endif - pld [r1] - pld [r1, r2] - vext.8 d4, d0, d1, #1 - vext.8 d6, d2, d3, #1 - vaddl.u8 q8, d0, d4 - vaddl.u8 q9, d2, d6 -1: subs r3, r3, #2 - vld1.64 {d0, d1}, [r1], r2 - pld [r1] - vadd.u16 q10, q8, q9 - vext.8 d4, d0, d1, #1 -.if \no_rnd - vadd.u16 q10, q10, q11 -.endif - vaddl.u8 q8, d0, d4 - \vshrn d5, q10, #2 - vld1.64 {d2, d3}, [r1], r2 - vadd.u16 q10, q8, q9 - pld [r1, r2] -.if \no_rnd - vadd.u16 q10, q10, q11 -.endif - vst1.64 {d5}, [r0,:64], r2 - \vshrn d7, q10, #2 - vext.8 d6, d2, d3, #1 - vaddl.u8 q9, d2, d6 - vst1.64 {d7}, [r0,:64], r2 - bgt 1b - bx lr - .endm - - .macro pixfunc pfx name suf rnd_op args:vararg -function ff_\pfx\name\suf\()_neon, export=1 - \name \rnd_op \args -endfunc - .endm - - .macro pixfunc2 pfx name args:vararg - pixfunc \pfx \name - pixfunc \pfx \name \args - .endm - -function ff_put_h264_qpel16_mc00_neon, export=1 - mov r3, #16 -endfunc - - pixfunc put_ pixels16 - pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8 - pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8 - pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 - -function ff_avg_h264_qpel16_mc00_neon, export=1 - mov r3, #16 -endfunc - - pixfunc avg_ pixels16,, 1 - -function ff_put_h264_qpel8_mc00_neon, export=1 - mov r3, #8 -endfunc - - pixfunc put_ pixels8 - pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 - pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 - pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 - -function ff_avg_h264_qpel8_mc00_neon, export=1 - mov r3, #8 -endfunc - 
- pixfunc avg_ pixels8,, 1 - -function ff_put_pixels_clamped_neon, export=1 - vld1.64 {d16-d19}, [r0,:128]! - vqmovun.s16 d0, q8 - vld1.64 {d20-d23}, [r0,:128]! - vqmovun.s16 d1, q9 - vld1.64 {d24-d27}, [r0,:128]! - vqmovun.s16 d2, q10 - vld1.64 {d28-d31}, [r0,:128]! - vqmovun.s16 d3, q11 - vst1.64 {d0}, [r1,:64], r2 - vqmovun.s16 d4, q12 - vst1.64 {d1}, [r1,:64], r2 - vqmovun.s16 d5, q13 - vst1.64 {d2}, [r1,:64], r2 - vqmovun.s16 d6, q14 - vst1.64 {d3}, [r1,:64], r2 - vqmovun.s16 d7, q15 - vst1.64 {d4}, [r1,:64], r2 - vst1.64 {d5}, [r1,:64], r2 - vst1.64 {d6}, [r1,:64], r2 - vst1.64 {d7}, [r1,:64], r2 - bx lr -endfunc - -function ff_put_signed_pixels_clamped_neon, export=1 - vmov.u8 d31, #128 - vld1.64 {d16-d17}, [r0,:128]! - vqmovn.s16 d0, q8 - vld1.64 {d18-d19}, [r0,:128]! - vqmovn.s16 d1, q9 - vld1.64 {d16-d17}, [r0,:128]! - vqmovn.s16 d2, q8 - vld1.64 {d18-d19}, [r0,:128]! - vadd.u8 d0, d0, d31 - vld1.64 {d20-d21}, [r0,:128]! - vadd.u8 d1, d1, d31 - vld1.64 {d22-d23}, [r0,:128]! - vadd.u8 d2, d2, d31 - vst1.64 {d0}, [r1,:64], r2 - vqmovn.s16 d3, q9 - vst1.64 {d1}, [r1,:64], r2 - vqmovn.s16 d4, q10 - vst1.64 {d2}, [r1,:64], r2 - vqmovn.s16 d5, q11 - vld1.64 {d24-d25}, [r0,:128]! - vadd.u8 d3, d3, d31 - vld1.64 {d26-d27}, [r0,:128]! - vadd.u8 d4, d4, d31 - vadd.u8 d5, d5, d31 - vst1.64 {d3}, [r1,:64], r2 - vqmovn.s16 d6, q12 - vst1.64 {d4}, [r1,:64], r2 - vqmovn.s16 d7, q13 - vst1.64 {d5}, [r1,:64], r2 - vadd.u8 d6, d6, d31 - vadd.u8 d7, d7, d31 - vst1.64 {d6}, [r1,:64], r2 - vst1.64 {d7}, [r1,:64], r2 - bx lr -endfunc - -function ff_add_pixels_clamped_neon, export=1 - mov r3, r1 - vld1.64 {d16}, [r1,:64], r2 - vld1.64 {d0-d1}, [r0,:128]! - vaddw.u8 q0, q0, d16 - vld1.64 {d17}, [r1,:64], r2 - vld1.64 {d2-d3}, [r0,:128]! - vqmovun.s16 d0, q0 - vld1.64 {d18}, [r1,:64], r2 - vaddw.u8 q1, q1, d17 - vld1.64 {d4-d5}, [r0,:128]! - vaddw.u8 q2, q2, d18 - vst1.64 {d0}, [r3,:64], r2 - vqmovun.s16 d2, q1 - vld1.64 {d19}, [r1,:64], r2 - vld1.64 {d6-d7}, [r0,:128]! 
- vaddw.u8 q3, q3, d19 - vqmovun.s16 d4, q2 - vst1.64 {d2}, [r3,:64], r2 - vld1.64 {d16}, [r1,:64], r2 - vqmovun.s16 d6, q3 - vld1.64 {d0-d1}, [r0,:128]! - vaddw.u8 q0, q0, d16 - vst1.64 {d4}, [r3,:64], r2 - vld1.64 {d17}, [r1,:64], r2 - vld1.64 {d2-d3}, [r0,:128]! - vaddw.u8 q1, q1, d17 - vst1.64 {d6}, [r3,:64], r2 - vqmovun.s16 d0, q0 - vld1.64 {d18}, [r1,:64], r2 - vld1.64 {d4-d5}, [r0,:128]! - vaddw.u8 q2, q2, d18 - vst1.64 {d0}, [r3,:64], r2 - vqmovun.s16 d2, q1 - vld1.64 {d19}, [r1,:64], r2 - vqmovun.s16 d4, q2 - vld1.64 {d6-d7}, [r0,:128]! - vaddw.u8 q3, q3, d19 - vst1.64 {d2}, [r3,:64], r2 - vqmovun.s16 d6, q3 - vst1.64 {d4}, [r3,:64], r2 - vst1.64 {d6}, [r3,:64], r2 - bx lr -endfunc - -function ff_float_to_int16_neon, export=1 - subs r2, r2, #8 - vld1.64 {d0-d1}, [r1,:128]! - vcvt.s32.f32 q8, q0, #16 - vld1.64 {d2-d3}, [r1,:128]! - vcvt.s32.f32 q9, q1, #16 - beq 3f - bics ip, r2, #15 - beq 2f -1: subs ip, ip, #16 - vshrn.s32 d4, q8, #16 - vld1.64 {d0-d1}, [r1,:128]! - vcvt.s32.f32 q0, q0, #16 - vshrn.s32 d5, q9, #16 - vld1.64 {d2-d3}, [r1,:128]! - vcvt.s32.f32 q1, q1, #16 - vshrn.s32 d6, q0, #16 - vst1.64 {d4-d5}, [r0,:128]! - vshrn.s32 d7, q1, #16 - vld1.64 {d16-d17},[r1,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r1,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.64 {d6-d7}, [r0,:128]! - bne 1b - ands r2, r2, #15 - beq 3f -2: vld1.64 {d0-d1}, [r1,:128]! - vshrn.s32 d4, q8, #16 - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r1,:128]! - vshrn.s32 d5, q9, #16 - vcvt.s32.f32 q1, q1, #16 - vshrn.s32 d6, q0, #16 - vst1.64 {d4-d5}, [r0,:128]! - vshrn.s32 d7, q1, #16 - vst1.64 {d6-d7}, [r0,:128]! - bx lr -3: vshrn.s32 d4, q8, #16 - vshrn.s32 d5, q9, #16 - vst1.64 {d4-d5}, [r0,:128]! - bx lr -endfunc - -function ff_float_to_int16_interleave_neon, export=1 - cmp r3, #2 - ldrlt r1, [r1] - blt ff_float_to_int16_neon - bne 4f - - ldr r3, [r1] - ldr r1, [r1, #4] - - subs r2, r2, #8 - vld1.64 {d0-d1}, [r3,:128]! 
- vcvt.s32.f32 q8, q0, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q9, q1, #16 - vld1.64 {d20-d21},[r1,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r1,:128]! - vcvt.s32.f32 q11, q11, #16 - beq 3f - bics ip, r2, #15 - beq 2f -1: subs ip, ip, #16 - vld1.64 {d0-d1}, [r3,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 q10, q8, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q1, q1, #16 - vld1.64 {d24-d25},[r1,:128]! - vcvt.s32.f32 q12, q12, #16 - vld1.64 {d26-d27},[r1,:128]! - vsri.32 q11, q9, #16 - vst1.64 {d20-d21},[r0,:128]! - vcvt.s32.f32 q13, q13, #16 - vst1.64 {d22-d23},[r0,:128]! - vsri.32 q12, q0, #16 - vld1.64 {d16-d17},[r3,:128]! - vsri.32 q13, q1, #16 - vst1.64 {d24-d25},[r0,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r3,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r1,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r1,:128]! - vcvt.s32.f32 q11, q11, #16 - vst1.64 {d26-d27},[r0,:128]! - bne 1b - ands r2, r2, #15 - beq 3f -2: vsri.32 q10, q8, #16 - vld1.64 {d0-d1}, [r3,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q1, q1, #16 - vld1.64 {d24-d25},[r1,:128]! - vcvt.s32.f32 q12, q12, #16 - vsri.32 q11, q9, #16 - vld1.64 {d26-d27},[r1,:128]! - vcvt.s32.f32 q13, q13, #16 - vst1.64 {d20-d21},[r0,:128]! - vsri.32 q12, q0, #16 - vst1.64 {d22-d23},[r0,:128]! - vsri.32 q13, q1, #16 - vst1.64 {d24-d27},[r0,:128]! - bx lr -3: vsri.32 q10, q8, #16 - vsri.32 q11, q9, #16 - vst1.64 {d20-d23},[r0,:128]! - bx lr - -4: push {r4-r8,lr} - cmp r3, #4 - lsl ip, r3, #1 - blt 4f - - @ 4 channels -5: ldmia r1!, {r4-r7} - mov lr, r2 - mov r8, r0 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r6,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r7,:128]! - vcvt.s32.f32 q11, q11, #16 -6: subs lr, lr, #8 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 q9, q8, #16 - vld1.64 {d2-d3}, [r5,:128]! 
- vcvt.s32.f32 q1, q1, #16 - vsri.32 q11, q10, #16 - vld1.64 {d4-d5}, [r6,:128]! - vcvt.s32.f32 q2, q2, #16 - vzip.32 d18, d22 - vld1.64 {d6-d7}, [r7,:128]! - vcvt.s32.f32 q3, q3, #16 - vzip.32 d19, d23 - vst1.64 {d18}, [r8], ip - vsri.32 q1, q0, #16 - vst1.64 {d22}, [r8], ip - vsri.32 q3, q2, #16 - vst1.64 {d19}, [r8], ip - vzip.32 d2, d6 - vst1.64 {d23}, [r8], ip - vzip.32 d3, d7 - beq 7f - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.64 {d2}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.64 {d6}, [r8], ip - vld1.64 {d20-d21},[r6,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.64 {d3}, [r8], ip - vld1.64 {d22-d23},[r7,:128]! - vcvt.s32.f32 q11, q11, #16 - vst1.64 {d7}, [r8], ip - b 6b -7: vst1.64 {d2}, [r8], ip - vst1.64 {d6}, [r8], ip - vst1.64 {d3}, [r8], ip - vst1.64 {d7}, [r8], ip - subs r3, r3, #4 - popeq {r4-r8,pc} - cmp r3, #4 - add r0, r0, #8 - bge 5b - - @ 2 channels -4: cmp r3, #2 - blt 4f - ldmia r1!, {r4-r5} - mov lr, r2 - mov r8, r0 - tst lr, #8 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r5,:128]! - vcvt.s32.f32 q11, q11, #16 - beq 6f - subs lr, lr, #8 - beq 7f - vsri.32 d18, d16, #16 - vsri.32 d19, d17, #16 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vst1.32 {d19[1]}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.32 {d22[0]}, [r8], ip - vst1.32 {d22[1]}, [r8], ip - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip - vld1.64 {d22-d23},[r5,:128]! - vcvt.s32.f32 q11, q11, #16 -6: subs lr, lr, #16 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 d18, d16, #16 - vld1.64 {d2-d3}, [r5,:128]! 
- vcvt.s32.f32 q1, q1, #16 - vsri.32 d19, d17, #16 - vld1.64 {d4-d5}, [r4,:128]! - vcvt.s32.f32 q2, q2, #16 - vld1.64 {d6-d7}, [r5,:128]! - vcvt.s32.f32 q3, q3, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vsri.32 d2, d0, #16 - vst1.32 {d19[1]}, [r8], ip - vsri.32 d3, d1, #16 - vst1.32 {d22[0]}, [r8], ip - vsri.32 d6, d4, #16 - vst1.32 {d22[1]}, [r8], ip - vsri.32 d7, d5, #16 - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip - beq 6f - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.32 {d2[0]}, [r8], ip - vst1.32 {d2[1]}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.32 {d3[0]}, [r8], ip - vst1.32 {d3[1]}, [r8], ip - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.32 {d6[0]}, [r8], ip - vst1.32 {d6[1]}, [r8], ip - vld1.64 {d22-d23},[r5,:128]! - vcvt.s32.f32 q11, q11, #16 - vst1.32 {d7[0]}, [r8], ip - vst1.32 {d7[1]}, [r8], ip - bgt 6b -6: vst1.32 {d2[0]}, [r8], ip - vst1.32 {d2[1]}, [r8], ip - vst1.32 {d3[0]}, [r8], ip - vst1.32 {d3[1]}, [r8], ip - vst1.32 {d6[0]}, [r8], ip - vst1.32 {d6[1]}, [r8], ip - vst1.32 {d7[0]}, [r8], ip - vst1.32 {d7[1]}, [r8], ip - b 8f -7: vsri.32 d18, d16, #16 - vsri.32 d19, d17, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vst1.32 {d19[1]}, [r8], ip - vst1.32 {d22[0]}, [r8], ip - vst1.32 {d22[1]}, [r8], ip - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip -8: subs r3, r3, #2 - add r0, r0, #4 - popeq {r4-r8,pc} - - @ 1 channel -4: ldr r4, [r1],#4 - tst r2, #8 - mov lr, r2 - mov r5, r0 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! - vcvt.s32.f32 q1, q1, #16 - bne 8f -6: subs lr, lr, #16 - vld1.64 {d4-d5}, [r4,:128]! - vcvt.s32.f32 q2, q2, #16 - vld1.64 {d6-d7}, [r4,:128]! 
- vcvt.s32.f32 q3, q3, #16 - vst1.16 {d0[1]}, [r5,:16], ip - vst1.16 {d0[3]}, [r5,:16], ip - vst1.16 {d1[1]}, [r5,:16], ip - vst1.16 {d1[3]}, [r5,:16], ip - vst1.16 {d2[1]}, [r5,:16], ip - vst1.16 {d2[3]}, [r5,:16], ip - vst1.16 {d3[1]}, [r5,:16], ip - vst1.16 {d3[3]}, [r5,:16], ip - beq 7f - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! - vcvt.s32.f32 q1, q1, #16 -7: vst1.16 {d4[1]}, [r5,:16], ip - vst1.16 {d4[3]}, [r5,:16], ip - vst1.16 {d5[1]}, [r5,:16], ip - vst1.16 {d5[3]}, [r5,:16], ip - vst1.16 {d6[1]}, [r5,:16], ip - vst1.16 {d6[3]}, [r5,:16], ip - vst1.16 {d7[1]}, [r5,:16], ip - vst1.16 {d7[3]}, [r5,:16], ip - bgt 6b - pop {r4-r8,pc} -8: subs lr, lr, #8 - vst1.16 {d0[1]}, [r5,:16], ip - vst1.16 {d0[3]}, [r5,:16], ip - vst1.16 {d1[1]}, [r5,:16], ip - vst1.16 {d1[3]}, [r5,:16], ip - vst1.16 {d2[1]}, [r5,:16], ip - vst1.16 {d2[3]}, [r5,:16], ip - vst1.16 {d3[1]}, [r5,:16], ip - vst1.16 {d3[3]}, [r5,:16], ip - popeq {r4-r8,pc} - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! - vcvt.s32.f32 q1, q1, #16 - b 6b -endfunc - -function ff_vector_fmul_neon, export=1 - mov r3, r0 - subs r2, r2, #8 - vld1.64 {d0-d3}, [r0,:128]! - vld1.64 {d4-d7}, [r1,:128]! - vmul.f32 q8, q0, q2 - vmul.f32 q9, q1, q3 - beq 3f - bics ip, r2, #15 - beq 2f -1: subs ip, ip, #16 - vld1.64 {d0-d1}, [r0,:128]! - vld1.64 {d4-d5}, [r1,:128]! - vmul.f32 q10, q0, q2 - vld1.64 {d2-d3}, [r0,:128]! - vld1.64 {d6-d7}, [r1,:128]! - vmul.f32 q11, q1, q3 - vst1.64 {d16-d19},[r3,:128]! - vld1.64 {d0-d1}, [r0,:128]! - vld1.64 {d4-d5}, [r1,:128]! - vmul.f32 q8, q0, q2 - vld1.64 {d2-d3}, [r0,:128]! - vld1.64 {d6-d7}, [r1,:128]! - vmul.f32 q9, q1, q3 - vst1.64 {d20-d23},[r3,:128]! - bne 1b - ands r2, r2, #15 - beq 3f -2: vld1.64 {d0-d1}, [r0,:128]! - vld1.64 {d4-d5}, [r1,:128]! - vst1.64 {d16-d17},[r3,:128]! - vmul.f32 q8, q0, q2 - vld1.64 {d2-d3}, [r0,:128]! - vld1.64 {d6-d7}, [r1,:128]! - vst1.64 {d18-d19},[r3,:128]! 
- vmul.f32 q9, q1, q3 -3: vst1.64 {d16-d19},[r3,:128]! - bx lr -endfunc - -function ff_vector_fmul_window_neon, export=1 -VFP vdup.32 q8, d0[0] -NOVFP vld1.32 {d16[],d17[]}, [sp,:32] - push {r4,r5,lr} -VFP ldr lr, [sp, #12] -NOVFP ldr lr, [sp, #16] - sub r2, r2, #8 - sub r5, lr, #2 - add r2, r2, r5, lsl #2 - add r4, r3, r5, lsl #3 - add ip, r0, r5, lsl #3 - mov r5, #-16 - vld1.64 {d0,d1}, [r1,:128]! - vld1.64 {d2,d3}, [r2,:128], r5 - vld1.64 {d4,d5}, [r3,:128]! - vld1.64 {d6,d7}, [r4,:128], r5 -1: subs lr, lr, #4 - vmov q11, q8 - vmla.f32 d22, d0, d4 - vmov q10, q8 - vmla.f32 d23, d1, d5 - vrev64.32 q3, q3 - vmla.f32 d20, d0, d7 - vrev64.32 q1, q1 - vmla.f32 d21, d1, d6 - beq 2f - vmla.f32 d22, d3, d7 - vld1.64 {d0,d1}, [r1,:128]! - vmla.f32 d23, d2, d6 - vld1.64 {d18,d19},[r2,:128], r5 - vmls.f32 d20, d3, d4 - vld1.64 {d24,d25},[r3,:128]! - vmls.f32 d21, d2, d5 - vld1.64 {d6,d7}, [r4,:128], r5 - vmov q1, q9 - vrev64.32 q11, q11 - vmov q2, q12 - vswp d22, d23 - vst1.64 {d20,d21},[r0,:128]! - vst1.64 {d22,d23},[ip,:128], r5 - b 1b -2: vmla.f32 d22, d3, d7 - vmla.f32 d23, d2, d6 - vmls.f32 d20, d3, d4 - vmls.f32 d21, d2, d5 - vrev64.32 q11, q11 - vswp d22, d23 - vst1.64 {d20,d21},[r0,:128]! - vst1.64 {d22,d23},[ip,:128], r5 - pop {r4,r5,pc} -endfunc - -#if CONFIG_VORBIS_DECODER -function ff_vorbis_inverse_coupling_neon, export=1 - vmov.i32 q10, #1<<31 - subs r2, r2, #4 - mov r3, r0 - mov r12, r1 - beq 3f - - vld1.32 {d24-d25},[r1,:128]! - vld1.32 {d22-d23},[r0,:128]! - vcle.s32 q8, q12, #0 - vand q9, q11, q10 - veor q12, q12, q9 - vand q2, q12, q8 - vbic q3, q12, q8 - vadd.f32 q12, q11, q2 - vsub.f32 q11, q11, q3 -1: vld1.32 {d2-d3}, [r1,:128]! - vld1.32 {d0-d1}, [r0,:128]! - vcle.s32 q8, q1, #0 - vand q9, q0, q10 - veor q1, q1, q9 - vst1.32 {d24-d25},[r3, :128]! - vst1.32 {d22-d23},[r12,:128]! - vand q2, q1, q8 - vbic q3, q1, q8 - vadd.f32 q1, q0, q2 - vsub.f32 q0, q0, q3 - subs r2, r2, #8 - ble 2f - vld1.32 {d24-d25},[r1,:128]! - vld1.32 {d22-d23},[r0,:128]! 
- vcle.s32 q8, q12, #0 - vand q9, q11, q10 - veor q12, q12, q9 - vst1.32 {d2-d3}, [r3, :128]! - vst1.32 {d0-d1}, [r12,:128]! - vand q2, q12, q8 - vbic q3, q12, q8 - vadd.f32 q12, q11, q2 - vsub.f32 q11, q11, q3 - b 1b - -2: vst1.32 {d2-d3}, [r3, :128]! - vst1.32 {d0-d1}, [r12,:128]! - bxlt lr - -3: vld1.32 {d2-d3}, [r1,:128] - vld1.32 {d0-d1}, [r0,:128] - vcle.s32 q8, q1, #0 - vand q9, q0, q10 - veor q1, q1, q9 - vand q2, q1, q8 - vbic q3, q1, q8 - vadd.f32 q1, q0, q2 - vsub.f32 q0, q0, q3 - vst1.32 {d2-d3}, [r0,:128]! - vst1.32 {d0-d1}, [r1,:128]! - bx lr -endfunc -#endif - -function ff_vector_fmul_scalar_neon, export=1 -VFP len .req r2 -NOVFP len .req r3 -VFP vdup.32 q8, d0[0] -NOVFP vdup.32 q8, r2 - bics r12, len, #15 - beq 3f - vld1.32 {q0},[r1,:128]! - vld1.32 {q1},[r1,:128]! -1: vmul.f32 q0, q0, q8 - vld1.32 {q2},[r1,:128]! - vmul.f32 q1, q1, q8 - vld1.32 {q3},[r1,:128]! - vmul.f32 q2, q2, q8 - vst1.32 {q0},[r0,:128]! - vmul.f32 q3, q3, q8 - vst1.32 {q1},[r0,:128]! - subs r12, r12, #16 - beq 2f - vld1.32 {q0},[r1,:128]! - vst1.32 {q2},[r0,:128]! - vld1.32 {q1},[r1,:128]! - vst1.32 {q3},[r0,:128]! - b 1b -2: vst1.32 {q2},[r0,:128]! - vst1.32 {q3},[r0,:128]! - ands len, len, #15 - bxeq lr -3: vld1.32 {q0},[r1,:128]! - vmul.f32 q0, q0, q8 - vst1.32 {q0},[r0,:128]! - subs len, len, #4 - bgt 3b - bx lr - .unreq len -endfunc - -function ff_vector_fmul_sv_scalar_2_neon, export=1 -VFP vdup.32 d16, d0[0] -NOVFP vdup.32 d16, r3 -NOVFP ldr r3, [sp] - vld1.32 {d0},[r1,:64]! - vld1.32 {d1},[r1,:64]! -1: subs r3, r3, #4 - vmul.f32 d4, d0, d16 - vmul.f32 d5, d1, d16 - ldr r12, [r2], #4 - vld1.32 {d2},[r12,:64] - ldr r12, [r2], #4 - vld1.32 {d3},[r12,:64] - vmul.f32 d4, d4, d2 - vmul.f32 d5, d5, d3 - beq 2f - vld1.32 {d0},[r1,:64]! - vld1.32 {d1},[r1,:64]! - vst1.32 {d4},[r0,:64]! - vst1.32 {d5},[r0,:64]! - b 1b -2: vst1.32 {d4},[r0,:64]! - vst1.32 {d5},[r0,:64]! 
- bx lr -endfunc - -function ff_vector_fmul_sv_scalar_4_neon, export=1 -VFP vdup.32 q10, d0[0] -NOVFP vdup.32 q10, r3 -NOVFP ldr r3, [sp] - push {lr} - bics lr, r3, #7 - beq 3f - vld1.32 {q0},[r1,:128]! - vld1.32 {q2},[r1,:128]! -1: ldr r12, [r2], #4 - vld1.32 {q1},[r12,:128] - ldr r12, [r2], #4 - vld1.32 {q3},[r12,:128] - vmul.f32 q8, q0, q10 - vmul.f32 q8, q8, q1 - vmul.f32 q9, q2, q10 - vmul.f32 q9, q9, q3 - subs lr, lr, #8 - beq 2f - vld1.32 {q0},[r1,:128]! - vld1.32 {q2},[r1,:128]! - vst1.32 {q8},[r0,:128]! - vst1.32 {q9},[r0,:128]! - b 1b -2: vst1.32 {q8},[r0,:128]! - vst1.32 {q9},[r0,:128]! - ands r3, r3, #7 - popeq {pc} -3: vld1.32 {q0},[r1,:128]! - ldr r12, [r2], #4 - vld1.32 {q1},[r12,:128] - vmul.f32 q0, q0, q10 - vmul.f32 q0, q0, q1 - vst1.32 {q0},[r0,:128]! - subs r3, r3, #4 - bgt 3b - pop {pc} -endfunc - -function ff_sv_fmul_scalar_2_neon, export=1 -VFP len .req r2 -NOVFP len .req r3 -VFP vdup.32 q8, d0[0] -NOVFP vdup.32 q8, r2 - ldr r12, [r1], #4 - vld1.32 {d0},[r12,:64] - ldr r12, [r1], #4 - vld1.32 {d1},[r12,:64] -1: vmul.f32 q1, q0, q8 - subs len, len, #4 - beq 2f - ldr r12, [r1], #4 - vld1.32 {d0},[r12,:64] - ldr r12, [r1], #4 - vld1.32 {d1},[r12,:64] - vst1.32 {q1},[r0,:128]! - b 1b -2: vst1.32 {q1},[r0,:128]! - bx lr - .unreq len -endfunc - -function ff_sv_fmul_scalar_4_neon, export=1 -VFP len .req r2 -NOVFP len .req r3 -VFP vdup.32 q8, d0[0] -NOVFP vdup.32 q8, r2 -1: ldr r12, [r1], #4 - vld1.32 {q0},[r12,:128] - vmul.f32 q0, q0, q8 - vst1.32 {q0},[r0,:128]! - subs len, len, #4 - bgt 1b - bx lr - .unreq len -endfunc - -function ff_butterflies_float_neon, export=1 -1: vld1.32 {q0},[r0,:128] - vld1.32 {q1},[r1,:128] - vsub.f32 q2, q0, q1 - vadd.f32 q1, q0, q1 - vst1.32 {q2},[r1,:128]! - vst1.32 {q1},[r0,:128]! - subs r2, r2, #4 - bgt 1b - bx lr -endfunc - -function ff_scalarproduct_float_neon, export=1 - vmov.f32 q2, #0.0 -1: vld1.32 {q0},[r0,:128]! - vld1.32 {q1},[r1,:128]! 
- vmla.f32 q2, q0, q1 - subs r2, r2, #4 - bgt 1b - vadd.f32 d0, d4, d5 - vpadd.f32 d0, d0, d0 -NOVFP vmov.32 r0, d0[0] - bx lr -endfunc - -function ff_int32_to_float_fmul_scalar_neon, export=1 -VFP vdup.32 q0, d0[0] -VFP len .req r2 -NOVFP vdup.32 q0, r2 -NOVFP len .req r3 - - vld1.32 {q1},[r1,:128]! - vcvt.f32.s32 q3, q1 - vld1.32 {q2},[r1,:128]! - vcvt.f32.s32 q8, q2 -1: subs len, len, #8 - pld [r1, #16] - vmul.f32 q9, q3, q0 - vmul.f32 q10, q8, q0 - beq 2f - vld1.32 {q1},[r1,:128]! - vcvt.f32.s32 q3, q1 - vld1.32 {q2},[r1,:128]! - vcvt.f32.s32 q8, q2 - vst1.32 {q9}, [r0,:128]! - vst1.32 {q10},[r0,:128]! - b 1b -2: vst1.32 {q9}, [r0,:128]! - vst1.32 {q10},[r0,:128]! - bx lr - .unreq len -endfunc - -function ff_vector_fmul_reverse_neon, export=1 - add r2, r2, r3, lsl #2 - sub r2, r2, #32 - mov r12, #-32 - vld1.32 {q0-q1}, [r1,:128]! - vld1.32 {q2-q3}, [r2,:128], r12 -1: pld [r1, #32] - vrev64.32 q3, q3 - vmul.f32 d16, d0, d7 - vmul.f32 d17, d1, d6 - pld [r2, #-32] - vrev64.32 q2, q2 - vmul.f32 d18, d2, d5 - vmul.f32 d19, d3, d4 - subs r3, r3, #8 - beq 2f - vld1.32 {q0-q1}, [r1,:128]! - vld1.32 {q2-q3}, [r2,:128], r12 - vst1.32 {q8-q9}, [r0,:128]! - b 1b -2: vst1.32 {q8-q9}, [r0,:128]! - bx lr -endfunc - -function ff_vector_fmul_add_neon, export=1 - ldr r12, [sp] - vld1.32 {q0-q1}, [r1,:128]! - vld1.32 {q8-q9}, [r2,:128]! - vld1.32 {q2-q3}, [r3,:128]! - vmul.f32 q10, q0, q8 - vmul.f32 q11, q1, q9 -1: vadd.f32 q12, q2, q10 - vadd.f32 q13, q3, q11 - pld [r1, #16] - pld [r2, #16] - pld [r3, #16] - subs r12, r12, #8 - beq 2f - vld1.32 {q0}, [r1,:128]! - vld1.32 {q8}, [r2,:128]! - vmul.f32 q10, q0, q8 - vld1.32 {q1}, [r1,:128]! - vld1.32 {q9}, [r2,:128]! - vmul.f32 q11, q1, q9 - vld1.32 {q2-q3}, [r3,:128]! - vst1.32 {q12-q13},[r0,:128]! - b 1b -2: vst1.32 {q12-q13},[r0,:128]! - bx lr -endfunc - -function ff_vector_clipf_neon, export=1 -VFP vdup.32 q1, d0[1] -VFP vdup.32 q0, d0[0] -NOVFP vdup.32 q0, r2 -NOVFP vdup.32 q1, r3 -NOVFP ldr r2, [sp] - vld1.f32 {q2},[r1,:128]! 
- vmin.f32 q10, q2, q1 - vld1.f32 {q3},[r1,:128]! - vmin.f32 q11, q3, q1 -1: vmax.f32 q8, q10, q0 - vmax.f32 q9, q11, q0 - subs r2, r2, #8 - beq 2f - vld1.f32 {q2},[r1,:128]! - vmin.f32 q10, q2, q1 - vld1.f32 {q3},[r1,:128]! - vmin.f32 q11, q3, q1 - vst1.f32 {q8},[r0,:128]! - vst1.f32 {q9},[r0,:128]! - b 1b -2: vst1.f32 {q8},[r0,:128]! - vst1.f32 {q9},[r0,:128]! - bx lr -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_vfp.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_vfp.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2008 Siarhei Siamashka - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "asm.S" - - .syntax unified -/* - * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle - * throughput for almost all the instructions (except for double precision - * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles - * for arithmetic operations. Scheduling code to avoid pipeline stalls is very - * important for performance. 
One more interesting feature is that VFP has - * independent load/store and arithmetics pipelines, so it is possible to make - * them work simultaneously and get more than 1 operation per cycle. Load/store - * pipeline can process 2 single precision floating point values per cycle and - * supports bulk loads and stores for large sets of registers. Arithmetic operations - * can be done on vectors, which allows to keep the arithmetics pipeline busy, - * while the processor may issue and execute other instructions. Detailed - * optimization manuals can be found at http://www.arm.com - */ - -/** - * ARM VFP optimized implementation of 'vector_fmul_c' function. - * Assume that len is a positive number and is multiple of 8 - */ -@ void ff_vector_fmul_vfp(float *dst, const float *src, int len) -function ff_vector_fmul_vfp, export=1 - vpush {d8-d15} - mov r3, r0 - fmrx r12, fpscr - orr r12, r12, #(3 << 16) /* set vector size to 4 */ - fmxr fpscr, r12 - - vldmia r3!, {s0-s3} - vldmia r1!, {s8-s11} - vldmia r3!, {s4-s7} - vldmia r1!, {s12-s15} - vmul.f32 s8, s0, s8 -1: - subs r2, r2, #16 - vmul.f32 s12, s4, s12 - vldmiage r3!, {s16-s19} - vldmiage r1!, {s24-s27} - vldmiage r3!, {s20-s23} - vldmiage r1!, {s28-s31} - vmulge.f32 s24, s16, s24 - vstmia r0!, {s8-s11} - vstmia r0!, {s12-s15} - vmulge.f32 s28, s20, s28 - vldmiagt r3!, {s0-s3} - vldmiagt r1!, {s8-s11} - vldmiagt r3!, {s4-s7} - vldmiagt r1!, {s12-s15} - vmulge.f32 s8, s0, s8 - vstmiage r0!, {s24-s27} - vstmiage r0!, {s28-s31} - bgt 1b - - bic r12, r12, #(7 << 16) /* set vector size back to 1 */ - fmxr fpscr, r12 - vpop {d8-d15} - bx lr -endfunc - -/** - * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. 
- * Assume that len is a positive number and is multiple of 8 - */ -@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, -@ const float *src1, int len) -function ff_vector_fmul_reverse_vfp, export=1 - vpush {d8-d15} - add r2, r2, r3, lsl #2 - vldmdb r2!, {s0-s3} - vldmia r1!, {s8-s11} - vldmdb r2!, {s4-s7} - vldmia r1!, {s12-s15} - vmul.f32 s8, s3, s8 - vmul.f32 s9, s2, s9 - vmul.f32 s10, s1, s10 - vmul.f32 s11, s0, s11 -1: - subs r3, r3, #16 - vldmdbge r2!, {s16-s19} - vmul.f32 s12, s7, s12 - vldmiage r1!, {s24-s27} - vmul.f32 s13, s6, s13 - vldmdbge r2!, {s20-s23} - vmul.f32 s14, s5, s14 - vldmiage r1!, {s28-s31} - vmul.f32 s15, s4, s15 - vmulge.f32 s24, s19, s24 - vldmdbgt r2!, {s0-s3} - vmulge.f32 s25, s18, s25 - vstmia r0!, {s8-s13} - vmulge.f32 s26, s17, s26 - vldmiagt r1!, {s8-s11} - vmulge.f32 s27, s16, s27 - vmulge.f32 s28, s23, s28 - vldmdbgt r2!, {s4-s7} - vmulge.f32 s29, s22, s29 - vstmia r0!, {s14-s15} - vmulge.f32 s30, s21, s30 - vmulge.f32 s31, s20, s31 - vmulge.f32 s8, s3, s8 - vldmiagt r1!, {s12-s15} - vmulge.f32 s9, s2, s9 - vmulge.f32 s10, s1, s10 - vstmiage r0!, {s24-s27} - vmulge.f32 s11, s0, s11 - vstmiage r0!, {s28-s31} - bgt 1b - - vpop {d8-d15} - bx lr -endfunc - -#if HAVE_ARMV6 -/** - * ARM VFP optimized float to int16 conversion. 
- * Assume that len is a positive number and is multiple of 8, destination - * buffer is at least 4 bytes aligned (8 bytes alignment is better for - * performance), little endian byte sex - */ -@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) -function ff_float_to_int16_vfp, export=1 - push {r4-r8,lr} - vpush {d8-d11} - vldmia r1!, {s16-s23} - vcvt.s32.f32 s0, s16 - vcvt.s32.f32 s1, s17 - vcvt.s32.f32 s2, s18 - vcvt.s32.f32 s3, s19 - vcvt.s32.f32 s4, s20 - vcvt.s32.f32 s5, s21 - vcvt.s32.f32 s6, s22 - vcvt.s32.f32 s7, s23 -1: - subs r2, r2, #8 - vmov r3, r4, s0, s1 - vmov r5, r6, s2, s3 - vmov r7, r8, s4, s5 - vmov ip, lr, s6, s7 - vldmiagt r1!, {s16-s23} - ssat r4, #16, r4 - ssat r3, #16, r3 - ssat r6, #16, r6 - ssat r5, #16, r5 - pkhbt r3, r3, r4, lsl #16 - pkhbt r4, r5, r6, lsl #16 - vcvtgt.s32.f32 s0, s16 - vcvtgt.s32.f32 s1, s17 - vcvtgt.s32.f32 s2, s18 - vcvtgt.s32.f32 s3, s19 - vcvtgt.s32.f32 s4, s20 - vcvtgt.s32.f32 s5, s21 - vcvtgt.s32.f32 s6, s22 - vcvtgt.s32.f32 s7, s23 - ssat r8, #16, r8 - ssat r7, #16, r7 - ssat lr, #16, lr - ssat ip, #16, ip - pkhbt r5, r7, r8, lsl #16 - pkhbt r6, ip, lr, lsl #16 - stmia r0!, {r3-r6} - bgt 1b - - vpop {d8-d11} - pop {r4-r8,pc} -endfunc -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/fft_init_arm.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/fft_init_arm.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/fft.h" -#include "libavcodec/synth_filter.h" - -void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); -void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); - -void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); - -void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); - -void ff_synth_filter_float_neon(FFTContext *imdct, - float *synth_buf_ptr, int *synth_buf_offset, - float synth_buf2[32], const float window[512], - float out[32], const float in[32], - float scale, float bias); - -av_cold void ff_fft_init_arm(FFTContext *s) -{ - if (HAVE_NEON) { - s->fft_permute = ff_fft_permute_neon; - s->fft_calc = ff_fft_calc_neon; - s->imdct_calc = ff_imdct_calc_neon; - s->imdct_half = ff_imdct_half_neon; - s->mdct_calc = ff_mdct_calc_neon; - s->permutation = FF_MDCT_PERM_INTERLEAVE; - } -} - -#if CONFIG_RDFT -av_cold void ff_rdft_init_arm(RDFTContext *s) -{ - if (HAVE_NEON) - s->rdft_calc = ff_rdft_calc_neon; -} -#endif - -#if CONFIG_DCA_DECODER -av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) -{ - if (HAVE_NEON) - s->synth_filter_float = ff_synth_filter_float_neon; -} -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/fft_neon.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/fft_neon.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,371 +0,0 @@ -/* - * ARM NEON optimised FFT - * - * Copyright (c) 2009 Mans Rullgard - * Copyright (c) 2009 Naotoshi Nojiri - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - -#define M_SQRT1_2 0.70710678118654752440 - - .text - -function fft4_neon - vld1.32 {d0-d3}, [r0,:128] - - vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2 - vsub.f32 d6, d0, d1 @ r0-r1,i0-i1 - vsub.f32 d7, d16, d17 @ r3-r2,i2-i3 - vadd.f32 d4, d0, d1 @ r0+r1,i0+i1 - vadd.f32 d5, d2, d3 @ i2+i3,r2+r3 - vadd.f32 d1, d6, d7 - vsub.f32 d3, d6, d7 - vadd.f32 d0, d4, d5 - vsub.f32 d2, d4, d5 - - vst1.32 {d0-d3}, [r0,:128] - - bx lr -endfunc - -function fft8_neon - mov r1, r0 - vld1.32 {d0-d3}, [r1,:128]! 
- vld1.32 {d16-d19}, [r1,:128] - - movw r2, #0x04f3 @ sqrt(1/2) - movt r2, #0x3f35 - eor r3, r2, #1<<31 - vdup.32 d31, r2 - - vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2 - vadd.f32 d4, d16, d17 @ r4+r5,i4+i5 - vmov d28, r3, r2 - vadd.f32 d5, d18, d19 @ r6+r7,i6+i7 - vsub.f32 d17, d16, d17 @ r4-r5,i4-i5 - vsub.f32 d19, d18, d19 @ r6-r7,i6-i7 - vrev64.32 d29, d28 - vadd.f32 d20, d0, d1 @ r0+r1,i0+i1 - vadd.f32 d21, d2, d3 @ r2+r3,i2+i3 - vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w - vext.32 q3, q2, q2, #1 - vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w - vsub.f32 d23, d22, d23 @ i2-i3,r3-r2 - vsub.f32 d22, d0, d1 @ r0-r1,i0-i1 - vmul.f32 d24, d17, d31 @ a2r*w,a2i*w - vmul.f32 d25, d19, d31 @ a3r*w,a3i*w - vadd.f32 d0, d20, d21 - vsub.f32 d2, d20, d21 - vadd.f32 d1, d22, d23 - vrev64.32 q13, q13 - vsub.f32 d3, d22, d23 - vsub.f32 d6, d6, d7 - vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2 - vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6 - vadd.f32 d7, d4, d5 - vsub.f32 d18, d2, d6 - vext.32 q13, q12, q12, #1 - vadd.f32 d2, d2, d6 - vsub.f32 d16, d0, d7 - vadd.f32 d5, d25, d24 - vsub.f32 d4, d26, d27 - vadd.f32 d0, d0, d7 - vsub.f32 d17, d1, d5 - vsub.f32 d19, d3, d4 - vadd.f32 d3, d3, d4 - vadd.f32 d1, d1, d5 - - vst1.32 {d16-d19}, [r1,:128] - vst1.32 {d0-d3}, [r0,:128] - - bx lr -endfunc - -function fft16_neon - movrel r1, mppm - vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3} - pld [r0, #32] - vld1.32 {d2-d3}, [r1,:128] - vext.32 q13, q9, q9, #1 - vld1.32 {d22-d25}, [r0,:128]! 
@ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7} - vadd.f32 d4, d16, d17 - vsub.f32 d5, d16, d17 - vadd.f32 d18, d18, d19 - vsub.f32 d19, d26, d27 - - vadd.f32 d20, d22, d23 - vsub.f32 d22, d22, d23 - vsub.f32 d23, d24, d25 - vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1} - vadd.f32 d21, d24, d25 - vmul.f32 d24, d22, d2 - vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3} - vmul.f32 d25, d23, d3 - vuzp.32 d16, d17 @ {r0,r1,i0,i1} - vmul.f32 q1, q11, d2[1] - vuzp.32 d18, d19 @ {r2,r3,i2,i3} - vrev64.32 q12, q12 - vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6} - vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11} - vzip.32 q10, q11 - vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15} - vadd.f32 d0, d22, d20 - vadd.f32 d1, d21, d23 - vsub.f32 d2, d21, d23 - vsub.f32 d3, d22, d20 - sub r0, r0, #96 - vext.32 q13, q13, q13, #1 - vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5} - vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} - vext.32 q15, q15, q15, #1 - vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7} - vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10} - vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3} - vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14} - vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6} - vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a} - movrel r2, X(ff_cos_16) - vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8} - vrev64.32 d1, d1 - vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a} - vrev64.32 d3, d3 - movrel r3, pmmp - vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8} - vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a} - vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9} - vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13} - vld1.32 {d4-d5}, [r2,:64] - vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11} - vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15} - vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13} - vld1.32 {d6-d7}, [r3,:128] - vrev64.32 q1, q14 - vmul.f32 q14, q14, d4[1] - vmul.f32 q1, q1, q3 - vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a} - vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15} - vzip.32 q12, q14 - vadd.f32 d0, d28, d24 - vadd.f32 
d1, d25, d29 - vsub.f32 d2, d25, d29 - vsub.f32 d3, d28, d24 - vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9} - vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} - vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13} - mov r1, #32 - vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5} - vrev64.32 q0, q13 - vmul.f32 q13, q13, d5[0] - vrev64.32 q1, q15 - vmul.f32 q15, q15, d5[1] - vst2.32 {d16-d17},[r0,:128], r1 - vmul.f32 q0, q0, q3 - vst2.32 {d20-d21},[r0,:128], r1 - vmul.f32 q1, q1, q3 - vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6} - vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a} - vst2.32 {d24-d25},[r0,:128], r1 - vst2.32 {d28-d29},[r0,:128] - vzip.32 q13, q15 - sub r0, r0, #80 - vadd.f32 d0, d30, d26 - vadd.f32 d1, d27, d31 - vsub.f32 d2, d27, d31 - vsub.f32 d3, d30, d26 - vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11} - vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3} - vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15} - vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7} - vst2.32 {d18-d19},[r0,:128], r1 - vst2.32 {d22-d23},[r0,:128], r1 - vst2.32 {d26-d27},[r0,:128], r1 - vst2.32 {d30-d31},[r0,:128] - bx lr -endfunc - -function fft_pass_neon - push {r4-r6,lr} - mov r6, r2 @ n - lsl r5, r2, #3 @ 2 * n * sizeof FFTSample - lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex - lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex - add r3, r2, r4 - add r4, r4, r0 @ &z[o1] - add r2, r2, r0 @ &z[o2] - add r3, r3, r0 @ &z[o3] - vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} - movrel r12, pmmp - vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} - add r5, r5, r1 @ wim - vld1.32 {d6-d7}, [r12,:128] @ pmmp - vswp d21, d22 - vld1.32 {d4}, [r1,:64]! 
@ {wre[0],wre[1]} - sub r5, r5, #4 @ wim-- - vrev64.32 q1, q11 - vmul.f32 q11, q11, d4[1] - vmul.f32 q1, q1, q3 - vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1] - vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} - vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} - sub r6, r6, #1 @ n-- - vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} - vzip.32 q10, q11 - vadd.f32 d0, d22, d20 - vadd.f32 d1, d21, d23 - vsub.f32 d2, d21, d23 - vsub.f32 d3, d22, d20 - vsub.f32 q10, q8, q0 - vadd.f32 q8, q8, q0 - vsub.f32 q11, q9, q1 - vadd.f32 q9, q9, q1 - vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]} - vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]} - vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]} - vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]} - sub r5, r5, #8 @ wim -= 2 -1: - vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} - vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} - vswp d21, d22 - vld1.32 {d4}, [r1]! @ {wre[0],wre[1]} - vrev64.32 q0, q10 - vmul.f32 q10, q10, d4[0] - vrev64.32 q1, q11 - vmul.f32 q11, q11, d4[1] - vld1.32 {d5}, [r5] @ {wim[-1],wim[0]} - vmul.f32 q0, q0, q3 - sub r5, r5, #8 @ wim -= 2 - vmul.f32 q1, q1, q3 - vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6} - vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} - vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} - subs r6, r6, #1 @ n-- - vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} - vzip.32 q10, q11 - vadd.f32 d0, d22, d20 - vadd.f32 d1, d21, d23 - vsub.f32 d2, d21, d23 - vsub.f32 d3, d22, d20 - vsub.f32 q10, q8, q0 - vadd.f32 q8, q8, q0 - vsub.f32 q11, q9, q1 - vadd.f32 q9, q9, q1 - vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]} - vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]} - vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]} - vst2.32 {d18-d19}, [r4,:128]! 
@ {z[o1],z[o1+1]} - bne 1b - - pop {r4-r6,pc} -endfunc - -.macro def_fft n, n2, n4 - .align 6 -function fft\n\()_neon - push {r4, lr} - mov r4, r0 - bl fft\n2\()_neon - add r0, r4, #\n4*2*8 - bl fft\n4\()_neon - add r0, r4, #\n4*3*8 - bl fft\n4\()_neon - mov r0, r4 - pop {r4, lr} - movrel r1, X(ff_cos_\n) - mov r2, #\n4/2 - b fft_pass_neon -endfunc -.endm - - def_fft 32, 16, 8 - def_fft 64, 32, 16 - def_fft 128, 64, 32 - def_fft 256, 128, 64 - def_fft 512, 256, 128 - def_fft 1024, 512, 256 - def_fft 2048, 1024, 512 - def_fft 4096, 2048, 1024 - def_fft 8192, 4096, 2048 - def_fft 16384, 8192, 4096 - def_fft 32768, 16384, 8192 - def_fft 65536, 32768, 16384 - -function ff_fft_calc_neon, export=1 - ldr r2, [r0] - sub r2, r2, #2 - movrel r3, fft_tab_neon - ldr r3, [r3, r2, lsl #2] - mov r0, r1 - bx r3 -endfunc - -function ff_fft_permute_neon, export=1 - push {r4,lr} - mov r12, #1 - ldr r2, [r0] @ nbits - ldr r3, [r0, #20] @ tmp_buf - ldr r0, [r0, #8] @ revtab - lsl r12, r12, r2 - mov r2, r12 -1: - vld1.32 {d0-d1}, [r1,:128]! - ldr r4, [r0], #4 - uxth lr, r4 - uxth r4, r4, ror #16 - add lr, r3, lr, lsl #3 - add r4, r3, r4, lsl #3 - vst1.32 {d0}, [lr,:64] - vst1.32 {d1}, [r4,:64] - subs r12, r12, #2 - bgt 1b - - sub r1, r1, r2, lsl #3 -1: - vld1.32 {d0-d3}, [r3,:128]! - vst1.32 {d0-d3}, [r1,:128]! - subs r2, r2, #4 - bgt 1b - - pop {r4,pc} -endfunc - - .section .rodata - .align 4 -fft_tab_neon: - .word fft4_neon - .word fft8_neon - .word fft16_neon - .word fft32_neon - .word fft64_neon - .word fft128_neon - .word fft256_neon - .word fft512_neon - .word fft1024_neon - .word fft2048_neon - .word fft4096_neon - .word fft8192_neon - .word fft16384_neon - .word fft32768_neon - .word fft65536_neon - .size fft_tab_neon, . 
- fft_tab_neon - - .align 4 -pmmp: .float +1.0, -1.0, -1.0, +1.0 -mppm: .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_init_arm.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_init_arm.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2010 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "libavcodec/dsputil.h" -#include "libavcodec/h264dsp.h" - -void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); - -void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void 
ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); -void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den, - int weight, int offset); - -void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); -void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride, - int log2_den, int weightd, int weights, - int offset); - -void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); -void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); -void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset, - DCTELEM *block, int stride, - const uint8_t nnzc[6*8]); -void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset, - DCTELEM *block, int stride, - const uint8_t 
nnzc[6*8]); -void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset, - DCTELEM *block, int stride, - const uint8_t nnzc[6*8]); - -#if HAVE_NEON -static void ff_h264dsp_init_neon(H264DSPContext *c) -{ - c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; - c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; - c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; - c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; - - c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon; - c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon; - c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon; - c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon; - c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon; - c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon; - c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon; - c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon; - - c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; - c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; - c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; - c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon; - c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon; - c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon; - c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon; - c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon; - - c->h264_idct_add = ff_h264_idct_add_neon; - c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; - c->h264_idct_add16 = ff_h264_idct_add16_neon; - c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; - c->h264_idct_add8 = ff_h264_idct_add8_neon; -} -#endif - -void ff_h264dsp_init_arm(H264DSPContext *c) -{ - if (HAVE_NEON) ff_h264dsp_init_neon(c); -} diff -r 11d15c47beaf -r 897f711a7157 
ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_neon.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_neon.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1883 +0,0 @@ -/* - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - - .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 - vtrn.32 \r0, \r4 - vtrn.32 \r1, \r5 - vtrn.32 \r2, \r6 - vtrn.32 \r3, \r7 - vtrn.16 \r0, \r2 - vtrn.16 \r1, \r3 - vtrn.16 \r4, \r6 - vtrn.16 \r5, \r7 - vtrn.8 \r0, \r1 - vtrn.8 \r2, \r3 - vtrn.8 \r4, \r5 - vtrn.8 \r6, \r7 - .endm - - .macro transpose_4x4 r0 r1 r2 r3 - vtrn.16 \r0, \r2 - vtrn.16 \r1, \r3 - vtrn.8 \r0, \r1 - vtrn.8 \r2, \r3 - .endm - - .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 - vswp \r0, \r4 - vswp \r1, \r5 - vswp \r2, \r6 - vswp \r3, \r7 - .endm - - .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 - vtrn.32 \r0, \r2 - vtrn.32 \r1, \r3 - vtrn.32 \r4, \r6 - vtrn.32 \r5, \r7 - vtrn.16 \r0, \r1 - vtrn.16 \r2, \r3 - vtrn.16 \r4, \r5 - vtrn.16 \r6, \r7 - .endm - -/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ - .macro h264_chroma_mc8 type -function ff_\type\()_h264_chroma_mc8_neon, export=1 - push {r4-r7, lr} - ldrd r4, [sp, #20] -.ifc \type,avg - mov lr, r0 
-.endif - pld [r1] - pld [r1, r2] - - muls r7, r4, r5 - rsb r6, r7, r5, lsl #3 - rsb ip, r7, r4, lsl #3 - sub r4, r7, r4, lsl #3 - sub r4, r4, r5, lsl #3 - add r4, r4, #64 - - beq 2f - - add r5, r1, r2 - - vdup.8 d0, r4 - lsl r4, r2, #1 - vdup.8 d1, ip - vld1.64 {d4, d5}, [r1], r4 - vdup.8 d2, r6 - vld1.64 {d6, d7}, [r5], r4 - vdup.8 d3, r7 - - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - -1: pld [r5] - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d5, d1 - vld1.64 {d4, d5}, [r1], r4 - vmlal.u8 q8, d6, d2 - vext.8 d5, d4, d5, #1 - vmlal.u8 q8, d7, d3 - vmull.u8 q9, d6, d0 - subs r3, r3, #2 - vmlal.u8 q9, d7, d1 - vmlal.u8 q9, d4, d2 - vmlal.u8 q9, d5, d3 - vrshrn.u16 d16, q8, #6 - vld1.64 {d6, d7}, [r5], r4 - pld [r1] - vrshrn.u16 d17, q9, #6 -.ifc \type,avg - vld1.64 {d20}, [lr,:64], r2 - vld1.64 {d21}, [lr,:64], r2 - vrhadd.u8 q8, q8, q10 -.endif - vext.8 d7, d6, d7, #1 - vst1.64 {d16}, [r0,:64], r2 - vst1.64 {d17}, [r0,:64], r2 - bgt 1b - - pop {r4-r7, pc} - -2: tst r6, r6 - add ip, ip, r6 - vdup.8 d0, r4 - vdup.8 d1, ip - - beq 4f - - add r5, r1, r2 - lsl r4, r2, #1 - vld1.64 {d4}, [r1], r4 - vld1.64 {d6}, [r5], r4 - -3: pld [r5] - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d6, d1 - vld1.64 {d4}, [r1], r4 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d4, d1 - vld1.64 {d6}, [r5], r4 - vrshrn.u16 d16, q8, #6 - vrshrn.u16 d17, q9, #6 -.ifc \type,avg - vld1.64 {d20}, [lr,:64], r2 - vld1.64 {d21}, [lr,:64], r2 - vrhadd.u8 q8, q8, q10 -.endif - subs r3, r3, #2 - pld [r1] - vst1.64 {d16}, [r0,:64], r2 - vst1.64 {d17}, [r0,:64], r2 - bgt 3b - - pop {r4-r7, pc} - -4: vld1.64 {d4, d5}, [r1], r2 - vld1.64 {d6, d7}, [r1], r2 - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - -5: pld [r1] - subs r3, r3, #2 - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d5, d1 - vld1.64 {d4, d5}, [r1], r2 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d7, d1 - pld [r1] - vext.8 d5, d4, d5, #1 - vrshrn.u16 d16, q8, #6 - vrshrn.u16 d17, q9, #6 -.ifc \type,avg - vld1.64 {d20}, [lr,:64], r2 - vld1.64 {d21}, [lr,:64], r2 - vrhadd.u8 q8, q8, 
q10 -.endif - vld1.64 {d6, d7}, [r1], r2 - vext.8 d7, d6, d7, #1 - vst1.64 {d16}, [r0,:64], r2 - vst1.64 {d17}, [r0,:64], r2 - bgt 5b - - pop {r4-r7, pc} -endfunc - .endm - -/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ - .macro h264_chroma_mc4 type -function ff_\type\()_h264_chroma_mc4_neon, export=1 - push {r4-r7, lr} - ldrd r4, [sp, #20] -.ifc \type,avg - mov lr, r0 -.endif - pld [r1] - pld [r1, r2] - - muls r7, r4, r5 - rsb r6, r7, r5, lsl #3 - rsb ip, r7, r4, lsl #3 - sub r4, r7, r4, lsl #3 - sub r4, r4, r5, lsl #3 - add r4, r4, #64 - - beq 2f - - add r5, r1, r2 - - vdup.8 d0, r4 - lsl r4, r2, #1 - vdup.8 d1, ip - vld1.64 {d4}, [r1], r4 - vdup.8 d2, r6 - vld1.64 {d6}, [r5], r4 - vdup.8 d3, r7 - - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vtrn.32 d4, d5 - vtrn.32 d6, d7 - - vtrn.32 d0, d1 - vtrn.32 d2, d3 - -1: pld [r5] - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d6, d2 - vld1.64 {d4}, [r1], r4 - vext.8 d5, d4, d5, #1 - vtrn.32 d4, d5 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d4, d2 - vld1.64 {d6}, [r5], r4 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - vrshrn.u16 d16, q8, #6 - subs r3, r3, #2 - pld [r1] -.ifc \type,avg - vld1.32 {d20[0]}, [lr,:32], r2 - vld1.32 {d20[1]}, [lr,:32], r2 - vrhadd.u8 d16, d16, d20 -.endif - vext.8 d7, d6, d7, #1 - vtrn.32 d6, d7 - vst1.32 {d16[0]}, [r0,:32], r2 - vst1.32 {d16[1]}, [r0,:32], r2 - bgt 1b - - pop {r4-r7, pc} - -2: tst r6, r6 - add ip, ip, r6 - vdup.8 d0, r4 - vdup.8 d1, ip - vtrn.32 d0, d1 - - beq 4f - - vext.32 d1, d0, d1, #1 - add r5, r1, r2 - lsl r4, r2, #1 - vld1.32 {d4[0]}, [r1], r4 - vld1.32 {d4[1]}, [r5], r4 - -3: pld [r5] - vmull.u8 q8, d4, d0 - vld1.32 {d4[0]}, [r1], r4 - vmull.u8 q9, d4, d1 - vld1.32 {d4[1]}, [r5], r4 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - vrshrn.u16 d16, q8, #6 -.ifc \type,avg - vld1.32 {d20[0]}, [lr,:32], r2 - vld1.32 {d20[1]}, [lr,:32], r2 - vrhadd.u8 d16, d16, d20 -.endif - subs r3, r3, #2 - pld [r1] - vst1.32 {d16[0]}, [r0,:32], r2 - vst1.32 
{d16[1]}, [r0,:32], r2 - bgt 3b - - pop {r4-r7, pc} - -4: vld1.64 {d4}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vtrn.32 d4, d5 - vtrn.32 d6, d7 - -5: vmull.u8 q8, d4, d0 - vmull.u8 q9, d6, d0 - subs r3, r3, #2 - vld1.64 {d4}, [r1], r2 - vext.8 d5, d4, d5, #1 - vtrn.32 d4, d5 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - pld [r1] - vrshrn.u16 d16, q8, #6 -.ifc \type,avg - vld1.32 {d20[0]}, [lr,:32], r2 - vld1.32 {d20[1]}, [lr,:32], r2 - vrhadd.u8 d16, d16, d20 -.endif - vld1.64 {d6}, [r1], r2 - vext.8 d7, d6, d7, #1 - vtrn.32 d6, d7 - pld [r1] - vst1.32 {d16[0]}, [r0,:32], r2 - vst1.32 {d16[1]}, [r0,:32], r2 - bgt 5b - - pop {r4-r7, pc} -endfunc - .endm - - .macro h264_chroma_mc2 type -function ff_\type\()_h264_chroma_mc2_neon, export=1 - push {r4-r6, lr} - ldr r4, [sp, #16] - ldr lr, [sp, #20] - pld [r1] - pld [r1, r2] - orrs r5, r4, lr - beq 2f - - mul r5, r4, lr - rsb r6, r5, lr, lsl #3 - rsb r12, r5, r4, lsl #3 - sub r4, r5, r4, lsl #3 - sub r4, r4, lr, lsl #3 - add r4, r4, #64 - vdup.8 d0, r4 - vdup.8 d2, r12 - vdup.8 d1, r6 - vdup.8 d3, r5 - vtrn.16 q0, q1 -1: - vld1.32 {d4[0]}, [r1], r2 - vld1.32 {d4[1]}, [r1], r2 - vrev64.32 d5, d4 - vld1.32 {d5[1]}, [r1] - vext.8 q3, q2, q2, #1 - vtrn.16 q2, q3 - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d5, d1 -.ifc \type,avg - vld1.16 {d18[0]}, [r0,:16], r2 - vld1.16 {d18[1]}, [r0,:16] - sub r0, r0, r2 -.endif - vtrn.32 d16, d17 - vadd.i16 d16, d16, d17 - vrshrn.u16 d16, q8, #6 -.ifc \type,avg - vrhadd.u8 d16, d16, d18 -.endif - vst1.16 {d16[0]}, [r0,:16], r2 - vst1.16 {d16[1]}, [r0,:16], r2 - subs r3, r3, #2 - bgt 1b - pop {r4-r6, pc} -2: -.ifc \type,put - ldrh r5, [r1], r2 - strh r5, [r0], r2 - ldrh r6, [r1], r2 - strh r6, [r0], r2 -.else - vld1.16 {d16[0]}, [r1], r2 - vld1.16 {d16[1]}, [r1], r2 - vld1.16 {d18[0]}, [r0,:16], r2 - vld1.16 {d18[1]}, [r0,:16] - sub r0, r0, r2 - vrhadd.u8 d16, d16, d18 - vst1.16 {d16[0]}, [r0,:16], r2 - vst1.16 {d16[1]}, [r0,:16], r2 -.endif - subs 
r3, r3, #2 - bgt 2b - pop {r4-r6, pc} -endfunc -.endm - - .text - .align - - h264_chroma_mc8 put - h264_chroma_mc8 avg - h264_chroma_mc4 put - h264_chroma_mc4 avg - h264_chroma_mc2 put - h264_chroma_mc2 avg - - /* H.264 loop filter */ - - .macro h264_loop_filter_start - ldr ip, [sp] - tst r2, r2 - ldr ip, [ip] - tstne r3, r3 - vmov.32 d24[0], ip - and ip, ip, ip, lsl #16 - bxeq lr - ands ip, ip, ip, lsl #8 - bxlt lr - .endm - - .macro align_push_regs - and ip, sp, #15 - add ip, ip, #32 - sub sp, sp, ip - vst1.64 {d12-d15}, [sp,:128] - sub sp, sp, #32 - vst1.64 {d8-d11}, [sp,:128] - .endm - - .macro align_pop_regs - vld1.64 {d8-d11}, [sp,:128]! - vld1.64 {d12-d15}, [sp,:128], ip - .endm - - .macro h264_loop_filter_luma - vdup.8 q11, r2 @ alpha - vmovl.u8 q12, d24 - vabd.u8 q6, q8, q0 @ abs(p0 - q0) - vmovl.u16 q12, d24 - vabd.u8 q14, q9, q8 @ abs(p1 - p0) - vsli.16 q12, q12, #8 - vabd.u8 q15, q1, q0 @ abs(q1 - q0) - vsli.32 q12, q12, #16 - vclt.u8 q6, q6, q11 @ < alpha - vdup.8 q11, r3 @ beta - vclt.s8 q7, q12, #0 - vclt.u8 q14, q14, q11 @ < beta - vclt.u8 q15, q15, q11 @ < beta - vbic q6, q6, q7 - vabd.u8 q4, q10, q8 @ abs(p2 - p0) - vand q6, q6, q14 - vabd.u8 q5, q2, q0 @ abs(q2 - q0) - vclt.u8 q4, q4, q11 @ < beta - vand q6, q6, q15 - vclt.u8 q5, q5, q11 @ < beta - vand q4, q4, q6 - vand q5, q5, q6 - vand q12, q12, q6 - vrhadd.u8 q14, q8, q0 - vsub.i8 q6, q12, q4 - vqadd.u8 q7, q9, q12 - vhadd.u8 q10, q10, q14 - vsub.i8 q6, q6, q5 - vhadd.u8 q14, q2, q14 - vmin.u8 q7, q7, q10 - vqsub.u8 q11, q9, q12 - vqadd.u8 q2, q1, q12 - vmax.u8 q7, q7, q11 - vqsub.u8 q11, q1, q12 - vmin.u8 q14, q2, q14 - vmovl.u8 q2, d0 - vmax.u8 q14, q14, q11 - vmovl.u8 q10, d1 - vsubw.u8 q2, q2, d16 - vsubw.u8 q10, q10, d17 - vshl.i16 q2, q2, #2 - vshl.i16 q10, q10, #2 - vaddw.u8 q2, q2, d18 - vaddw.u8 q10, q10, d19 - vsubw.u8 q2, q2, d2 - vsubw.u8 q10, q10, d3 - vrshrn.i16 d4, q2, #3 - vrshrn.i16 d5, q10, #3 - vbsl q4, q7, q9 - vbsl q5, q14, q1 - vneg.s8 q7, q6 - vmovl.u8 q14, d16 - 
vmin.s8 q2, q2, q6 - vmovl.u8 q6, d17 - vmax.s8 q2, q2, q7 - vmovl.u8 q11, d0 - vmovl.u8 q12, d1 - vaddw.s8 q14, q14, d4 - vaddw.s8 q6, q6, d5 - vsubw.s8 q11, q11, d4 - vsubw.s8 q12, q12, d5 - vqmovun.s16 d16, q14 - vqmovun.s16 d17, q6 - vqmovun.s16 d0, q11 - vqmovun.s16 d1, q12 - .endm - -function ff_h264_v_loop_filter_luma_neon, export=1 - h264_loop_filter_start - - vld1.64 {d0, d1}, [r0,:128], r1 - vld1.64 {d2, d3}, [r0,:128], r1 - vld1.64 {d4, d5}, [r0,:128], r1 - sub r0, r0, r1, lsl #2 - sub r0, r0, r1, lsl #1 - vld1.64 {d20,d21}, [r0,:128], r1 - vld1.64 {d18,d19}, [r0,:128], r1 - vld1.64 {d16,d17}, [r0,:128], r1 - - align_push_regs - - h264_loop_filter_luma - - sub r0, r0, r1, lsl #1 - vst1.64 {d8, d9}, [r0,:128], r1 - vst1.64 {d16,d17}, [r0,:128], r1 - vst1.64 {d0, d1}, [r0,:128], r1 - vst1.64 {d10,d11}, [r0,:128] - - align_pop_regs - bx lr -endfunc - -function ff_h264_h_loop_filter_luma_neon, export=1 - h264_loop_filter_start - - sub r0, r0, #4 - vld1.64 {d6}, [r0], r1 - vld1.64 {d20}, [r0], r1 - vld1.64 {d18}, [r0], r1 - vld1.64 {d16}, [r0], r1 - vld1.64 {d0}, [r0], r1 - vld1.64 {d2}, [r0], r1 - vld1.64 {d4}, [r0], r1 - vld1.64 {d26}, [r0], r1 - vld1.64 {d7}, [r0], r1 - vld1.64 {d21}, [r0], r1 - vld1.64 {d19}, [r0], r1 - vld1.64 {d17}, [r0], r1 - vld1.64 {d1}, [r0], r1 - vld1.64 {d3}, [r0], r1 - vld1.64 {d5}, [r0], r1 - vld1.64 {d27}, [r0], r1 - - transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 - - align_push_regs - - h264_loop_filter_luma - - transpose_4x4 q4, q8, q0, q5 - - sub r0, r0, r1, lsl #4 - add r0, r0, #2 - vst1.32 {d8[0]}, [r0], r1 - vst1.32 {d16[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d10[0]}, [r0], r1 - vst1.32 {d8[1]}, [r0], r1 - vst1.32 {d16[1]}, [r0], r1 - vst1.32 {d0[1]}, [r0], r1 - vst1.32 {d10[1]}, [r0], r1 - vst1.32 {d9[0]}, [r0], r1 - vst1.32 {d17[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - vst1.32 {d11[0]}, [r0], r1 - vst1.32 {d9[1]}, [r0], r1 - vst1.32 {d17[1]}, [r0], r1 - vst1.32 {d1[1]}, [r0], r1 - vst1.32 {d11[1]}, 
[r0], r1 - - align_pop_regs - bx lr -endfunc - - .macro h264_loop_filter_chroma - vdup.8 d22, r2 @ alpha - vmovl.u8 q12, d24 - vabd.u8 d26, d16, d0 @ abs(p0 - q0) - vmovl.u8 q2, d0 - vabd.u8 d28, d18, d16 @ abs(p1 - p0) - vsubw.u8 q2, q2, d16 - vsli.16 d24, d24, #8 - vshl.i16 q2, q2, #2 - vabd.u8 d30, d2, d0 @ abs(q1 - q0) - vaddw.u8 q2, q2, d18 - vclt.u8 d26, d26, d22 @ < alpha - vsubw.u8 q2, q2, d2 - vdup.8 d22, r3 @ beta - vclt.s8 d25, d24, #0 - vrshrn.i16 d4, q2, #3 - vclt.u8 d28, d28, d22 @ < beta - vbic d26, d26, d25 - vclt.u8 d30, d30, d22 @ < beta - vand d26, d26, d28 - vneg.s8 d25, d24 - vand d26, d26, d30 - vmin.s8 d4, d4, d24 - vmovl.u8 q14, d16 - vand d4, d4, d26 - vmax.s8 d4, d4, d25 - vmovl.u8 q11, d0 - vaddw.s8 q14, q14, d4 - vsubw.s8 q11, q11, d4 - vqmovun.s16 d16, q14 - vqmovun.s16 d0, q11 - .endm - -function ff_h264_v_loop_filter_chroma_neon, export=1 - h264_loop_filter_start - - sub r0, r0, r1, lsl #1 - vld1.64 {d18}, [r0,:64], r1 - vld1.64 {d16}, [r0,:64], r1 - vld1.64 {d0}, [r0,:64], r1 - vld1.64 {d2}, [r0,:64] - - h264_loop_filter_chroma - - sub r0, r0, r1, lsl #1 - vst1.64 {d16}, [r0,:64], r1 - vst1.64 {d0}, [r0,:64], r1 - - bx lr -endfunc - -function ff_h264_h_loop_filter_chroma_neon, export=1 - h264_loop_filter_start - - sub r0, r0, #2 - vld1.32 {d18[0]}, [r0], r1 - vld1.32 {d16[0]}, [r0], r1 - vld1.32 {d0[0]}, [r0], r1 - vld1.32 {d2[0]}, [r0], r1 - vld1.32 {d18[1]}, [r0], r1 - vld1.32 {d16[1]}, [r0], r1 - vld1.32 {d0[1]}, [r0], r1 - vld1.32 {d2[1]}, [r0], r1 - - vtrn.16 d18, d0 - vtrn.16 d16, d2 - vtrn.8 d18, d16 - vtrn.8 d0, d2 - - h264_loop_filter_chroma - - vtrn.16 d18, d0 - vtrn.16 d16, d2 - vtrn.8 d18, d16 - vtrn.8 d0, d2 - - sub r0, r0, r1, lsl #3 - vst1.32 {d18[0]}, [r0], r1 - vst1.32 {d16[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d2[0]}, [r0], r1 - vst1.32 {d18[1]}, [r0], r1 - vst1.32 {d16[1]}, [r0], r1 - vst1.32 {d0[1]}, [r0], r1 - vst1.32 {d2[1]}, [r0], r1 - - bx lr -endfunc - - /* H.264 qpel MC */ - - .macro 
lowpass_const r - movw \r, #5 - movt \r, #20 - vmov.32 d6[0], \r - .endm - - .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 -.if \narrow - t0 .req q0 - t1 .req q8 -.else - t0 .req \d0 - t1 .req \d1 -.endif - vext.8 d2, \r0, \r1, #2 - vext.8 d3, \r0, \r1, #3 - vaddl.u8 q1, d2, d3 - vext.8 d4, \r0, \r1, #1 - vext.8 d5, \r0, \r1, #4 - vaddl.u8 q2, d4, d5 - vext.8 d30, \r0, \r1, #5 - vaddl.u8 t0, \r0, d30 - vext.8 d18, \r2, \r3, #2 - vmla.i16 t0, q1, d6[1] - vext.8 d19, \r2, \r3, #3 - vaddl.u8 q9, d18, d19 - vext.8 d20, \r2, \r3, #1 - vmls.i16 t0, q2, d6[0] - vext.8 d21, \r2, \r3, #4 - vaddl.u8 q10, d20, d21 - vext.8 d31, \r2, \r3, #5 - vaddl.u8 t1, \r2, d31 - vmla.i16 t1, q9, d6[1] - vmls.i16 t1, q10, d6[0] -.if \narrow - vqrshrun.s16 \d0, t0, #5 - vqrshrun.s16 \d1, t1, #5 -.endif - .unreq t0 - .unreq t1 - .endm - - .macro lowpass_8_1 r0, r1, d0, narrow=1 -.if \narrow - t0 .req q0 -.else - t0 .req \d0 -.endif - vext.8 d2, \r0, \r1, #2 - vext.8 d3, \r0, \r1, #3 - vaddl.u8 q1, d2, d3 - vext.8 d4, \r0, \r1, #1 - vext.8 d5, \r0, \r1, #4 - vaddl.u8 q2, d4, d5 - vext.8 d30, \r0, \r1, #5 - vaddl.u8 t0, \r0, d30 - vmla.i16 t0, q1, d6[1] - vmls.i16 t0, q2, d6[0] -.if \narrow - vqrshrun.s16 \d0, t0, #5 -.endif - .unreq t0 - .endm - - .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d - vext.16 q1, \r0, \r1, #2 - vext.16 q0, \r0, \r1, #3 - vaddl.s16 q9, d2, d0 - vext.16 q2, \r0, \r1, #1 - vaddl.s16 q1, d3, d1 - vext.16 q3, \r0, \r1, #4 - vaddl.s16 q10, d4, d6 - vext.16 \r1, \r0, \r1, #5 - vaddl.s16 q2, d5, d7 - vaddl.s16 q0, \h0, \h1 - vaddl.s16 q8, \l0, \l1 - - vshl.i32 q3, q9, #4 - vshl.i32 q9, q9, #2 - vshl.i32 q15, q10, #2 - vadd.i32 q9, q9, q3 - vadd.i32 q10, q10, q15 - - vshl.i32 q3, q1, #4 - vshl.i32 q1, q1, #2 - vshl.i32 q15, q2, #2 - vadd.i32 q1, q1, q3 - vadd.i32 q2, q2, q15 - - vadd.i32 q9, q9, q8 - vsub.i32 q9, q9, q10 - - vadd.i32 q1, q1, q0 - vsub.i32 q1, q1, q2 - - vrshrn.s32 d18, q9, #10 - vrshrn.s32 d19, q1, #10 - - vqmovun.s16 \d, q9 - .endm - -function 
put_h264_qpel16_h_lowpass_neon_packed - mov r4, lr - mov ip, #16 - mov r3, #8 - bl put_h264_qpel8_h_lowpass_neon - sub r1, r1, r2, lsl #4 - add r1, r1, #8 - mov ip, #16 - mov lr, r4 - b put_h264_qpel8_h_lowpass_neon -endfunc - - .macro h264_qpel_h_lowpass type -function \type\()_h264_qpel16_h_lowpass_neon - push {lr} - mov ip, #16 - bl \type\()_h264_qpel8_h_lowpass_neon - sub r0, r0, r3, lsl #4 - sub r1, r1, r2, lsl #4 - add r0, r0, #8 - add r1, r1, #8 - mov ip, #16 - pop {lr} -endfunc - -function \type\()_h264_qpel8_h_lowpass_neon -1: vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d16,d17}, [r1], r2 - subs ip, ip, #2 - lowpass_8 d0, d1, d16, d17, d0, d16 -.ifc \type,avg - vld1.8 {d2}, [r0,:64], r3 - vrhadd.u8 d0, d0, d2 - vld1.8 {d3}, [r0,:64] - vrhadd.u8 d16, d16, d3 - sub r0, r0, r3 -.endif - vst1.64 {d0}, [r0,:64], r3 - vst1.64 {d16}, [r0,:64], r3 - bne 1b - bx lr -endfunc - .endm - - h264_qpel_h_lowpass put - h264_qpel_h_lowpass avg - - .macro h264_qpel_h_lowpass_l2 type -function \type\()_h264_qpel16_h_lowpass_l2_neon - push {lr} - mov ip, #16 - bl \type\()_h264_qpel8_h_lowpass_l2_neon - sub r0, r0, r2, lsl #4 - sub r1, r1, r2, lsl #4 - sub r3, r3, r2, lsl #4 - add r0, r0, #8 - add r1, r1, #8 - add r3, r3, #8 - mov ip, #16 - pop {lr} -endfunc - -function \type\()_h264_qpel8_h_lowpass_l2_neon -1: vld1.64 {d0, d1}, [r1], r2 - vld1.64 {d16,d17}, [r1], r2 - vld1.64 {d28}, [r3], r2 - vld1.64 {d29}, [r3], r2 - subs ip, ip, #2 - lowpass_8 d0, d1, d16, d17, d0, d1 - vrhadd.u8 q0, q0, q14 -.ifc \type,avg - vld1.8 {d2}, [r0,:64], r2 - vrhadd.u8 d0, d0, d2 - vld1.8 {d3}, [r0,:64] - vrhadd.u8 d1, d1, d3 - sub r0, r0, r2 -.endif - vst1.64 {d0}, [r0,:64], r2 - vst1.64 {d1}, [r0,:64], r2 - bne 1b - bx lr -endfunc - .endm - - h264_qpel_h_lowpass_l2 put - h264_qpel_h_lowpass_l2 avg - -function put_h264_qpel16_v_lowpass_neon_packed - mov r4, lr - mov r2, #8 - bl put_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - bl put_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #4 - sub r1, 
r1, r3, lsl #2 - add r1, r1, #8 - bl put_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - mov lr, r4 - b put_h264_qpel8_v_lowpass_neon -endfunc - - .macro h264_qpel_v_lowpass type -function \type\()_h264_qpel16_v_lowpass_neon - mov r4, lr - bl \type\()_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - bl \type\()_h264_qpel8_v_lowpass_neon - sub r0, r0, r2, lsl #4 - add r0, r0, #8 - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - bl \type\()_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - mov lr, r4 -endfunc - -function \type\()_h264_qpel8_v_lowpass_neon - vld1.64 {d8}, [r1], r3 - vld1.64 {d10}, [r1], r3 - vld1.64 {d12}, [r1], r3 - vld1.64 {d14}, [r1], r3 - vld1.64 {d22}, [r1], r3 - vld1.64 {d24}, [r1], r3 - vld1.64 {d26}, [r1], r3 - vld1.64 {d28}, [r1], r3 - vld1.64 {d9}, [r1], r3 - vld1.64 {d11}, [r1], r3 - vld1.64 {d13}, [r1], r3 - vld1.64 {d15}, [r1], r3 - vld1.64 {d23}, [r1] - - transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 - lowpass_8 d8, d9, d10, d11, d8, d10 - lowpass_8 d12, d13, d14, d15, d12, d14 - lowpass_8 d22, d23, d24, d25, d22, d24 - lowpass_8 d26, d27, d28, d29, d26, d28 - transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 - -.ifc \type,avg - vld1.8 {d9}, [r0,:64], r2 - vrhadd.u8 d8, d8, d9 - vld1.8 {d11}, [r0,:64], r2 - vrhadd.u8 d10, d10, d11 - vld1.8 {d13}, [r0,:64], r2 - vrhadd.u8 d12, d12, d13 - vld1.8 {d15}, [r0,:64], r2 - vrhadd.u8 d14, d14, d15 - vld1.8 {d23}, [r0,:64], r2 - vrhadd.u8 d22, d22, d23 - vld1.8 {d25}, [r0,:64], r2 - vrhadd.u8 d24, d24, d25 - vld1.8 {d27}, [r0,:64], r2 - vrhadd.u8 d26, d26, d27 - vld1.8 {d29}, [r0,:64], r2 - vrhadd.u8 d28, d28, d29 - sub r0, r0, r2, lsl #3 -.endif - - vst1.64 {d8}, [r0,:64], r2 - vst1.64 {d10}, [r0,:64], r2 - vst1.64 {d12}, [r0,:64], r2 - vst1.64 {d14}, [r0,:64], r2 - vst1.64 {d22}, [r0,:64], r2 - vst1.64 {d24}, [r0,:64], r2 - vst1.64 {d26}, [r0,:64], r2 - vst1.64 {d28}, [r0,:64], r2 - - bx lr -endfunc - .endm - - h264_qpel_v_lowpass put - h264_qpel_v_lowpass avg 
- - .macro h264_qpel_v_lowpass_l2 type -function \type\()_h264_qpel16_v_lowpass_l2_neon - mov r4, lr - bl \type\()_h264_qpel8_v_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - bl \type\()_h264_qpel8_v_lowpass_l2_neon - sub r0, r0, r3, lsl #4 - sub ip, ip, r2, lsl #4 - add r0, r0, #8 - add ip, ip, #8 - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - bl \type\()_h264_qpel8_v_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - mov lr, r4 -endfunc - -function \type\()_h264_qpel8_v_lowpass_l2_neon - vld1.64 {d8}, [r1], r3 - vld1.64 {d10}, [r1], r3 - vld1.64 {d12}, [r1], r3 - vld1.64 {d14}, [r1], r3 - vld1.64 {d22}, [r1], r3 - vld1.64 {d24}, [r1], r3 - vld1.64 {d26}, [r1], r3 - vld1.64 {d28}, [r1], r3 - vld1.64 {d9}, [r1], r3 - vld1.64 {d11}, [r1], r3 - vld1.64 {d13}, [r1], r3 - vld1.64 {d15}, [r1], r3 - vld1.64 {d23}, [r1] - - transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 - lowpass_8 d8, d9, d10, d11, d8, d9 - lowpass_8 d12, d13, d14, d15, d12, d13 - lowpass_8 d22, d23, d24, d25, d22, d23 - lowpass_8 d26, d27, d28, d29, d26, d27 - transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 - - vld1.64 {d0}, [ip], r2 - vld1.64 {d1}, [ip], r2 - vld1.64 {d2}, [ip], r2 - vld1.64 {d3}, [ip], r2 - vld1.64 {d4}, [ip], r2 - vrhadd.u8 q0, q0, q4 - vld1.64 {d5}, [ip], r2 - vrhadd.u8 q1, q1, q6 - vld1.64 {d10}, [ip], r2 - vrhadd.u8 q2, q2, q11 - vld1.64 {d11}, [ip], r2 - vrhadd.u8 q5, q5, q13 - -.ifc \type,avg - vld1.8 {d16}, [r0,:64], r3 - vrhadd.u8 d0, d0, d16 - vld1.8 {d17}, [r0,:64], r3 - vrhadd.u8 d1, d1, d17 - vld1.8 {d16}, [r0,:64], r3 - vrhadd.u8 d2, d2, d16 - vld1.8 {d17}, [r0,:64], r3 - vrhadd.u8 d3, d3, d17 - vld1.8 {d16}, [r0,:64], r3 - vrhadd.u8 d4, d4, d16 - vld1.8 {d17}, [r0,:64], r3 - vrhadd.u8 d5, d5, d17 - vld1.8 {d16}, [r0,:64], r3 - vrhadd.u8 d10, d10, d16 - vld1.8 {d17}, [r0,:64], r3 - vrhadd.u8 d11, d11, d17 - sub r0, r0, r3, lsl #3 -.endif - - vst1.64 {d0}, [r0,:64], r3 - vst1.64 {d1}, [r0,:64], r3 - vst1.64 {d2}, [r0,:64], r3 - vst1.64 {d3}, [r0,:64], r3 - 
vst1.64 {d4}, [r0,:64], r3 - vst1.64 {d5}, [r0,:64], r3 - vst1.64 {d10}, [r0,:64], r3 - vst1.64 {d11}, [r0,:64], r3 - - bx lr -endfunc - .endm - - h264_qpel_v_lowpass_l2 put - h264_qpel_v_lowpass_l2 avg - -function put_h264_qpel8_hv_lowpass_neon_top - lowpass_const ip - mov ip, #12 -1: vld1.64 {d0, d1}, [r1], r3 - vld1.64 {d16,d17}, [r1], r3 - subs ip, ip, #2 - lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 - vst1.64 {d22-d25}, [r4,:128]! - bne 1b - - vld1.64 {d0, d1}, [r1] - lowpass_8_1 d0, d1, q12, narrow=0 - - mov ip, #-16 - add r4, r4, ip - vld1.64 {d30,d31}, [r4,:128], ip - vld1.64 {d20,d21}, [r4,:128], ip - vld1.64 {d18,d19}, [r4,:128], ip - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d14,d15}, [r4,:128], ip - vld1.64 {d12,d13}, [r4,:128], ip - vld1.64 {d10,d11}, [r4,:128], ip - vld1.64 {d8, d9}, [r4,:128], ip - vld1.64 {d6, d7}, [r4,:128], ip - vld1.64 {d4, d5}, [r4,:128], ip - vld1.64 {d2, d3}, [r4,:128], ip - vld1.64 {d0, d1}, [r4,:128] - - swap4 d1, d3, d5, d7, d8, d10, d12, d14 - transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 - - swap4 d17, d19, d21, d31, d24, d26, d28, d22 - transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 - - vst1.64 {d30,d31}, [r4,:128]! - vst1.64 {d6, d7}, [r4,:128]! - vst1.64 {d20,d21}, [r4,:128]! - vst1.64 {d4, d5}, [r4,:128]! - vst1.64 {d18,d19}, [r4,:128]! - vst1.64 {d2, d3}, [r4,:128]! - vst1.64 {d16,d17}, [r4,:128]! 
- vst1.64 {d0, d1}, [r4,:128] - - lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 - lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 - lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 - lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 - - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128], ip - lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128], ip - lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128], ip - lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 - vld1.64 {d16,d17}, [r4,:128], ip - vld1.64 {d30,d31}, [r4,:128] - lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 - - transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 - - bx lr -endfunc - - .macro h264_qpel8_hv_lowpass type -function \type\()_h264_qpel8_hv_lowpass_neon - mov r10, lr - bl put_h264_qpel8_hv_lowpass_neon_top -.ifc \type,avg - vld1.8 {d0}, [r0,:64], r2 - vrhadd.u8 d12, d12, d0 - vld1.8 {d1}, [r0,:64], r2 - vrhadd.u8 d13, d13, d1 - vld1.8 {d2}, [r0,:64], r2 - vrhadd.u8 d14, d14, d2 - vld1.8 {d3}, [r0,:64], r2 - vrhadd.u8 d15, d15, d3 - vld1.8 {d4}, [r0,:64], r2 - vrhadd.u8 d8, d8, d4 - vld1.8 {d5}, [r0,:64], r2 - vrhadd.u8 d9, d9, d5 - vld1.8 {d6}, [r0,:64], r2 - vrhadd.u8 d10, d10, d6 - vld1.8 {d7}, [r0,:64], r2 - vrhadd.u8 d11, d11, d7 - sub r0, r0, r2, lsl #3 -.endif - vst1.64 {d12}, [r0,:64], r2 - vst1.64 {d13}, [r0,:64], r2 - vst1.64 {d14}, [r0,:64], r2 - vst1.64 {d15}, [r0,:64], r2 - vst1.64 {d8}, [r0,:64], r2 - vst1.64 {d9}, [r0,:64], r2 - vst1.64 {d10}, [r0,:64], r2 - vst1.64 {d11}, [r0,:64], r2 - - mov lr, r10 - bx lr -endfunc - .endm - - h264_qpel8_hv_lowpass put - h264_qpel8_hv_lowpass avg - - .macro h264_qpel8_hv_lowpass_l2 type -function \type\()_h264_qpel8_hv_lowpass_l2_neon - mov r10, lr - bl put_h264_qpel8_hv_lowpass_neon_top - - vld1.64 {d0, d1}, [r2,:128]! - vld1.64 {d2, d3}, [r2,:128]! - vrhadd.u8 q0, q0, q6 - vld1.64 {d4, d5}, [r2,:128]! 
- vrhadd.u8 q1, q1, q7 - vld1.64 {d6, d7}, [r2,:128]! - vrhadd.u8 q2, q2, q4 - vrhadd.u8 q3, q3, q5 -.ifc \type,avg - vld1.8 {d16}, [r0,:64], r3 - vrhadd.u8 d0, d0, d16 - vld1.8 {d17}, [r0,:64], r3 - vrhadd.u8 d1, d1, d17 - vld1.8 {d18}, [r0,:64], r3 - vrhadd.u8 d2, d2, d18 - vld1.8 {d19}, [r0,:64], r3 - vrhadd.u8 d3, d3, d19 - vld1.8 {d20}, [r0,:64], r3 - vrhadd.u8 d4, d4, d20 - vld1.8 {d21}, [r0,:64], r3 - vrhadd.u8 d5, d5, d21 - vld1.8 {d22}, [r0,:64], r3 - vrhadd.u8 d6, d6, d22 - vld1.8 {d23}, [r0,:64], r3 - vrhadd.u8 d7, d7, d23 - sub r0, r0, r3, lsl #3 -.endif - vst1.64 {d0}, [r0,:64], r3 - vst1.64 {d1}, [r0,:64], r3 - vst1.64 {d2}, [r0,:64], r3 - vst1.64 {d3}, [r0,:64], r3 - vst1.64 {d4}, [r0,:64], r3 - vst1.64 {d5}, [r0,:64], r3 - vst1.64 {d6}, [r0,:64], r3 - vst1.64 {d7}, [r0,:64], r3 - - mov lr, r10 - bx lr -endfunc - .endm - - h264_qpel8_hv_lowpass_l2 put - h264_qpel8_hv_lowpass_l2 avg - - .macro h264_qpel16_hv type -function \type\()_h264_qpel16_hv_lowpass_neon - mov r9, lr - bl \type\()_h264_qpel8_hv_lowpass_neon - sub r1, r1, r3, lsl #2 - bl \type\()_h264_qpel8_hv_lowpass_neon - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - sub r0, r0, r2, lsl #4 - add r0, r0, #8 - bl \type\()_h264_qpel8_hv_lowpass_neon - sub r1, r1, r3, lsl #2 - mov lr, r9 - b \type\()_h264_qpel8_hv_lowpass_neon -endfunc - -function \type\()_h264_qpel16_hv_lowpass_l2_neon - mov r9, lr - sub r2, r4, #256 - bl \type\()_h264_qpel8_hv_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - bl \type\()_h264_qpel8_hv_lowpass_l2_neon - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - sub r0, r0, r3, lsl #4 - add r0, r0, #8 - bl \type\()_h264_qpel8_hv_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - mov lr, r9 - b \type\()_h264_qpel8_hv_lowpass_l2_neon -endfunc - .endm - - h264_qpel16_hv put - h264_qpel16_hv avg - - .macro h264_qpel8 type -function ff_\type\()_h264_qpel8_mc10_neon, export=1 - lowpass_const r3 - mov r3, r1 - sub r1, r1, #2 - mov ip, #8 - b 
\type\()_h264_qpel8_h_lowpass_l2_neon -endfunc - -function ff_\type\()_h264_qpel8_mc20_neon, export=1 - lowpass_const r3 - sub r1, r1, #2 - mov r3, r2 - mov ip, #8 - b \type\()_h264_qpel8_h_lowpass_neon -endfunc - -function ff_\type\()_h264_qpel8_mc30_neon, export=1 - lowpass_const r3 - add r3, r1, #1 - sub r1, r1, #2 - mov ip, #8 - b \type\()_h264_qpel8_h_lowpass_l2_neon -endfunc - -function ff_\type\()_h264_qpel8_mc01_neon, export=1 - push {lr} - mov ip, r1 -\type\()_h264_qpel8_mc01: - lowpass_const r3 - mov r3, r2 - sub r1, r1, r2, lsl #1 - vpush {d8-d15} - bl \type\()_h264_qpel8_v_lowpass_l2_neon - vpop {d8-d15} - pop {pc} -endfunc - -function ff_\type\()_h264_qpel8_mc11_neon, export=1 - push {r0, r1, r11, lr} -\type\()_h264_qpel8_mc11: - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub sp, sp, #64 - mov r0, sp - sub r1, r1, #2 - mov r3, #8 - mov ip, #8 - vpush {d8-d15} - bl put_h264_qpel8_h_lowpass_neon - ldrd r0, [r11] - mov r3, r2 - add ip, sp, #64 - sub r1, r1, r2, lsl #1 - mov r2, #8 - bl \type\()_h264_qpel8_v_lowpass_l2_neon - vpop {d8-d15} - add sp, r11, #8 - pop {r11, pc} -endfunc - -function ff_\type\()_h264_qpel8_mc21_neon, export=1 - push {r0, r1, r4, r10, r11, lr} -\type\()_h264_qpel8_mc21: - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub sp, sp, #(8*8+16*12) - sub r1, r1, #2 - mov r3, #8 - mov r0, sp - mov ip, #8 - vpush {d8-d15} - bl put_h264_qpel8_h_lowpass_neon - mov r4, r0 - ldrd r0, [r11] - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - sub r2, r4, #64 - bl \type\()_h264_qpel8_hv_lowpass_l2_neon - vpop {d8-d15} - add sp, r11, #8 - pop {r4, r10, r11, pc} -endfunc - -function ff_\type\()_h264_qpel8_mc31_neon, export=1 - add r1, r1, #1 - push {r0, r1, r11, lr} - sub r1, r1, #1 - b \type\()_h264_qpel8_mc11 -endfunc - -function ff_\type\()_h264_qpel8_mc02_neon, export=1 - push {lr} - lowpass_const r3 - sub r1, r1, r2, lsl #1 - mov r3, r2 - vpush {d8-d15} - bl \type\()_h264_qpel8_v_lowpass_neon - vpop {d8-d15} - pop {pc} 
-endfunc - -function ff_\type\()_h264_qpel8_mc12_neon, export=1 - push {r0, r1, r4, r10, r11, lr} -\type\()_h264_qpel8_mc12: - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub sp, sp, #(8*8+16*12) - sub r1, r1, r2, lsl #1 - mov r3, r2 - mov r2, #8 - mov r0, sp - vpush {d8-d15} - bl put_h264_qpel8_v_lowpass_neon - mov r4, r0 - ldrd r0, [r11] - sub r1, r1, r3, lsl #1 - sub r1, r1, #2 - sub r2, r4, #64 - bl \type\()_h264_qpel8_hv_lowpass_l2_neon - vpop {d8-d15} - add sp, r11, #8 - pop {r4, r10, r11, pc} -endfunc - -function ff_\type\()_h264_qpel8_mc22_neon, export=1 - push {r4, r10, r11, lr} - mov r11, sp - bic sp, sp, #15 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - sub sp, sp, #(16*12) - mov r4, sp - vpush {d8-d15} - bl \type\()_h264_qpel8_hv_lowpass_neon - vpop {d8-d15} - mov sp, r11 - pop {r4, r10, r11, pc} -endfunc - -function ff_\type\()_h264_qpel8_mc32_neon, export=1 - push {r0, r1, r4, r10, r11, lr} - add r1, r1, #1 - b \type\()_h264_qpel8_mc12 -endfunc - -function ff_\type\()_h264_qpel8_mc03_neon, export=1 - push {lr} - add ip, r1, r2 - b \type\()_h264_qpel8_mc01 -endfunc - -function ff_\type\()_h264_qpel8_mc13_neon, export=1 - push {r0, r1, r11, lr} - add r1, r1, r2 - b \type\()_h264_qpel8_mc11 -endfunc - -function ff_\type\()_h264_qpel8_mc23_neon, export=1 - push {r0, r1, r4, r10, r11, lr} - add r1, r1, r2 - b \type\()_h264_qpel8_mc21 -endfunc - -function ff_\type\()_h264_qpel8_mc33_neon, export=1 - add r1, r1, #1 - push {r0, r1, r11, lr} - add r1, r1, r2 - sub r1, r1, #1 - b \type\()_h264_qpel8_mc11 -endfunc - .endm - - h264_qpel8 put - h264_qpel8 avg - - .macro h264_qpel16 type -function ff_\type\()_h264_qpel16_mc10_neon, export=1 - lowpass_const r3 - mov r3, r1 - sub r1, r1, #2 - b \type\()_h264_qpel16_h_lowpass_l2_neon -endfunc - -function ff_\type\()_h264_qpel16_mc20_neon, export=1 - lowpass_const r3 - sub r1, r1, #2 - mov r3, r2 - b \type\()_h264_qpel16_h_lowpass_neon -endfunc - -function ff_\type\()_h264_qpel16_mc30_neon, export=1 - 
lowpass_const r3 - add r3, r1, #1 - sub r1, r1, #2 - b \type\()_h264_qpel16_h_lowpass_l2_neon -endfunc - -function ff_\type\()_h264_qpel16_mc01_neon, export=1 - push {r4, lr} - mov ip, r1 -\type\()_h264_qpel16_mc01: - lowpass_const r3 - mov r3, r2 - sub r1, r1, r2, lsl #1 - vpush {d8-d15} - bl \type\()_h264_qpel16_v_lowpass_l2_neon - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc11_neon, export=1 - push {r0, r1, r4, r11, lr} -\type\()_h264_qpel16_mc11: - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub sp, sp, #256 - mov r0, sp - sub r1, r1, #2 - mov r3, #16 - vpush {d8-d15} - bl put_h264_qpel16_h_lowpass_neon - ldrd r0, [r11] - mov r3, r2 - add ip, sp, #64 - sub r1, r1, r2, lsl #1 - mov r2, #16 - bl \type\()_h264_qpel16_v_lowpass_l2_neon - vpop {d8-d15} - add sp, r11, #8 - pop {r4, r11, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc21_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} -\type\()_h264_qpel16_mc21: - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub sp, sp, #(16*16+16*12) - sub r1, r1, #2 - mov r0, sp - vpush {d8-d15} - bl put_h264_qpel16_h_lowpass_neon_packed - mov r4, r0 - ldrd r0, [r11] - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - bl \type\()_h264_qpel16_hv_lowpass_l2_neon - vpop {d8-d15} - add sp, r11, #8 - pop {r4-r5, r9-r11, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc31_neon, export=1 - add r1, r1, #1 - push {r0, r1, r4, r11, lr} - sub r1, r1, #1 - b \type\()_h264_qpel16_mc11 -endfunc - -function ff_\type\()_h264_qpel16_mc02_neon, export=1 - push {r4, lr} - lowpass_const r3 - sub r1, r1, r2, lsl #1 - mov r3, r2 - vpush {d8-d15} - bl \type\()_h264_qpel16_v_lowpass_neon - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc12_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} -\type\()_h264_qpel16_mc12: - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub sp, sp, #(16*16+16*12) - sub r1, r1, r2, lsl #1 - mov r0, sp - mov r3, r2 - vpush {d8-d15} - bl 
put_h264_qpel16_v_lowpass_neon_packed - mov r4, r0 - ldrd r0, [r11] - sub r1, r1, r3, lsl #1 - sub r1, r1, #2 - mov r2, r3 - bl \type\()_h264_qpel16_hv_lowpass_l2_neon - vpop {d8-d15} - add sp, r11, #8 - pop {r4-r5, r9-r11, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc22_neon, export=1 - push {r4, r9-r11, lr} - lowpass_const r3 - mov r11, sp - bic sp, sp, #15 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - sub sp, sp, #(16*12) - mov r4, sp - vpush {d8-d15} - bl \type\()_h264_qpel16_hv_lowpass_neon - vpop {d8-d15} - mov sp, r11 - pop {r4, r9-r11, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc32_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} - add r1, r1, #1 - b \type\()_h264_qpel16_mc12 -endfunc - -function ff_\type\()_h264_qpel16_mc03_neon, export=1 - push {r4, lr} - add ip, r1, r2 - b \type\()_h264_qpel16_mc01 -endfunc - -function ff_\type\()_h264_qpel16_mc13_neon, export=1 - push {r0, r1, r4, r11, lr} - add r1, r1, r2 - b \type\()_h264_qpel16_mc11 -endfunc - -function ff_\type\()_h264_qpel16_mc23_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} - add r1, r1, r2 - b \type\()_h264_qpel16_mc21 -endfunc - -function ff_\type\()_h264_qpel16_mc33_neon, export=1 - add r1, r1, #1 - push {r0, r1, r4, r11, lr} - add r1, r1, r2 - sub r1, r1, #1 - b \type\()_h264_qpel16_mc11 -endfunc - .endm - - h264_qpel16 put - h264_qpel16 avg - -@ Biweighted prediction - - .macro biweight_16 macs, macd - vdup.8 d0, r4 - vdup.8 d1, r5 - vmov q2, q8 - vmov q3, q8 -1: subs ip, ip, #2 - vld1.8 {d20-d21},[r0,:128], r2 - \macd q2, d0, d20 - pld [r0] - \macd q3, d0, d21 - vld1.8 {d22-d23},[r1,:128], r2 - \macs q2, d1, d22 - pld [r1] - \macs q3, d1, d23 - vmov q12, q8 - vld1.8 {d28-d29},[r0,:128], r2 - vmov q13, q8 - \macd q12, d0, d28 - pld [r0] - \macd q13, d0, d29 - vld1.8 {d30-d31},[r1,:128], r2 - \macs q12, d1, d30 - pld [r1] - \macs q13, d1, d31 - vshl.s16 q2, q2, q9 - vshl.s16 q3, q3, q9 - vqmovun.s16 d4, q2 - vqmovun.s16 d5, q3 - vshl.s16 q12, q12, q9 - vshl.s16 
q13, q13, q9 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vmov q3, q8 - vst1.8 {d4- d5}, [r6,:128], r2 - vmov q2, q8 - vst1.8 {d24-d25},[r6,:128], r2 - bne 1b - pop {r4-r6, pc} - .endm - - .macro biweight_8 macs, macd - vdup.8 d0, r4 - vdup.8 d1, r5 - vmov q1, q8 - vmov q10, q8 -1: subs ip, ip, #2 - vld1.8 {d4},[r0,:64], r2 - \macd q1, d0, d4 - pld [r0] - vld1.8 {d5},[r1,:64], r2 - \macs q1, d1, d5 - pld [r1] - vld1.8 {d6},[r0,:64], r2 - \macd q10, d0, d6 - pld [r0] - vld1.8 {d7},[r1,:64], r2 - \macs q10, d1, d7 - pld [r1] - vshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - vshl.s16 q10, q10, q9 - vqmovun.s16 d4, q10 - vmov q10, q8 - vst1.8 {d2},[r6,:64], r2 - vmov q1, q8 - vst1.8 {d4},[r6,:64], r2 - bne 1b - pop {r4-r6, pc} - .endm - - .macro biweight_4 macs, macd - vdup.8 d0, r4 - vdup.8 d1, r5 - vmov q1, q8 - vmov q10, q8 -1: subs ip, ip, #4 - vld1.32 {d4[0]},[r0,:32], r2 - vld1.32 {d4[1]},[r0,:32], r2 - \macd q1, d0, d4 - pld [r0] - vld1.32 {d5[0]},[r1,:32], r2 - vld1.32 {d5[1]},[r1,:32], r2 - \macs q1, d1, d5 - pld [r1] - blt 2f - vld1.32 {d6[0]},[r0,:32], r2 - vld1.32 {d6[1]},[r0,:32], r2 - \macd q10, d0, d6 - pld [r0] - vld1.32 {d7[0]},[r1,:32], r2 - vld1.32 {d7[1]},[r1,:32], r2 - \macs q10, d1, d7 - pld [r1] - vshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - vshl.s16 q10, q10, q9 - vqmovun.s16 d4, q10 - vmov q10, q8 - vst1.32 {d2[0]},[r6,:32], r2 - vst1.32 {d2[1]},[r6,:32], r2 - vmov q1, q8 - vst1.32 {d4[0]},[r6,:32], r2 - vst1.32 {d4[1]},[r6,:32], r2 - bne 1b - pop {r4-r6, pc} -2: vshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - vst1.32 {d2[0]},[r6,:32], r2 - vst1.32 {d2[1]},[r6,:32], r2 - pop {r4-r6, pc} - .endm - - .macro biweight_func w -function biweight_h264_pixels_\w\()_neon - push {r4-r6, lr} - add r4, sp, #16 - ldm r4, {r4-r6} - lsr lr, r4, #31 - add r6, r6, #1 - eors lr, lr, r5, lsr #30 - orr r6, r6, #1 - vdup.16 q9, r3 - lsl r6, r6, r3 - vmvn q9, q9 - vdup.16 q8, r6 - mov r6, r0 - beq 10f - subs lr, lr, #1 - beq 20f - subs lr, lr, #1 - beq 30f - b 40f -10: 
biweight_\w vmlal.u8, vmlal.u8 -20: rsb r4, r4, #0 - biweight_\w vmlal.u8, vmlsl.u8 -30: rsb r4, r4, #0 - rsb r5, r5, #0 - biweight_\w vmlsl.u8, vmlsl.u8 -40: rsb r5, r5, #0 - biweight_\w vmlsl.u8, vmlal.u8 -endfunc - .endm - - .macro biweight_entry w, h, b=1 -function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 - mov ip, #\h -.if \b - b biweight_h264_pixels_\w\()_neon -.endif -endfunc - .endm - - biweight_entry 16, 8 - biweight_entry 16, 16, b=0 - biweight_func 16 - - biweight_entry 8, 16 - biweight_entry 8, 4 - biweight_entry 8, 8, b=0 - biweight_func 8 - - biweight_entry 4, 8 - biweight_entry 4, 2 - biweight_entry 4, 4, b=0 - biweight_func 4 - -@ Weighted prediction - - .macro weight_16 add - vdup.8 d0, r3 -1: subs ip, ip, #2 - vld1.8 {d20-d21},[r0,:128], r1 - vmull.u8 q2, d0, d20 - pld [r0] - vmull.u8 q3, d0, d21 - vld1.8 {d28-d29},[r0,:128], r1 - vmull.u8 q12, d0, d28 - pld [r0] - vmull.u8 q13, d0, d29 - \add q2, q8, q2 - vrshl.s16 q2, q2, q9 - \add q3, q8, q3 - vrshl.s16 q3, q3, q9 - vqmovun.s16 d4, q2 - vqmovun.s16 d5, q3 - \add q12, q8, q12 - vrshl.s16 q12, q12, q9 - \add q13, q8, q13 - vrshl.s16 q13, q13, q9 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vst1.8 {d4- d5}, [r4,:128], r1 - vst1.8 {d24-d25},[r4,:128], r1 - bne 1b - pop {r4, pc} - .endm - - .macro weight_8 add - vdup.8 d0, r3 -1: subs ip, ip, #2 - vld1.8 {d4},[r0,:64], r1 - vmull.u8 q1, d0, d4 - pld [r0] - vld1.8 {d6},[r0,:64], r1 - vmull.u8 q10, d0, d6 - \add q1, q8, q1 - pld [r0] - vrshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - \add q10, q8, q10 - vrshl.s16 q10, q10, q9 - vqmovun.s16 d4, q10 - vst1.8 {d2},[r4,:64], r1 - vst1.8 {d4},[r4,:64], r1 - bne 1b - pop {r4, pc} - .endm - - .macro weight_4 add - vdup.8 d0, r3 - vmov q1, q8 - vmov q10, q8 -1: subs ip, ip, #4 - vld1.32 {d4[0]},[r0,:32], r1 - vld1.32 {d4[1]},[r0,:32], r1 - vmull.u8 q1, d0, d4 - pld [r0] - blt 2f - vld1.32 {d6[0]},[r0,:32], r1 - vld1.32 {d6[1]},[r0,:32], r1 - vmull.u8 q10, d0, d6 - pld [r0] - \add q1, q8, q1 - 
vrshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - \add q10, q8, q10 - vrshl.s16 q10, q10, q9 - vqmovun.s16 d4, q10 - vmov q10, q8 - vst1.32 {d2[0]},[r4,:32], r1 - vst1.32 {d2[1]},[r4,:32], r1 - vmov q1, q8 - vst1.32 {d4[0]},[r4,:32], r1 - vst1.32 {d4[1]},[r4,:32], r1 - bne 1b - pop {r4, pc} -2: \add q1, q8, q1 - vrshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - vst1.32 {d2[0]},[r4,:32], r1 - vst1.32 {d2[1]},[r4,:32], r1 - pop {r4, pc} - .endm - - .macro weight_func w -function weight_h264_pixels_\w\()_neon - push {r4, lr} - ldr r4, [sp, #8] - cmp r2, #1 - lsl r4, r4, r2 - vdup.16 q8, r4 - mov r4, r0 - ble 20f - rsb lr, r2, #1 - vdup.16 q9, lr - cmp r3, #0 - blt 10f - weight_\w vhadd.s16 -10: rsb r3, r3, #0 - weight_\w vhsub.s16 -20: rsb lr, r2, #0 - vdup.16 q9, lr - cmp r3, #0 - blt 10f - weight_\w vadd.s16 -10: rsb r3, r3, #0 - weight_\w vsub.s16 -endfunc - .endm - - .macro weight_entry w, h, b=1 -function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 - mov ip, #\h -.if \b - b weight_h264_pixels_\w\()_neon -.endif -endfunc - .endm - - weight_entry 16, 8 - weight_entry 16, 16, b=0 - weight_func 16 - - weight_entry 8, 16 - weight_entry 8, 4 - weight_entry 8, 8, b=0 - weight_func 8 - - weight_entry 4, 8 - weight_entry 4, 2 - weight_entry 4, 4, b=0 - weight_func 4 diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/h264idct_neon.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/h264idct_neon.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - - preserve8 - .text - -function ff_h264_idct_add_neon, export=1 - vld1.64 {d0-d3}, [r1,:128] - - vswp d1, d2 - vadd.i16 d4, d0, d1 - vshr.s16 q8, q1, #1 - vsub.i16 d5, d0, d1 - vadd.i16 d6, d2, d17 - vsub.i16 d7, d16, d3 - vadd.i16 q0, q2, q3 - vsub.i16 q1, q2, q3 - - vtrn.16 d0, d1 - vtrn.16 d3, d2 - vtrn.32 d0, d3 - vtrn.32 d1, d2 - - vadd.i16 d4, d0, d3 - vld1.32 {d18[0]}, [r0,:32], r2 - vswp d1, d3 - vshr.s16 q8, q1, #1 - vld1.32 {d19[1]}, [r0,:32], r2 - vsub.i16 d5, d0, d1 - vld1.32 {d18[1]}, [r0,:32], r2 - vadd.i16 d6, d16, d3 - vld1.32 {d19[0]}, [r0,:32], r2 - vsub.i16 d7, d2, d17 - sub r0, r0, r2, lsl #2 - vadd.i16 q0, q2, q3 - vsub.i16 q1, q2, q3 - - vrshr.s16 q0, q0, #6 - vrshr.s16 q1, q1, #6 - - vaddw.u8 q0, q0, d18 - vaddw.u8 q1, q1, d19 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - - vst1.32 {d0[0]}, [r0,:32], r2 - vst1.32 {d1[1]}, [r0,:32], r2 - vst1.32 {d0[1]}, [r0,:32], r2 - vst1.32 {d1[0]}, [r0,:32], r2 - - bx lr -endfunc - -function ff_h264_idct_dc_add_neon, export=1 - vld1.16 {d2[],d3[]}, [r1,:16] - vrshr.s16 q1, q1, #6 - vld1.32 {d0[0]}, [r0,:32], r2 - vld1.32 {d0[1]}, [r0,:32], r2 - vaddw.u8 q2, q1, d0 - vld1.32 {d1[0]}, [r0,:32], r2 - vld1.32 {d1[1]}, [r0,:32], r2 - vaddw.u8 q1, q1, d1 - vqmovun.s16 d0, q2 - vqmovun.s16 d1, q1 - sub r0, r0, r2, lsl #2 - vst1.32 {d0[0]}, [r0,:32], r2 - vst1.32 {d0[1]}, [r0,:32], r2 - vst1.32 {d1[0]}, [r0,:32], r2 - vst1.32 {d1[1]}, [r0,:32], r2 - bx lr -endfunc - -function ff_h264_idct_add16_neon, export=1 - push 
{r4-r8,lr} - mov r4, r0 - mov r5, r1 - mov r1, r2 - mov r2, r3 - ldr r6, [sp, #24] - movrel r7, scan8 - mov ip, #16 -1: ldrb r8, [r7], #1 - ldr r0, [r5], #4 - ldrb r8, [r6, r8] - subs r8, r8, #1 - blt 2f - ldrsh lr, [r1] - add r0, r0, r4 - movne lr, #0 - cmp lr, #0 - adrne lr, ff_h264_idct_dc_add_neon - adreq lr, ff_h264_idct_add_neon - blx lr -2: subs ip, ip, #1 - add r1, r1, #32 - bne 1b - pop {r4-r8,pc} -endfunc - -function ff_h264_idct_add16intra_neon, export=1 - push {r4-r8,lr} - mov r4, r0 - mov r5, r1 - mov r1, r2 - mov r2, r3 - ldr r6, [sp, #24] - movrel r7, scan8 - mov ip, #16 -1: ldrb r8, [r7], #1 - ldr r0, [r5], #4 - ldrb r8, [r6, r8] - add r0, r0, r4 - cmp r8, #0 - ldrsh r8, [r1] - adrne lr, ff_h264_idct_add_neon - adreq lr, ff_h264_idct_dc_add_neon - cmpeq r8, #0 - blxne lr - subs ip, ip, #1 - add r1, r1, #32 - bne 1b - pop {r4-r8,pc} -endfunc - -function ff_h264_idct_add8_neon, export=1 - push {r4-r10,lr} - ldm r0, {r4,r9} - add r5, r1, #16*4 - add r1, r2, #16*32 - mov r2, r3 - ldr r6, [sp, #32] - movrel r7, scan8+16 - mov ip, #8 -1: ldrb r8, [r7], #1 - ldr r0, [r5], #4 - ldrb r8, [r6, r8] - tst ip, #4 - addeq r0, r0, r4 - addne r0, r0, r9 - cmp r8, #0 - ldrsh r8, [r1] - adrne lr, ff_h264_idct_add_neon - adreq lr, ff_h264_idct_dc_add_neon - cmpeq r8, #0 - blxne lr - subs ip, ip, #1 - add r1, r1, #32 - bne 1b - pop {r4-r10,pc} -endfunc - - .section .rodata -scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8 - .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8 - .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8 - .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8 - .byte 1+1*8, 2+1*8 - .byte 1+2*8, 2+2*8 - .byte 1+4*8, 2+4*8 - .byte 1+5*8, 2+5*8 diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/h264pred_init_arm.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/h264pred_init_arm.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "libavcodec/h264pred.h" - -void ff_pred16x16_vert_neon(uint8_t *src, int stride); -void ff_pred16x16_hor_neon(uint8_t *src, int stride); -void ff_pred16x16_plane_neon(uint8_t *src, int stride); -void ff_pred16x16_dc_neon(uint8_t *src, int stride); -void ff_pred16x16_128_dc_neon(uint8_t *src, int stride); -void ff_pred16x16_left_dc_neon(uint8_t *src, int stride); -void ff_pred16x16_top_dc_neon(uint8_t *src, int stride); - -void ff_pred8x8_vert_neon(uint8_t *src, int stride); -void ff_pred8x8_hor_neon(uint8_t *src, int stride); -void ff_pred8x8_plane_neon(uint8_t *src, int stride); -void ff_pred8x8_dc_neon(uint8_t *src, int stride); -void ff_pred8x8_128_dc_neon(uint8_t *src, int stride); -void ff_pred8x8_left_dc_neon(uint8_t *src, int stride); -void ff_pred8x8_top_dc_neon(uint8_t *src, int stride); -void ff_pred8x8_l0t_dc_neon(uint8_t *src, int stride); -void ff_pred8x8_0lt_dc_neon(uint8_t *src, int stride); -void ff_pred8x8_l00_dc_neon(uint8_t *src, int stride); -void ff_pred8x8_0l0_dc_neon(uint8_t *src, int stride); - -#if HAVE_NEON -static void ff_h264_pred_init_neon(H264PredContext *h) -{ - h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon; - h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon; - 
h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_neon; - h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon; - - h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon; - h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon; - h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon; - h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon; - h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon; - h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon; - h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon; - - - h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon; - h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon; - h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon; - h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon; - h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon; - h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon; - h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; -} -#endif - -void ff_h264_pred_init_arm(H264PredContext *h) -{ - if (HAVE_NEON) ff_h264_pred_init_neon(h); -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/h264pred_neon.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/h264pred_neon.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,362 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - - .macro ldcol.8 rd, rs, rt, n=8, hi=0 -.if \n == 8 || \hi == 0 - vld1.8 {\rd[0]}, [\rs], \rt - vld1.8 {\rd[1]}, [\rs], \rt - vld1.8 {\rd[2]}, [\rs], \rt - vld1.8 {\rd[3]}, [\rs], \rt -.endif -.if \n == 8 || \hi == 1 - vld1.8 {\rd[4]}, [\rs], \rt - vld1.8 {\rd[5]}, [\rs], \rt - vld1.8 {\rd[6]}, [\rs], \rt - vld1.8 {\rd[7]}, [\rs], \rt -.endif - .endm - - .macro add16x8 dq, dl, dh, rl, rh - vaddl.u8 \dq, \rl, \rh - vadd.u16 \dl, \dl, \dh - vpadd.u16 \dl, \dl, \dl - vpadd.u16 \dl, \dl, \dl - .endm - -function ff_pred16x16_128_dc_neon, export=1 - vmov.i8 q0, #128 - b .L_pred16x16_dc_end -endfunc - -function ff_pred16x16_top_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {q0}, [r2,:128] - add16x8 q0, d0, d1, d0, d1 - vrshrn.u16 d0, q0, #4 - vdup.8 q0, d0[0] - b .L_pred16x16_dc_end -endfunc - -function ff_pred16x16_left_dc_neon, export=1 - sub r2, r0, #1 - ldcol.8 d0, r2, r1 - ldcol.8 d1, r2, r1 - add16x8 q0, d0, d1, d0, d1 - vrshrn.u16 d0, q0, #4 - vdup.8 q0, d0[0] - b .L_pred16x16_dc_end -endfunc - -function ff_pred16x16_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {q0}, [r2,:128] - sub r2, r0, #1 - ldcol.8 d2, r2, r1 - ldcol.8 d3, r2, r1 - vaddl.u8 q0, d0, d1 - vaddl.u8 q1, d2, d3 - vadd.u16 q0, q0, q1 - vadd.u16 d0, d0, d1 - vpadd.u16 d0, d0, d0 - vpadd.u16 d0, d0, d0 - vrshrn.u16 d0, q0, #5 - vdup.8 q0, d0[0] -.L_pred16x16_dc_end: - mov r3, #8 -6: vst1.8 {q0}, [r0,:128], r1 - vst1.8 {q0}, [r0,:128], r1 - subs r3, r3, #1 - bne 6b - bx lr -endfunc - -function ff_pred16x16_hor_neon, export=1 - sub r2, r0, #1 - mov r3, #16 -1: vld1.8 {d0[],d1[]},[r2], r1 - vst1.8 {q0}, [r0,:128], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -function ff_pred16x16_vert_neon, export=1 - sub r0, r0, r1 - vld1.8 {q0}, [r0,:128], r1 - mov r3, #8 
-1: vst1.8 {q0}, [r0,:128], r1 - vst1.8 {q0}, [r0,:128], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -function ff_pred16x16_plane_neon, export=1 - sub r3, r0, r1 - add r2, r3, #8 - sub r3, r3, #1 - vld1.8 {d0}, [r3] - vld1.8 {d2}, [r2,:64], r1 - ldcol.8 d1, r3, r1 - add r3, r3, r1 - ldcol.8 d3, r3, r1 - vrev64.8 q0, q0 - vaddl.u8 q8, d2, d3 - vsubl.u8 q2, d2, d0 - vsubl.u8 q3, d3, d1 - movrel r3, p16weight - vld1.8 {q0}, [r3,:128] - vmul.s16 q2, q2, q0 - vmul.s16 q3, q3, q0 - vadd.i16 d4, d4, d5 - vadd.i16 d5, d6, d7 - vpadd.i16 d4, d4, d5 - vpadd.i16 d4, d4, d4 - vshl.i16 d5, d4, #2 - vaddl.s16 q2, d4, d5 - vrshrn.s32 d4, q2, #6 - mov r3, #0 - vtrn.16 d4, d5 - vadd.i16 d2, d4, d5 - vshl.i16 d3, d2, #3 - vrev64.16 d16, d17 - vsub.i16 d3, d3, d2 - vadd.i16 d16, d16, d0 - vshl.i16 d2, d16, #4 - vsub.i16 d2, d2, d3 - vshl.i16 d3, d4, #4 - vext.16 q0, q0, q0, #7 - vsub.i16 d6, d5, d3 - vmov.16 d0[0], r3 - vmul.i16 q0, q0, d4[0] - vdup.16 q1, d2[0] - vdup.16 q2, d4[0] - vdup.16 q3, d6[0] - vshl.i16 q2, q2, #3 - vadd.i16 q1, q1, q0 - vadd.i16 q3, q3, q2 - mov r3, #16 -1: - vqshrun.s16 d0, q1, #5 - vadd.i16 q1, q1, q2 - vqshrun.s16 d1, q1, #5 - vadd.i16 q1, q1, q3 - vst1.8 {q0}, [r0,:128], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - - .section .rodata - .align 4 -p16weight: - .short 1,2,3,4,5,6,7,8 - - .text - -function ff_pred8x8_hor_neon, export=1 - sub r2, r0, #1 - mov r3, #8 -1: vld1.8 {d0[]}, [r2], r1 - vst1.8 {d0}, [r0,:64], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -function ff_pred8x8_vert_neon, export=1 - sub r0, r0, r1 - vld1.8 {d0}, [r0,:64], r1 - mov r3, #4 -1: vst1.8 {d0}, [r0,:64], r1 - vst1.8 {d0}, [r0,:64], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -function ff_pred8x8_plane_neon, export=1 - sub r3, r0, r1 - add r2, r3, #4 - sub r3, r3, #1 - vld1.32 {d0[0]}, [r3] - vld1.32 {d2[0]}, [r2,:32], r1 - ldcol.8 d0, r3, r1, 4, hi=1 - add r3, r3, r1 - ldcol.8 d3, r3, r1, 4 - vaddl.u8 q8, d2, d3 - vrev32.8 d0, d0 - vtrn.32 d2, d3 - vsubl.u8 
q2, d2, d0 - movrel r3, p16weight - vld1.16 {q0}, [r3,:128] - vmul.s16 d4, d4, d0 - vmul.s16 d5, d5, d0 - vpadd.i16 d4, d4, d5 - vpaddl.s16 d4, d4 - vshl.i32 d5, d4, #4 - vadd.s32 d4, d4, d5 - vrshrn.s32 d4, q2, #5 - mov r3, #0 - vtrn.16 d4, d5 - vadd.i16 d2, d4, d5 - vshl.i16 d3, d2, #2 - vrev64.16 d16, d16 - vsub.i16 d3, d3, d2 - vadd.i16 d16, d16, d0 - vshl.i16 d2, d16, #4 - vsub.i16 d2, d2, d3 - vshl.i16 d3, d4, #3 - vext.16 q0, q0, q0, #7 - vsub.i16 d6, d5, d3 - vmov.16 d0[0], r3 - vmul.i16 q0, q0, d4[0] - vdup.16 q1, d2[0] - vdup.16 q2, d4[0] - vdup.16 q3, d6[0] - vshl.i16 q2, q2, #3 - vadd.i16 q1, q1, q0 - vadd.i16 q3, q3, q2 - mov r3, #8 -1: - vqshrun.s16 d0, q1, #5 - vadd.i16 q1, q1, q3 - vst1.8 {d0}, [r0,:64], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -function ff_pred8x8_128_dc_neon, export=1 - vmov.i8 q0, #128 - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_top_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {d0}, [r2,:64] - vpaddl.u8 d0, d0 - vpadd.u16 d0, d0, d0 - vrshrn.u16 d0, q0, #2 - vdup.8 d1, d0[1] - vdup.8 d0, d0[0] - vtrn.32 d0, d1 - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_left_dc_neon, export=1 - sub r2, r0, #1 - ldcol.8 d0, r2, r1 - vpaddl.u8 d0, d0 - vpadd.u16 d0, d0, d0 - vrshrn.u16 d0, q0, #2 - vdup.8 d1, d0[1] - vdup.8 d0, d0[0] - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {d0}, [r2,:64] - sub r2, r0, #1 - ldcol.8 d1, r2, r1 - vtrn.32 d0, d1 - vpaddl.u8 q0, q0 - vpadd.u16 d0, d0, d1 - vpadd.u16 d1, d0, d0 - vrshrn.u16 d2, q0, #3 - vrshrn.u16 d3, q0, #2 - vdup.8 d0, d2[4] - vdup.8 d1, d3[3] - vdup.8 d4, d3[2] - vdup.8 d5, d2[5] - vtrn.32 q0, q2 -.L_pred8x8_dc_end: - mov r3, #4 - add r2, r0, r1, lsl #2 -6: vst1.8 {d0}, [r0,:64], r1 - vst1.8 {d1}, [r2,:64], r1 - subs r3, r3, #1 - bne 6b - bx lr -endfunc - -function ff_pred8x8_l0t_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {d0}, [r2,:64] - sub r2, r0, #1 - ldcol.8 d1, r2, r1, 4 - vtrn.32 d0, d1 - vpaddl.u8 q0, q0 - 
vpadd.u16 d0, d0, d1 - vpadd.u16 d1, d0, d0 - vrshrn.u16 d2, q0, #3 - vrshrn.u16 d3, q0, #2 - vdup.8 d0, d2[4] - vdup.8 d1, d3[0] - vdup.8 q2, d3[2] - vtrn.32 q0, q2 - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_l00_dc_neon, export=1 - sub r2, r0, #1 - ldcol.8 d0, r2, r1, 4 - vpaddl.u8 d0, d0 - vpadd.u16 d0, d0, d0 - vrshrn.u16 d0, q0, #2 - vmov.i8 d1, #128 - vdup.8 d0, d0[0] - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_0lt_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {d0}, [r2,:64] - add r2, r0, r1, lsl #2 - sub r2, r2, #1 - ldcol.8 d1, r2, r1, 4, hi=1 - vtrn.32 d0, d1 - vpaddl.u8 q0, q0 - vpadd.u16 d0, d0, d1 - vpadd.u16 d1, d0, d0 - vrshrn.u16 d3, q0, #2 - vrshrn.u16 d2, q0, #3 - vdup.8 d0, d3[0] - vdup.8 d1, d3[3] - vdup.8 d4, d3[2] - vdup.8 d5, d2[5] - vtrn.32 q0, q2 - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_0l0_dc_neon, export=1 - add r2, r0, r1, lsl #2 - sub r2, r2, #1 - ldcol.8 d1, r2, r1, 4 - vpaddl.u8 d2, d1 - vpadd.u16 d2, d2, d2 - vrshrn.u16 d1, q1, #2 - vmov.i8 d0, #128 - vdup.8 d1, d1[0] - b .L_pred8x8_dc_end -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/int_neon.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/int_neon.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ -/* - * ARM NEON optimised integer operations - * Copyright (c) 2009 Kostya Shishkov - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - - preserve8 - .fpu neon - .text - -function ff_scalarproduct_int16_neon, export=1 - vmov.i16 q0, #0 - vmov.i16 q1, #0 - vmov.i16 q2, #0 - vmov.i16 q3, #0 - negs r3, r3 - beq 2f - - vdup.s32 q12, r3 -1: vld1.16 {d16-d17}, [r0]! - vld1.16 {d20-d21}, [r1,:128]! - vmull.s16 q12, d16, d20 - vld1.16 {d18-d19}, [r0]! - vmull.s16 q13, d17, d21 - vld1.16 {d22-d23}, [r1,:128]! - vmull.s16 q14, d18, d22 - vmull.s16 q15, d19, d23 - vshl.s32 q8, q12, q12 - vshl.s32 q9, q13, q12 - vadd.s32 q0, q0, q8 - vshl.s32 q10, q14, q12 - vadd.s32 q1, q1, q9 - vshl.s32 q11, q15, q12 - vadd.s32 q2, q2, q10 - vadd.s32 q3, q3, q11 - subs r2, r2, #16 - bne 1b - b 3f - -2: vld1.16 {d16-d17}, [r0]! - vld1.16 {d20-d21}, [r1,:128]! - vmlal.s16 q0, d16, d20 - vld1.16 {d18-d19}, [r0]! - vmlal.s16 q1, d17, d21 - vld1.16 {d22-d23}, [r1,:128]! - vmlal.s16 q2, d18, d22 - vmlal.s16 q3, d19, d23 - subs r2, r2, #16 - bne 2b - -3: vpadd.s32 d16, d0, d1 - vpadd.s32 d17, d2, d3 - vpadd.s32 d10, d4, d5 - vpadd.s32 d11, d6, d7 - vpadd.s32 d0, d16, d17 - vpadd.s32 d1, d10, d11 - vpadd.s32 d2, d0, d1 - vpaddl.s32 d3, d2 - vmov.32 r0, d3[0] - bx lr -endfunc - -@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul) -function ff_scalarproduct_and_madd_int16_neon, export=1 - vld1.16 {d28[],d29[]}, [sp] - vmov.i16 q0, #0 - vmov.i16 q1, #0 - vmov.i16 q2, #0 - vmov.i16 q3, #0 - mov r12, r0 - -1: vld1.16 {d16-d17}, [r0,:128]! - vld1.16 {d18-d19}, [r1]! - vld1.16 {d20-d21}, [r2]! - vld1.16 {d22-d23}, [r0,:128]! - vld1.16 {d24-d25}, [r1]! - vld1.16 {d26-d27}, [r2]! 
- vmul.s16 q10, q10, q14 - vmul.s16 q13, q13, q14 - vmlal.s16 q0, d16, d18 - vmlal.s16 q1, d17, d19 - vadd.s16 q10, q8, q10 - vadd.s16 q13, q11, q13 - vmlal.s16 q2, d22, d24 - vmlal.s16 q3, d23, d25 - vst1.16 {q10}, [r12,:128]! - subs r3, r3, #16 - vst1.16 {q13}, [r12,:128]! - bne 1b - - vpadd.s32 d16, d0, d1 - vpadd.s32 d17, d2, d3 - vpadd.s32 d10, d4, d5 - vpadd.s32 d11, d6, d7 - vpadd.s32 d0, d16, d17 - vpadd.s32 d1, d10, d11 - vpadd.s32 d2, d0, d1 - vpaddl.s32 d3, d2 - vmov.32 r0, d3[0] - bx lr -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/jrevdct_arm.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/jrevdct_arm.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,388 +0,0 @@ -/* - C-like prototype : - void j_rev_dct_arm(DCTBLOCK data) - - With DCTBLOCK being a pointer to an array of 64 'signed shorts' - - Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org) - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -*/ - -#include "asm.S" - -#define FIX_0_298631336 2446 -#define FIX_0_541196100 4433 -#define FIX_0_765366865 6270 -#define FIX_1_175875602 9633 -#define FIX_1_501321110 12299 -#define FIX_2_053119869 16819 -#define FIX_3_072711026 25172 -#define FIX_M_0_390180644 -3196 -#define FIX_M_0_899976223 -7373 -#define FIX_M_1_847759065 -15137 -#define FIX_M_1_961570560 -16069 -#define FIX_M_2_562915447 -20995 -#define FIX_0xFFFF 0xFFFF - -#define FIX_0_298631336_ID 0 -#define FIX_0_541196100_ID 4 -#define FIX_0_765366865_ID 8 -#define FIX_1_175875602_ID 12 -#define FIX_1_501321110_ID 16 -#define FIX_2_053119869_ID 20 -#define FIX_3_072711026_ID 24 -#define FIX_M_0_390180644_ID 28 -#define FIX_M_0_899976223_ID 32 -#define FIX_M_1_847759065_ID 36 -#define FIX_M_1_961570560_ID 40 -#define FIX_M_2_562915447_ID 44 -#define FIX_0xFFFF_ID 48 - .text - .align - -function ff_j_rev_dct_arm, export=1 - stmdb sp!, { r4 - r12, lr } @ all callee saved regs - - sub sp, sp, #4 @ reserve some space on the stack - str r0, [ sp ] @ save the DCT pointer to the stack - - mov lr, r0 @ lr = pointer to the current row - mov r12, #8 @ r12 = row-counter - adr r11, const_array @ r11 = base pointer to the constants array -row_loop: - ldrsh r0, [lr, # 0] @ r0 = 'd0' - ldrsh r2, [lr, # 2] @ r2 = 'd2' - - @ Optimization for row that have all items except the first set to 0 - @ (this works as the DCTELEMS are always 4-byte aligned) - ldr r5, [lr, # 0] - ldr r6, [lr, # 4] - ldr r3, [lr, # 8] - ldr r4, [lr, #12] - orr r3, r3, r4 - orr r3, r3, r6 - orrs r5, r3, r5 - beq end_of_row_loop @ nothing to be done as ALL of them are '0' - orrs r3, r3, r2 - beq empty_row - - ldrsh r1, [lr, # 8] @ r1 = 'd1' - ldrsh r4, [lr, # 4] @ r4 = 'd4' - ldrsh r6, [lr, # 6] @ r6 = 'd6' - - ldr r3, [r11, #FIX_0_541196100_ID] - add r7, r2, r6 - ldr r5, [r11, #FIX_M_1_847759065_ID] - mul r7, r3, r7 @ r7 = z1 - ldr r3, [r11, #FIX_0_765366865_ID] - mla r6, r5, r6, r7 @ r6 = tmp2 - add r5, r0, r4 @ r5 = tmp0 - mla r2, r3, r2, r7 @ 
r2 = tmp3 - sub r3, r0, r4 @ r3 = tmp1 - - add r0, r2, r5, lsl #13 @ r0 = tmp10 - rsb r2, r2, r5, lsl #13 @ r2 = tmp13 - add r4, r6, r3, lsl #13 @ r4 = tmp11 - rsb r3, r6, r3, lsl #13 @ r3 = tmp12 - - stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11 - - ldrsh r3, [lr, #10] @ r3 = 'd3' - ldrsh r5, [lr, #12] @ r5 = 'd5' - ldrsh r7, [lr, #14] @ r7 = 'd7' - - add r0, r3, r5 @ r0 = 'z2' - add r2, r1, r7 @ r2 = 'z1' - add r4, r3, r7 @ r4 = 'z3' - add r6, r1, r5 @ r6 = 'z4' - ldr r9, [r11, #FIX_1_175875602_ID] - add r8, r4, r6 @ r8 = z3 + z4 - ldr r10, [r11, #FIX_M_0_899976223_ID] - mul r8, r9, r8 @ r8 = 'z5' - ldr r9, [r11, #FIX_M_2_562915447_ID] - mul r2, r10, r2 @ r2 = 'z1' - ldr r10, [r11, #FIX_M_1_961570560_ID] - mul r0, r9, r0 @ r0 = 'z2' - ldr r9, [r11, #FIX_M_0_390180644_ID] - mla r4, r10, r4, r8 @ r4 = 'z3' - ldr r10, [r11, #FIX_0_298631336_ID] - mla r6, r9, r6, r8 @ r6 = 'z4' - ldr r9, [r11, #FIX_2_053119869_ID] - mla r7, r10, r7, r2 @ r7 = tmp0 + z1 - ldr r10, [r11, #FIX_3_072711026_ID] - mla r5, r9, r5, r0 @ r5 = tmp1 + z2 - ldr r9, [r11, #FIX_1_501321110_ID] - mla r3, r10, r3, r0 @ r3 = tmp2 + z2 - add r7, r7, r4 @ r7 = tmp0 - mla r1, r9, r1, r2 @ r1 = tmp3 + z1 - add r5, r5, r6 @ r5 = tmp1 - add r3, r3, r4 @ r3 = tmp2 - add r1, r1, r6 @ r1 = tmp3 - - ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11 - @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 - - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) - add r8, r0, r1 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 0] - - @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) - sub r8, r0, r1 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, #14] - - @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) - add r8, r6, r3 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 2] - - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) - sub r8, r6, r3 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, 
[lr, #12] - - @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) - add r8, r4, r5 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 4] - - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) - sub r8, r4, r5 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, #10] - - @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) - add r8, r2, r7 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 6] - - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) - sub r8, r2, r7 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 8] - - @ End of row loop - add lr, lr, #16 - subs r12, r12, #1 - bne row_loop - beq start_column_loop - -empty_row: - ldr r1, [r11, #FIX_0xFFFF_ID] - mov r0, r0, lsl #2 - and r0, r0, r1 - add r0, r0, r0, lsl #16 - str r0, [lr, # 0] - str r0, [lr, # 4] - str r0, [lr, # 8] - str r0, [lr, #12] - -end_of_row_loop: - @ End of loop - add lr, lr, #16 - subs r12, r12, #1 - bne row_loop - -start_column_loop: - @ Start of column loop - ldr lr, [ sp ] - mov r12, #8 -column_loop: - ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0' - ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2' - ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4' - ldrsh r6, [lr, #(12*8)] @ r6 = 'd6' - - ldr r3, [r11, #FIX_0_541196100_ID] - add r1, r2, r6 - ldr r5, [r11, #FIX_M_1_847759065_ID] - mul r1, r3, r1 @ r1 = z1 - ldr r3, [r11, #FIX_0_765366865_ID] - mla r6, r5, r6, r1 @ r6 = tmp2 - add r5, r0, r4 @ r5 = tmp0 - mla r2, r3, r2, r1 @ r2 = tmp3 - sub r3, r0, r4 @ r3 = tmp1 - - add r0, r2, r5, lsl #13 @ r0 = tmp10 - rsb r2, r2, r5, lsl #13 @ r2 = tmp13 - add r4, r6, r3, lsl #13 @ r4 = tmp11 - rsb r6, r6, r3, lsl #13 @ r6 = tmp12 - - ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1' - ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3' - ldrsh r5, [lr, #(10*8)] @ r5 = 'd5' - ldrsh r7, [lr, #(14*8)] @ r7 = 'd7' - - @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats) - orr r9, r1, r3 - orr r10, r5, r7 - orrs r10, r9, r10 - beq empty_odd_column - - stmdb sp!, { r0, r2, r4, r6 
} @ save on the stack tmp10, tmp13, tmp12, tmp11 - - add r0, r3, r5 @ r0 = 'z2' - add r2, r1, r7 @ r2 = 'z1' - add r4, r3, r7 @ r4 = 'z3' - add r6, r1, r5 @ r6 = 'z4' - ldr r9, [r11, #FIX_1_175875602_ID] - add r8, r4, r6 - ldr r10, [r11, #FIX_M_0_899976223_ID] - mul r8, r9, r8 @ r8 = 'z5' - ldr r9, [r11, #FIX_M_2_562915447_ID] - mul r2, r10, r2 @ r2 = 'z1' - ldr r10, [r11, #FIX_M_1_961570560_ID] - mul r0, r9, r0 @ r0 = 'z2' - ldr r9, [r11, #FIX_M_0_390180644_ID] - mla r4, r10, r4, r8 @ r4 = 'z3' - ldr r10, [r11, #FIX_0_298631336_ID] - mla r6, r9, r6, r8 @ r6 = 'z4' - ldr r9, [r11, #FIX_2_053119869_ID] - mla r7, r10, r7, r2 @ r7 = tmp0 + z1 - ldr r10, [r11, #FIX_3_072711026_ID] - mla r5, r9, r5, r0 @ r5 = tmp1 + z2 - ldr r9, [r11, #FIX_1_501321110_ID] - mla r3, r10, r3, r0 @ r3 = tmp2 + z2 - add r7, r7, r4 @ r7 = tmp0 - mla r1, r9, r1, r2 @ r1 = tmp3 + z1 - add r5, r5, r6 @ r5 = tmp1 - add r3, r3, r4 @ r3 = tmp2 - add r1, r1, r6 @ r1 = tmp3 - - ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12 - @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 - - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) - add r8, r0, r1 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 0*8)] - - @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) - sub r8, r0, r1 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #(14*8)] - - @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) - add r8, r4, r3 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 2*8)] - - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) - sub r8, r4, r3 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #(12*8)] - - @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) - add r8, r6, r5 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 4*8)] - - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) - sub r8, r6, r5 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #(10*8)] - - @ Compute 
DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) - add r8, r2, r7 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 6*8)] - - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) - sub r8, r2, r7 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 8*8)] - - @ End of row loop - add lr, lr, #2 - subs r12, r12, #1 - bne column_loop - beq the_end - -empty_odd_column: - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) - add r0, r0, #(1<<17) - mov r0, r0, asr #18 - strh r0, [lr, #( 0*8)] - strh r0, [lr, #(14*8)] - - @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) - add r4, r4, #(1<<17) - mov r4, r4, asr #18 - strh r4, [lr, #( 2*8)] - strh r4, [lr, #(12*8)] - - @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) - add r6, r6, #(1<<17) - mov r6, r6, asr #18 - strh r6, [lr, #( 4*8)] - strh r6, [lr, #(10*8)] - - @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) - add r2, r2, #(1<<17) - mov r2, r2, asr #18 - strh r2, [lr, #( 6*8)] - strh r2, [lr, #( 8*8)] - - @ End of row loop - add lr, lr, #2 - subs r12, r12, #1 - bne column_loop - -the_end: - @ The end.... 
- add sp, sp, #4 - ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return - -const_array: - .align - .word FIX_0_298631336 - .word FIX_0_541196100 - .word FIX_0_765366865 - .word FIX_1_175875602 - .word FIX_1_501321110 - .word FIX_2_053119869 - .word FIX_3_072711026 - .word FIX_M_0_390180644 - .word FIX_M_0_899976223 - .word FIX_M_1_847759065 - .word FIX_M_1_961570560 - .word FIX_M_2_562915447 - .word FIX_0xFFFF diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mathops.h --- a/ffmpeg_smp/h264dec/libavcodec/arm/mathops.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,116 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2006 Michael Niedermayer et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARM_MATHOPS_H -#define AVCODEC_ARM_MATHOPS_H - -#include -#include "config.h" -#include "libavutil/common.h" - -#if HAVE_INLINE_ASM - -# define MULL MULL -static inline av_const int MULL(int a, int b, unsigned shift) -{ - int lo, hi; - __asm__("smull %0, %1, %2, %3 \n\t" - "mov %0, %0, lsr %4 \n\t" - "add %1, %0, %1, lsl %5 \n\t" - : "=&r"(lo), "=&r"(hi) - : "r"(b), "r"(a), "ir"(shift), "ir"(32-shift)); - return hi; -} - -#define MULH MULH -#if HAVE_ARMV6 -static inline av_const int MULH(int a, int b) -{ - int r; - __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); - return r; -} -#else -static inline av_const int MULH(int a, int b) -{ - int lo, hi; - __asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a)); - return hi; -} -#endif - -static inline av_const int64_t MUL64(int a, int b) -{ - union { uint64_t x; unsigned hl[2]; } x; - __asm__ ("smull %0, %1, %2, %3" - : "=r"(x.hl[0]), "=r"(x.hl[1]) : "r"(a), "r"(b)); - return x.x; -} -#define MUL64 MUL64 - -static inline av_const int64_t MAC64(int64_t d, int a, int b) -{ - union { uint64_t x; unsigned hl[2]; } x = { d }; - __asm__ ("smlal %0, %1, %2, %3" - : "+r"(x.hl[0]), "+r"(x.hl[1]) : "r"(a), "r"(b)); - return x.x; -} -#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) -#define MLS64(d, a, b) MAC64(d, -(a), b) - -#if HAVE_ARMV5TE - -/* signed 16x16 -> 32 multiply add accumulate */ -# define MAC16(rt, ra, rb) \ - __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb)); - -/* signed 16x16 -> 32 multiply */ -# define MUL16 MUL16 -static inline av_const int MUL16(int ra, int rb) -{ - int rt; - __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb)); - return rt; -} - -#endif - -#define mid_pred mid_pred -static inline av_const int mid_pred(int a, int 
b, int c) -{ - int m; - __asm__ volatile ( - "mov %0, %2 \n\t" - "cmp %1, %2 \n\t" - "movgt %0, %1 \n\t" - "movgt %1, %2 \n\t" - "cmp %1, %3 \n\t" - "movle %1, %3 \n\t" - "cmp %0, %1 \n\t" - "movgt %0, %1 \n\t" - : "=&r"(m), "+r"(a) - : "r"(b), "r"(c)); - return m; -} - -#endif /* HAVE_INLINE_ASM */ - -#endif /* AVCODEC_ARM_MATHOPS_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mdct_neon.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/mdct_neon.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,303 +0,0 @@ -/* - * ARM NEON optimised MDCT - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - - preserve8 - - .text - -#define ff_fft_calc_neon X(ff_fft_calc_neon) - -function ff_imdct_half_neon, export=1 - push {r4-r8,lr} - - mov r12, #1 - ldr lr, [r0, #28] @ mdct_bits - ldr r4, [r0, #32] @ tcos - ldr r3, [r0, #8] @ revtab - lsl r12, r12, lr @ n = 1 << nbits - lsr lr, r12, #2 @ n4 = n >> 2 - add r7, r2, r12, lsl #1 - mov r12, #-16 - sub r7, r7, #16 - - vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0 - vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x - vrev64.32 d17, d17 - vld2.32 {d2,d3}, [r4,:128]! 
@ d2=c0,c1 d3=s0,s2 - vmul.f32 d6, d17, d2 - vmul.f32 d7, d0, d2 -1: - subs lr, lr, #2 - ldr r6, [r3], #4 - vmul.f32 d4, d0, d3 - vmul.f32 d5, d17, d3 - vsub.f32 d4, d6, d4 - vadd.f32 d5, d5, d7 - uxth r8, r6, ror #16 - uxth r6, r6 - add r8, r1, r8, lsl #3 - add r6, r1, r6, lsl #3 - beq 1f - vld2.32 {d16-d17},[r7,:128],r12 - vld2.32 {d0-d1}, [r2,:128]! - vrev64.32 d17, d17 - vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2 - vmul.f32 d6, d17, d2 - vmul.f32 d7, d0, d2 - vst2.32 {d4[0],d5[0]}, [r6,:64] - vst2.32 {d4[1],d5[1]}, [r8,:64] - b 1b -1: - vst2.32 {d4[0],d5[0]}, [r6,:64] - vst2.32 {d4[1],d5[1]}, [r8,:64] - - mov r4, r0 - mov r6, r1 - bl ff_fft_calc_neon - - mov r12, #1 - ldr lr, [r4, #28] @ mdct_bits - ldr r4, [r4, #32] @ tcos - lsl r12, r12, lr @ n = 1 << nbits - lsr lr, r12, #3 @ n8 = n >> 3 - - add r4, r4, lr, lsl #3 - add r6, r6, lr, lsl #3 - sub r1, r4, #16 - sub r3, r6, #16 - - mov r7, #-16 - mov r8, r6 - mov r0, r3 - - vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0 - vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3 - vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 -1: - subs lr, lr, #2 - vmul.f32 d7, d0, d18 - vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3 - vmul.f32 d4, d1, d18 - vmul.f32 d5, d21, d19 - vmul.f32 d6, d20, d19 - vmul.f32 d22, d1, d16 - vmul.f32 d23, d21, d17 - vmul.f32 d24, d0, d16 - vmul.f32 d25, d20, d17 - vadd.f32 d7, d7, d22 - vadd.f32 d6, d6, d23 - vsub.f32 d4, d4, d24 - vsub.f32 d5, d5, d25 - beq 1f - vld2.32 {d0-d1}, [r3,:128], r7 - vld2.32 {d20-d21},[r6,:128]! - vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128], r7 - vst2.32 {d5,d7}, [r8,:128]! 
- b 1b -1: - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128] - vst2.32 {d5,d7}, [r8,:128] - - pop {r4-r8,pc} -endfunc - -function ff_imdct_calc_neon, export=1 - push {r4-r6,lr} - - ldr r3, [r0, #28] - mov r4, #1 - mov r5, r1 - lsl r4, r4, r3 - add r1, r1, r4 - - bl ff_imdct_half_neon - - add r0, r5, r4, lsl #2 - add r1, r5, r4, lsl #1 - sub r0, r0, #8 - sub r2, r1, #16 - mov r3, #-16 - mov r6, #-8 - vmov.i32 d30, #1<<31 -1: - vld1.32 {d0-d1}, [r2,:128], r3 - pld [r0, #-16] - vrev64.32 q0, q0 - vld1.32 {d2-d3}, [r1,:128]! - veor d4, d1, d30 - pld [r2, #-16] - vrev64.32 q1, q1 - veor d5, d0, d30 - vst1.32 {d2}, [r0,:64], r6 - vst1.32 {d3}, [r0,:64], r6 - vst1.32 {d4-d5}, [r5,:128]! - subs r4, r4, #16 - bgt 1b - - pop {r4-r6,pc} -endfunc - -function ff_mdct_calc_neon, export=1 - push {r4-r10,lr} - - mov r12, #1 - ldr lr, [r0, #28] @ mdct_bits - ldr r4, [r0, #32] @ tcos - ldr r3, [r0, #8] @ revtab - lsl lr, r12, lr @ n = 1 << nbits - add r7, r2, lr @ in4u - sub r9, r7, #16 @ in4d - add r2, r7, lr, lsl #1 @ in3u - add r8, r9, lr, lsl #1 @ in3d - add r5, r4, lr, lsl #1 - sub r5, r5, #16 - sub r3, r3, #4 - mov r12, #-16 - - vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 - vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 - vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 - vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 - vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 - vsub.f32 d0, d18, d0 @ in4d-in4u I - vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1 - vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 - vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 - vadd.f32 d1, d1, d19 @ in3u+in3d -R - vsub.f32 d16, d16, d2 @ in0u-in2d R - vadd.f32 d17, d17, d3 @ in2u+in1d -I -1: - vmul.f32 d7, d0, d21 @ I*s - ldr r10, [r3, lr, lsr #1] - vmul.f32 d6, d1, d20 @ -R*c - ldr r6, [r3, #4]! 
- vmul.f32 d4, d1, d21 @ -R*s - vmul.f32 d5, d0, d20 @ I*c - vmul.f32 d24, d16, d30 @ R*c - vmul.f32 d25, d17, d31 @ -I*s - vmul.f32 d22, d16, d31 @ R*s - vmul.f32 d23, d17, d30 @ I*c - subs lr, lr, #16 - vsub.f32 d6, d6, d7 @ -R*c-I*s - vadd.f32 d7, d4, d5 @ -R*s+I*c - vsub.f32 d24, d25, d24 @ I*s-R*c - vadd.f32 d25, d22, d23 @ R*s-I*c - beq 1f - mov r12, #-16 - vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 - vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 - vneg.f32 d7, d7 @ R*s-I*c - vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 - vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 - vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 - vsub.f32 d0, d18, d0 @ in4d-in4u I - vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1 - vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 - vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 - vadd.f32 d1, d1, d19 @ in3u+in3d -R - vsub.f32 d16, d16, d2 @ in0u-in2d R - vadd.f32 d17, d17, d3 @ in2u+in1d -I - uxth r12, r6, ror #16 - uxth r6, r6 - add r12, r1, r12, lsl #3 - add r6, r1, r6, lsl #3 - vst2.32 {d6[0],d7[0]}, [r6,:64] - vst2.32 {d6[1],d7[1]}, [r12,:64] - uxth r6, r10, ror #16 - uxth r10, r10 - add r6 , r1, r6, lsl #3 - add r10, r1, r10, lsl #3 - vst2.32 {d24[0],d25[0]},[r10,:64] - vst2.32 {d24[1],d25[1]},[r6,:64] - b 1b -1: - vneg.f32 d7, d7 @ R*s-I*c - uxth r12, r6, ror #16 - uxth r6, r6 - add r12, r1, r12, lsl #3 - add r6, r1, r6, lsl #3 - vst2.32 {d6[0],d7[0]}, [r6,:64] - vst2.32 {d6[1],d7[1]}, [r12,:64] - uxth r6, r10, ror #16 - uxth r10, r10 - add r6 , r1, r6, lsl #3 - add r10, r1, r10, lsl #3 - vst2.32 {d24[0],d25[0]},[r10,:64] - vst2.32 {d24[1],d25[1]},[r6,:64] - - mov r4, r0 - mov r6, r1 - bl ff_fft_calc_neon - - mov r12, #1 - ldr lr, [r4, #28] @ mdct_bits - ldr r4, [r4, #32] @ tcos - lsl r12, r12, lr @ n = 1 << nbits - lsr lr, r12, #3 @ n8 = n >> 3 - - add r4, r4, lr, lsl #3 - add r6, r6, lr, lsl #3 - sub r1, r4, #16 - sub r3, r6, #16 - - mov r7, #-16 - mov r8, r6 - mov r0, r3 - - vld2.32 {d0-d1}, 
[r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0 - vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3 - vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 -1: - subs lr, lr, #2 - vmul.f32 d7, d0, d18 @ r1*s1,r0*s0 - vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3 - vmul.f32 d4, d1, d18 @ i1*s1,i0*s0 - vmul.f32 d5, d21, d19 @ i2*s2,i3*s3 - vmul.f32 d6, d20, d19 @ r2*s2,r3*s3 - vmul.f32 d24, d0, d16 @ r1*c1,r0*c0 - vmul.f32 d25, d20, d17 @ r2*c2,r3*c3 - vmul.f32 d22, d21, d17 @ i2*c2,i3*c3 - vmul.f32 d23, d1, d16 @ i1*c1,i0*c0 - vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0 - vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3 - vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3 - vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0 - vneg.f32 q2, q2 - beq 1f - vld2.32 {d0-d1}, [r3,:128], r7 - vld2.32 {d20-d21},[r6,:128]! - vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128], r7 - vst2.32 {d5,d7}, [r8,:128]! - b 1b -1: - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128] - vst2.32 {d5,d7}, [r8,:128] - - pop {r4-r10,pc} -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2002 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" -#include "mpegvideo_arm.h" - -void MPV_common_init_arm(MpegEncContext *s) -{ - /* IWMMXT support is a superset of armv5te, so - * allow optimized functions for armv5te unless - * a better iwmmxt function exists - */ -#if HAVE_ARMV5TE - MPV_common_init_armv5te(s); -#endif -#if HAVE_IWMMXT - MPV_common_init_iwmmxt(s); -#endif -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.h --- a/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARM_MPEGVIDEO_H -#define AVCODEC_ARM_MPEGVIDEO_H - -#include "libavcodec/mpegvideo.h" - -void MPV_common_init_iwmmxt(MpegEncContext *s); -void MPV_common_init_armv5te(MpegEncContext *s); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -/* - * Optimization of some functions from mpegvideo.c for armv5te - * Copyright (c) 2007 Siarhei Siamashka - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" -#include "mpegvideo_arm.h" - -void ff_dct_unquantize_h263_armv5te(DCTELEM *block, int qmul, int qadd, int count); - -#ifdef ENABLE_ARM_TESTS -/** - * h263 dequantizer supplementary function, it is performance critical and needs to - * have optimized implementations for each architecture. 
Is also used as a reference - * implementation in regression tests - */ -static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count) -{ - int i, level; - for (i = 0; i < count; i++) { - level = block[i]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[i] = level; - } - } -} -#endif - -static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int level, qmul, qadd; - int nCoeffs; - - assert(s->block_last_index[n]>=0); - - qmul = qscale << 1; - - if (!s->h263_aic) { - if (n < 4) - level = block[0] * s->y_dc_scale; - else - level = block[0] * s->c_dc_scale; - qadd = (qscale - 1) | 1; - }else{ - qadd = 0; - level = block[0]; - } - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); - block[0] = level; -} - -static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int qmul, qadd; - int nCoeffs; - - assert(s->block_last_index[n]>=0); - - qadd = (qscale - 1) | 1; - qmul = qscale << 1; - - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); -} - -void MPV_common_init_armv5te(MpegEncContext *s) -{ - s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te; - s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te_s.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te_s.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ -/* - * Optimization of some functions from mpegvideo.c for armv5te - * Copyright (c) 2007 Siarhei Siamashka - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "asm.S" - -/* - * Special optimized version of dct_unquantize_h263_helper_c, it - * requires the block to be at least 8 bytes aligned, and may process - * more elements than requested. But it is guaranteed to never - * process more than 64 elements provided that count argument is <= 64, - * so it is safe. This function is optimized for a common distribution - * of values for nCoeffs (they are mostly multiple of 8 plus one or - * two extra elements). So this function processes data as 8 elements - * per loop iteration and contains optional 2 elements processing in - * the end. 
- * - * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770) - */ -function ff_dct_unquantize_h263_armv5te, export=1 - push {r4-r9,lr} - mov ip, #0 - subs r3, r3, #2 - ble 2f - ldrd r4, [r0, #0] -1: - ldrd r6, [r0, #8] - - rsbs r9, ip, r4, asr #16 - addgt r9, r2, #0 - rsblt r9, r2, #0 - smlatbne r9, r4, r1, r9 - - rsbs lr, ip, r5, asr #16 - addgt lr, r2, #0 - rsblt lr, r2, #0 - smlatbne lr, r5, r1, lr - - rsbs r8, ip, r4, asl #16 - addgt r8, r2, #0 - rsblt r8, r2, #0 - smlabbne r4, r4, r1, r8 - - rsbs r8, ip, r5, asl #16 - addgt r8, r2, #0 - rsblt r8, r2, #0 - smlabbne r5, r5, r1, r8 - - strh r4, [r0], #2 - strh r9, [r0], #2 - strh r5, [r0], #2 - strh lr, [r0], #2 - - rsbs r9, ip, r6, asr #16 - addgt r9, r2, #0 - rsblt r9, r2, #0 - smlatbne r9, r6, r1, r9 - - rsbs lr, ip, r7, asr #16 - addgt lr, r2, #0 - rsblt lr, r2, #0 - smlatbne lr, r7, r1, lr - - rsbs r8, ip, r6, asl #16 - addgt r8, r2, #0 - rsblt r8, r2, #0 - smlabbne r6, r6, r1, r8 - - rsbs r8, ip, r7, asl #16 - addgt r8, r2, #0 - rsblt r8, r2, #0 - smlabbne r7, r7, r1, r8 - - strh r6, [r0], #2 - strh r9, [r0], #2 - strh r7, [r0], #2 - strh lr, [r0], #2 - - subs r3, r3, #8 - ldrgtd r4, [r0, #0] /* load data early to avoid load/use pipeline stall */ - bgt 1b - - adds r3, r3, #2 - pople {r4-r9,pc} -2: - ldrsh r9, [r0, #0] - ldrsh lr, [r0, #2] - mov r8, r2 - cmp r9, #0 - rsblt r8, r2, #0 - smlabbne r9, r9, r1, r8 - mov r8, r2 - cmp lr, #0 - rsblt r8, r2, #0 - smlabbne lr, lr, r1, r8 - strh r9, [r0], #2 - strh lr, [r0], #2 - pop {r4-r9,pc} -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_iwmmxt.c --- a/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_iwmmxt.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ -/* - * copyright (c) 2004 AGAWA Koji - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" -#include "mpegvideo_arm.h" - -static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int level, qmul, qadd; - int nCoeffs; - DCTELEM *block_orig = block; - - assert(s->block_last_index[n]>=0); - - qmul = qscale << 1; - - if (!s->h263_aic) { - if (n < 4) - level = block[0] * s->y_dc_scale; - else - level = block[0] * s->c_dc_scale; - qadd = (qscale - 1) | 1; - }else{ - qadd = 0; - level = block[0]; - } - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - __asm__ volatile ( -/* "movd %1, %%mm6 \n\t" //qmul */ -/* "packssdw %%mm6, %%mm6 \n\t" */ -/* "packssdw %%mm6, %%mm6 \n\t" */ - "tbcsth wr6, %[qmul] \n\t" -/* "movd %2, %%mm5 \n\t" //qadd */ -/* "packssdw %%mm5, %%mm5 \n\t" */ -/* "packssdw %%mm5, %%mm5 \n\t" */ - "tbcsth wr5, %[qadd] \n\t" - "wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */ - "wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */ - "wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */ - "1: \n\t" - "wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */ - "wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), 
%%mm1 \n\t" */ - "wmulsl wr0, wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */ - "wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */ -/* "movq (%0, %3), %%mm2 \n\t" */ -/* "movq 8(%0, %3), %%mm3 \n\t" */ - "wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */ - "wcmpgtsh wr3, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 */ - "wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */ - "wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */ - "waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */ - "waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */ - "wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */ - "wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */ - "wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */ - "wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 */ - "wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */ - "wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */ - "wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */ - "wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */ - "add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */ - "subs %[i], %[i], #1 \n\t" - "bne 1b \n\t" /* "jng 1b \n\t" */ - :[block]"+r"(block) - :[i]"r"((nCoeffs + 8) / 8), [qmul]"r"(qmul), [qadd]"r"(qadd) - :"memory"); - - block_orig[0] = level; -} - -#if 0 -static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int nCoeffs; - - assert(s->block_last_index[n]>=0); - - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale); -} -#endif - -void MPV_common_init_iwmmxt(MpegEncContext *s) -{ - if (!(mm_flags & FF_MM_IWMMXT)) return; - - s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt; -#if 0 - 
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt; -#endif -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/rdft_neon.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/rdft_neon.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,151 +0,0 @@ -/* - * ARM NEON optimised RDFT - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - - preserve8 - -function ff_rdft_calc_neon, export=1 - push {r4-r8,lr} - - ldr r6, [r0, #4] @ inverse - mov r4, r0 - mov r5, r1 - - lsls r6, r6, #31 - bne 1f - add r0, r4, #20 - bl X(ff_fft_permute_neon) - add r0, r4, #20 - mov r1, r5 - bl X(ff_fft_calc_neon) -1: - ldr r12, [r4, #0] @ nbits - mov r2, #1 - lsl r12, r2, r12 - add r0, r5, #8 - add r1, r5, r12, lsl #2 - lsr r12, r12, #2 - ldr r2, [r4, #12] @ tcos - sub r12, r12, #2 - ldr r3, [r4, #16] @ tsin - mov r7, r0 - sub r1, r1, #8 - mov lr, r1 - mov r8, #-8 - vld1.32 {d0}, [r0,:64]! @ d1[0,1] - vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] - vld1.32 {d4}, [r2,:64]! @ tcos[i] - vld1.32 {d5}, [r3,:64]! 
@ tsin[i] - vmov.f32 d18, #0.5 @ k1 - vdup.32 d19, r6 - pld [r0, #32] - veor d19, d18, d19 @ k2 - vmov.i32 d16, #0 - vmov.i32 d17, #1<<31 - pld [r1, #-32] - vtrn.32 d16, d17 - pld [r2, #32] - vrev64.32 d16, d16 @ d16=1,0 d17=0,1 - pld [r3, #32] -2: - veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] - vld1.32 {d24}, [r0,:64]! @ d1[0,1] - vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] - vld1.32 {d25}, [r1,:64], r8 @ d2[0,1] - vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] - veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1] - pld [r0, #32] - vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re - pld [r1, #-32] - vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1] - vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1] - vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re - veor d7, d21, d16 @ -od.im, od.re - vrev64.32 d3, d21 @ od.re, od.im - veor d6, d20, d17 @ ev.re,-ev.im - veor d2, d3, d16 @ -od.re, od.im - vmla.f32 d20, d3, d4[1] - vmla.f32 d20, d7, d5[1] - vmla.f32 d6, d2, d4[1] - vmla.f32 d6, d21, d5[1] - vld1.32 {d4}, [r2,:64]! @ tcos[i] - veor d7, d23, d16 @ -od.im, od.re - vld1.32 {d5}, [r3,:64]! @ tsin[i] - veor d24, d22, d17 @ ev.re,-ev.im - vrev64.32 d3, d23 @ od.re, od.im - pld [r2, #32] - veor d2, d3, d16 @ -od.re, od.im - pld [r3, #32] - vmla.f32 d22, d3, d4[0] - vmla.f32 d22, d7, d5[0] - vmla.f32 d24, d2, d4[0] - vmla.f32 d24, d23, d5[0] - vld1.32 {d0}, [r0,:64]! @ d1[0,1] - vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] - vst1.32 {d20}, [r7,:64]! - vst1.32 {d6}, [lr,:64], r8 - vst1.32 {d22}, [r7,:64]! 
- vst1.32 {d24}, [lr,:64], r8 - subs r12, r12, #2 - bgt 2b - - veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] - vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] - vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] - ldr r2, [r4, #8] @ sign_convention - vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re - add r0, r0, #4 - bfc r2, #0, #31 - vld1.32 {d0[0]}, [r0,:32] - veor d7, d21, d16 @ -od.im, od.re - vrev64.32 d3, d21 @ od.re, od.im - veor d6, d20, d17 @ ev.re,-ev.im - vld1.32 {d22}, [r5,:64] - vdup.32 d1, r2 - vmov d23, d22 - veor d2, d3, d16 @ -od.re, od.im - vtrn.32 d22, d23 - veor d0, d0, d1 - veor d23, d23, d17 - vmla.f32 d20, d3, d4[1] - vmla.f32 d20, d7, d5[1] - vmla.f32 d6, d2, d4[1] - vmla.f32 d6, d21, d5[1] - vadd.f32 d22, d22, d23 - vst1.32 {d20}, [r7,:64] - vst1.32 {d6}, [lr,:64] - vst1.32 {d0[0]}, [r0,:32] - vst1.32 {d22}, [r5,:64] - - cmp r6, #0 - popeq {r4-r8,pc} - - vmul.f32 d22, d22, d18 - vst1.32 {d22}, [r5,:64] - add r0, r4, #20 - mov r1, r5 - bl X(ff_fft_permute_neon) - add r0, r4, #20 - mov r1, r5 - pop {r4-r8,lr} - b X(ff_fft_calc_neon) -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_arm.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_arm.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,486 +0,0 @@ -/* - * simple_idct_arm.S - * Copyright (C) 2002 Frederic 'dilb' Boulay - * - * Author: Frederic Boulay - * - * The function defined in this file is derived from the simple_idct function - * from the libavcodec library part of the FFmpeg project. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - -/* useful constants for the algorithm, they are save in __constant_ptr__ at */ -/* the end of the source code.*/ -#define W1 22725 -#define W2 21407 -#define W3 19266 -#define W4 16383 -#define W5 12873 -#define W6 8867 -#define W7 4520 -#define MASK_MSHW 0xFFFF0000 - -/* offsets of the constants in the vector */ -#define offW1 0 -#define offW2 4 -#define offW3 8 -#define offW4 12 -#define offW5 16 -#define offW6 20 -#define offW7 24 -#define offMASK_MSHW 28 - -#define ROW_SHIFT 11 -#define ROW_SHIFT2MSHW (16-11) -#define COL_SHIFT 20 -#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */ -#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */ - - - .text - -function ff_simple_idct_arm, export=1 - @@ void simple_idct_arm(int16_t *block) - @@ save stack for reg needed (take all of them), - @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block - @@ so it must not be overwritten, if it is not saved!! - @@ R12 is another scratch register, so it should not be saved too - @@ save all registers - stmfd sp!, {r4-r11, r14} @ R14 is also called LR - @@ at this point, R0=block, other registers are free. - add r14, r0, #112 @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block. 
- adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it - @@ add 2 temporary variables in the stack: R0 and R14 - sub sp, sp, #8 @ allow 2 local variables - str r0, [sp, #0] @ save block in sp[0] - @@ stack status - @@ sp+4 free - @@ sp+0 R0 (block) - - - @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free - - -__row_loop: - @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :) - ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer) - ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1] - ldr r3, [r14, #8] @ R3=ROWr32[2] - ldr r4, [r14, #12] @ R4=ROWr32[3] - @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop), - @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row) - @@ else follow the complete algorithm. 
- @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], - @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free - orr r5, r4, r3 @ R5=R4 | R3 - orr r5, r5, r2 @ R5=R4 | R3 | R2 - orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null) - beq __end_row_loop - mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later) - ldrsh r6, [r14, #0] @ R6=ROWr16[0] - orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7 - beq __almost_empty_row - -__b_evaluation: - @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3], - @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free, - @@ R12=__const_ptr_, R14=&block[n] - @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3 - - @@ MUL16(b0, W1, row[1]); - @@ MUL16(b1, W3, row[1]); - @@ MUL16(b2, W5, row[1]); - @@ MUL16(b3, W7, row[1]); - @@ MAC16(b0, W3, row[3]); - @@ MAC16(b1, -W7, row[3]); - @@ MAC16(b2, -W1, row[3]); - @@ MAC16(b3, -W5, row[3]); - ldr r8, [r12, #offW1] @ R8=W1 - mov r2, r2, asr #16 @ R2=ROWr16[3] - mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r9, [r12, #offW3] @ R9=W3 - ldr r10, [r12, #offW5] @ R10=W5 - mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r11, [r12, #offW7] @ R11=W7 - mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - teq r2, #0 @ if null avoid muls - mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - rsbne r2, r2, #0 @ R2=-ROWr16[3] - mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] 
must be the second arg, to have the possibility to save 1 cycle) - mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - - @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], - @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, - @@ R12=__const_ptr_, R14=&block[n] - @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; - @@ if (temp != 0) {} - orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3] - beq __end_b_evaluation - - @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], - @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, - @@ R12=__const_ptr_, R14=&block[n] - @@ MAC16(b0, W5, row[5]); - @@ MAC16(b2, W7, row[5]); - @@ MAC16(b3, W3, row[5]); - @@ MAC16(b1, -W1, row[5]); - @@ MAC16(b0, W7, row[7]); - @@ MAC16(b2, W3, row[7]); - @@ MAC16(b3, -W1, row[7]); - @@ MAC16(b1, -W5, row[7]); - mov r3, r3, asr #16 @ R3=ROWr16[5] - teq r3, #0 @ if null avoid muls - mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 - mov r4, r4, asr #16 @ R4=ROWr16[7] - mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 - mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 - rsbne r3, r3, #0 @ R3=-ROWr16[5] - mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1 - @@ R3 is free now - teq r4, #0 @ if null avoid muls - mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 - mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 - rsbne r4, r4, #0 @ R4=-ROWr16[7] - mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 - mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 - @@ R4 is free now -__end_b_evaluation: - @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free), - @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - -__a_evaluation: - @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); - @@ a1 = a0 + W6 * row[2]; - @@ a2 = a0 - W6 * row[2]; - @@ a3 = a0 - W2 * row[2]; - @@ a0 = a0 + W2 * row[2]; - ldr r9, [r12, #offW4] @ R9=W4 - mul r6, r9, r6 @ 
R6=W4*ROWr16[0] - ldr r10, [r12, #offW6] @ R10=W6 - ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet) - add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0) - - mul r11, r10, r4 @ R11=W6*ROWr16[2] - ldr r8, [r12, #offW2] @ R8=W2 - sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) - @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; - @@ if (temp != 0) {} - teq r2, #0 - beq __end_bef_a_evaluation - - add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) - mul r11, r8, r4 @ R11=W2*ROWr16[2] - sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) - add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) - - - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - - - @@ a0 += W4*row[4] - @@ a1 -= W4*row[4] - @@ a2 -= W4*row[4] - @@ a3 += W4*row[4] - ldrsh r11, [r14, #8] @ R11=ROWr16[4] - teq r11, #0 @ if null avoid muls - mulne r11, r9, r11 @ R11=W4*ROWr16[4] - @@ R9 is free now - ldrsh r9, [r14, #12] @ R9=ROWr16[6] - addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) - subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) - subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) - addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) - @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead - teq r9, #0 @ if null avoid muls - mulne r11, r10, r9 @ R11=W6*ROWr16[6] - addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) - mulne r10, r8, r9 @ R10=W2*ROWr16[6] - @@ a0 += W6*row[6]; - @@ a3 -= W6*row[6]; - @@ a1 -= W2*row[6]; - @@ a2 += W2*row[6]; - subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) - subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) - addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) - -__end_a_evaluation: - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - @@ row[0] = (a0 + b0) >> ROW_SHIFT; - @@ row[1] = (a1 + b1) >> ROW_SHIFT; - @@ row[2] = (a2 + b2) >> ROW_SHIFT; - @@ row[3] = (a3 + b3) >> ROW_SHIFT; - @@ row[4] = (a3 - b3) >> ROW_SHIFT; - 
@@ row[5] = (a2 - b2) >> ROW_SHIFT; - @@ row[6] = (a1 - b1) >> ROW_SHIFT; - @@ row[7] = (a0 - b0) >> ROW_SHIFT; - add r8, r6, r0 @ R8=a0+b0 - add r9, r2, r1 @ R9=a1+b1 - @@ put 2 16 bits half-words in a 32bits word - @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!) - ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5) - mvn r11, r10 @ R11= NOT R10= 0x0000FFFF - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11) - orr r8, r8, r9 - str r8, [r14, #0] - - add r8, r3, r5 @ R8=a2+b2 - add r9, r4, r7 @ R9=a3+b3 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5) - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11) - orr r8, r8, r9 - str r8, [r14, #4] - - sub r8, r4, r7 @ R8=a3-b3 - sub r9, r3, r5 @ R9=a2-b2 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5) - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11) - orr r8, r8, r9 - str r8, [r14, #8] - - sub r8, r2, r1 @ R8=a1-b1 - sub r9, r6, r0 @ R9=a0-b0 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5) - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11) - orr r8, r8, r9 - str r8, [r14, #12] - - bal __end_row_loop - -__almost_empty_row: - @@ the row was empty, except ROWr16[0], now, management of this special case - @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], - @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1], - @@ R8=0xFFFF (temp), R9-R11 free - mov r8, #0x10000 @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run). - sub r8, r8, #1 @ R8 is now ready. 
- and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF - orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16) - str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5 - str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5 - str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5 - str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5 - -__end_row_loop: - @@ at this point, R0-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - ldr r0, [sp, #0] @ R0=block - teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished. - sub r14, r14, #16 - bne __row_loop - - - - @@ at this point, R0=block, R1-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block. -__col_loop: - -__b_evaluation2: - @@ at this point, R0=block (temp), R1-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - @@ proceed with b0-b3 first, followed by a0-a3 - @@ MUL16(b0, W1, col[8x1]); - @@ MUL16(b1, W3, col[8x1]); - @@ MUL16(b2, W5, col[8x1]); - @@ MUL16(b3, W7, col[8x1]); - @@ MAC16(b0, W3, col[8x3]); - @@ MAC16(b1, -W7, col[8x3]); - @@ MAC16(b2, -W1, col[8x3]); - @@ MAC16(b3, -W5, col[8x3]); - ldr r8, [r12, #offW1] @ R8=W1 - ldrsh r7, [r14, #16] - mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r9, [r12, #offW3] @ R9=W3 - ldr r10, [r12, #offW5] @ R10=W5 - mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r11, [r12, #offW7] @ R11=W7 - mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldrsh r2, [r14, #48] - mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - teq r2, #0 @ if 0, then avoid muls - mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - rsbne r2, r2, #0 @ R2=-ROWr16[3] - mlane r1, 
r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - - @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), - @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, - @@ R12=__const_ptr_, R14=&block[n] - @@ MAC16(b0, W5, col[5x8]); - @@ MAC16(b2, W7, col[5x8]); - @@ MAC16(b3, W3, col[5x8]); - @@ MAC16(b1, -W1, col[5x8]); - @@ MAC16(b0, W7, col[7x8]); - @@ MAC16(b2, W3, col[7x8]); - @@ MAC16(b3, -W1, col[7x8]); - @@ MAC16(b1, -W5, col[7x8]); - ldrsh r3, [r14, #80] @ R3=COLr16[5x8] - teq r3, #0 @ if 0 then avoid muls - mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 - mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 - mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 - rsbne r3, r3, #0 @ R3=-ROWr16[5x8] - ldrsh r4, [r14, #112] @ R4=COLr16[7x8] - mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1 - @@ R3 is free now - teq r4, #0 @ if 0 then avoid muls - mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 - mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 - rsbne r4, r4, #0 @ R4=-ROWr16[7x8] - mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 - mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 - @@ R4 is free now -__end_b_evaluation2: - @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), - @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - -__a_evaluation2: - @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1)); - @@ a1 = a0 + W6 * row[2]; - @@ a2 = a0 - W6 * row[2]; - @@ a3 = a0 - W2 * row[2]; - @@ a0 = a0 + W2 * row[2]; - ldrsh r6, [r14, #0] - ldr r9, [r12, #offW4] @ R9=W4 - mul r6, r9, r6 @ R6=W4*ROWr16[0] - ldr r10, [r12, #offW6] @ R10=W6 - ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet) - add r6, r6, 
#COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0) - mul r11, r10, r4 @ R11=W6*ROWr16[2] - ldr r8, [r12, #offW2] @ R8=W2 - add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) - sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) - mul r11, r8, r4 @ R11=W2*ROWr16[2] - sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) - add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) - - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - @@ a0 += W4*row[4] - @@ a1 -= W4*row[4] - @@ a2 -= W4*row[4] - @@ a3 += W4*row[4] - ldrsh r11, [r14, #64] @ R11=ROWr16[4] - teq r11, #0 @ if null avoid muls - mulne r11, r9, r11 @ R11=W4*ROWr16[4] - @@ R9 is free now - addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) - subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) - subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) - ldrsh r9, [r14, #96] @ R9=ROWr16[6] - addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) - @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead - teq r9, #0 @ if null avoid muls - mulne r11, r10, r9 @ R11=W6*ROWr16[6] - addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) - mulne r10, r8, r9 @ R10=W2*ROWr16[6] - @@ a0 += W6*row[6]; - @@ a3 -= W6*row[6]; - @@ a1 -= W2*row[6]; - @@ a2 += W2*row[6]; - subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) - subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) - addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) -__end_a_evaluation2: - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - @@ col[0 ] = ((a0 + b0) >> COL_SHIFT); - @@ col[8 ] = ((a1 + b1) >> COL_SHIFT); - @@ col[16] = ((a2 + b2) >> COL_SHIFT); - @@ col[24] = ((a3 + b3) >> COL_SHIFT); - @@ col[32] = ((a3 - b3) >> COL_SHIFT); - @@ col[40] = ((a2 - b2) >> COL_SHIFT); - @@ col[48] = ((a1 - b1) >> COL_SHIFT); - @@ col[56] = ((a0 - b0) >> COL_SHIFT); - @@@@@ no optimization here @@@@@ - add r8, r6, r0 @ R8=a0+b0 - add r9, r2, r1 @ R9=a1+b1 - mov r8, r8, asr 
#COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #0] - strh r9, [r14, #16] - add r8, r3, r5 @ R8=a2+b2 - add r9, r4, r7 @ R9=a3+b3 - mov r8, r8, asr #COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #32] - strh r9, [r14, #48] - sub r8, r4, r7 @ R8=a3-b3 - sub r9, r3, r5 @ R9=a2-b2 - mov r8, r8, asr #COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #64] - strh r9, [r14, #80] - sub r8, r2, r1 @ R8=a1-b1 - sub r9, r6, r0 @ R9=a0-b0 - mov r8, r8, asr #COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #96] - strh r9, [r14, #112] - -__end_col_loop: - @@ at this point, R0-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - ldr r0, [sp, #0] @ R0=block - teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished. - sub r14, r14, #2 - bne __col_loop - - - - -__end_simple_idct_arm: - @@ restore registers to previous status! - add sp, sp, #8 @@ the local variables! - ldmfd sp!, {r4-r11, r15} @@ update PC with LR content. - - - -@@ kind of sub-function, here not to overload the common case. -__end_bef_a_evaluation: - add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) - mul r11, r8, r4 @ R11=W2*ROWr16[2] - sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) - add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) - bal __end_a_evaluation - - -__constant_ptr__: @@ see #defines at the beginning of the source code for values. - .align - .word W1 - .word W2 - .word W3 - .word W4 - .word W5 - .word W6 - .word W7 - .word MASK_MSHW diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv5te.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv5te.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,703 +0,0 @@ -/* - * Simple IDCT - * - * Copyright (c) 2001 Michael Niedermayer - * Copyright (c) 2006 Mans Rullgard - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - -#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define ROW_SHIFT 11 -#define COL_SHIFT 20 - -#define W13 (W1 | (W3 << 16)) -#define W26 (W2 | (W6 << 16)) -#define W57 (W5 | (W7 << 16)) - - .text - .align -w13: .long W13 -w26: .long W26 -w57: .long W57 - -function idct_row_armv5te - str lr, [sp, #-4]! 
- - ldrd v1, [a1, #8] - ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */ - orrs v1, v1, v2 - cmpeq v1, a4 - cmpeq v1, a3, lsr #16 - beq row_dc_only - - mov v1, #(1<<(ROW_SHIFT-1)) - mov ip, #16384 - sub ip, ip, #1 /* ip = W4 */ - smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */ - ldr ip, w26 /* ip = W2 | (W6 << 16) */ - smultb a2, ip, a4 - smulbb lr, ip, a4 - add v2, v1, a2 - sub v3, v1, a2 - sub v4, v1, lr - add v1, v1, lr - - ldr ip, w13 /* ip = W1 | (W3 << 16) */ - ldr lr, w57 /* lr = W5 | (W7 << 16) */ - smulbt v5, ip, a3 - smultt v6, lr, a4 - smlatt v5, ip, a4, v5 - smultt a2, ip, a3 - smulbt v7, lr, a3 - sub v6, v6, a2 - smulbt a2, ip, a4 - smultt fp, lr, a3 - sub v7, v7, a2 - smulbt a2, lr, a4 - ldrd a3, [a1, #8] /* a3=row[5:4] a4=row[7:6] */ - sub fp, fp, a2 - - orrs a2, a3, a4 - beq 1f - - smlabt v5, lr, a3, v5 - smlabt v6, ip, a3, v6 - smlatt v5, lr, a4, v5 - smlabt v6, lr, a4, v6 - smlatt v7, lr, a3, v7 - smlatt fp, ip, a3, fp - smulbt a2, ip, a4 - smlatt v7, ip, a4, v7 - sub fp, fp, a2 - - ldr ip, w26 /* ip = W2 | (W6 << 16) */ - mov a2, #16384 - sub a2, a2, #1 /* a2 = W4 */ - smulbb a2, a2, a3 /* a2 = W4*row[4] */ - smultb lr, ip, a4 /* lr = W6*row[6] */ - add v1, v1, a2 /* v1 += W4*row[4] */ - add v1, v1, lr /* v1 += W6*row[6] */ - add v4, v4, a2 /* v4 += W4*row[4] */ - sub v4, v4, lr /* v4 -= W6*row[6] */ - smulbb lr, ip, a4 /* lr = W2*row[6] */ - sub v2, v2, a2 /* v2 -= W4*row[4] */ - sub v2, v2, lr /* v2 -= W2*row[6] */ - sub v3, v3, a2 /* v3 -= W4*row[4] */ - add v3, v3, lr /* v3 += W2*row[6] */ - -1: add a2, v1, v5 - mov a3, a2, lsr #11 - bic a3, a3, #0x1f0000 - sub a2, v2, v6 - mov a2, a2, lsr #11 - add a3, a3, a2, lsl #16 - add a2, v3, v7 - mov a4, a2, lsr #11 - bic a4, a4, #0x1f0000 - add a2, v4, fp - mov a2, a2, lsr #11 - add a4, a4, a2, lsl #16 - strd a3, [a1] - - sub a2, v4, fp - mov a3, a2, lsr #11 - bic a3, a3, #0x1f0000 - sub a2, v3, v7 - mov a2, a2, lsr #11 - add a3, a3, a2, lsl #16 - add a2, v2, v6 - mov a4, a2, lsr #11 - bic a4, 
a4, #0x1f0000 - sub a2, v1, v5 - mov a2, a2, lsr #11 - add a4, a4, a2, lsl #16 - strd a3, [a1, #8] - - ldr pc, [sp], #4 - -row_dc_only: - orr a3, a3, a3, lsl #16 - bic a3, a3, #0xe000 - mov a3, a3, lsl #3 - mov a4, a3 - strd a3, [a1] - strd a3, [a1, #8] - - ldr pc, [sp], #4 -endfunc - - .macro idct_col - ldr a4, [a1] /* a4 = col[1:0] */ - mov ip, #16384 - sub ip, ip, #1 /* ip = W4 */ -#if 0 - mov v1, #(1<<(COL_SHIFT-1)) - smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */ - smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */ - ldr a4, [a1, #(16*4)] -#else - mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */ - add v2, v1, a4, asr #16 - rsb v2, v2, v2, lsl #14 - mov a4, a4, lsl #16 - add v1, v1, a4, asr #16 - ldr a4, [a1, #(16*4)] - rsb v1, v1, v1, lsl #14 -#endif - - smulbb lr, ip, a4 - smulbt a3, ip, a4 - sub v3, v1, lr - sub v5, v1, lr - add v7, v1, lr - add v1, v1, lr - sub v4, v2, a3 - sub v6, v2, a3 - add fp, v2, a3 - ldr ip, w26 - ldr a4, [a1, #(16*2)] - add v2, v2, a3 - - smulbb lr, ip, a4 - smultb a3, ip, a4 - add v1, v1, lr - sub v7, v7, lr - add v3, v3, a3 - sub v5, v5, a3 - smulbt lr, ip, a4 - smultt a3, ip, a4 - add v2, v2, lr - sub fp, fp, lr - add v4, v4, a3 - ldr a4, [a1, #(16*6)] - sub v6, v6, a3 - - smultb lr, ip, a4 - smulbb a3, ip, a4 - add v1, v1, lr - sub v7, v7, lr - sub v3, v3, a3 - add v5, v5, a3 - smultt lr, ip, a4 - smulbt a3, ip, a4 - add v2, v2, lr - sub fp, fp, lr - sub v4, v4, a3 - add v6, v6, a3 - - stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp} - - ldr ip, w13 - ldr a4, [a1, #(16*1)] - ldr lr, w57 - smulbb v1, ip, a4 - smultb v3, ip, a4 - smulbb v5, lr, a4 - smultb v7, lr, a4 - smulbt v2, ip, a4 - smultt v4, ip, a4 - smulbt v6, lr, a4 - smultt fp, lr, a4 - rsb v4, v4, #0 - ldr a4, [a1, #(16*3)] - rsb v3, v3, #0 - - smlatb v1, ip, a4, v1 - smlatb v3, lr, a4, v3 - smulbb a3, ip, a4 - smulbb a2, lr, a4 - sub v5, v5, a3 - sub v7, v7, a2 - smlatt v2, ip, a4, v2 - smlatt v4, lr, a4, v4 - smulbt a3, ip, 
a4 - smulbt a2, lr, a4 - sub v6, v6, a3 - ldr a4, [a1, #(16*5)] - sub fp, fp, a2 - - smlabb v1, lr, a4, v1 - smlabb v3, ip, a4, v3 - smlatb v5, lr, a4, v5 - smlatb v7, ip, a4, v7 - smlabt v2, lr, a4, v2 - smlabt v4, ip, a4, v4 - smlatt v6, lr, a4, v6 - ldr a3, [a1, #(16*7)] - smlatt fp, ip, a4, fp - - smlatb v1, lr, a3, v1 - smlabb v3, lr, a3, v3 - smlatb v5, ip, a3, v5 - smulbb a4, ip, a3 - smlatt v2, lr, a3, v2 - sub v7, v7, a4 - smlabt v4, lr, a3, v4 - smulbt a4, ip, a3 - smlatt v6, ip, a3, v6 - sub fp, fp, a4 - .endm - -function idct_col_armv5te - str lr, [sp, #-4]! - - idct_col - - ldmfd sp!, {a3, a4} - adds a2, a3, v1 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - add ip, a4, v2 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1] - subs a3, a3, v1 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - sub a4, a4, v2 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*7)] - - subs a2, a3, v3 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - sub ip, a4, v4 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*1)] - adds a3, a3, v3 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - add a4, a4, v4 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*6)] - - adds a2, a3, v5 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - add ip, a4, v6 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*2)] - subs a3, a3, v5 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - sub a4, a4, v6 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*5)] - - adds a2, a3, v7 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - add ip, a4, fp - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*3)] - subs a3, a3, v7 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - sub a4, a4, fp - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - str a2, [a1, #(16*4)] - - ldr pc, [sp], #4 -endfunc - -function idct_col_put_armv5te - str lr, [sp, #-4]! 
- - idct_col - - ldmfd sp!, {a3, a4} - ldr lr, [sp, #32] - add a2, a3, v1 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add ip, a4, v2 - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - sub a3, a3, v1 - movs a3, a3, asr #20 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - sub a4, a4, v2 - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - ldr v1, [sp, #28] - movgt a4, #255 - strh a2, [v1] - add a2, v1, #2 - str a2, [sp, #28] - orr a2, a3, a4, lsl #8 - rsb v2, lr, lr, lsl #3 - ldmfd sp!, {a3, a4} - strh a2, [v2, v1]! - - sub a2, a3, v3 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub ip, a4, v4 - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - strh a2, [v1, lr]! - add a3, a3, v3 - movs a2, a3, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add a4, a4, v4 - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a2, a4, lsl #8 - ldmfd sp!, {a3, a4} - strh a2, [v2, -lr]! - - add a2, a3, v5 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add ip, a4, v6 - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - strh a2, [v1, lr]! - sub a3, a3, v5 - movs a2, a3, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub a4, a4, v6 - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a2, a4, lsl #8 - ldmfd sp!, {a3, a4} - strh a2, [v2, -lr]! 
- - add a2, a3, v7 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add ip, a4, fp - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - strh a2, [v1, lr] - sub a3, a3, v7 - movs a2, a3, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub a4, a4, fp - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a2, a4, lsl #8 - strh a2, [v2, -lr] - - ldr pc, [sp], #4 -endfunc - -function idct_col_add_armv5te - str lr, [sp, #-4]! - - idct_col - - ldr lr, [sp, #36] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr] - add a2, a3, v1 - mov a2, a2, asr #20 - sub a3, a3, v1 - and v1, ip, #255 - adds a2, a2, v1 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add v1, a4, v2 - mov v1, v1, asr #20 - adds v1, v1, ip, lsr #8 - movmi v1, #0 - cmp v1, #255 - movgt v1, #255 - orr a2, a2, v1, lsl #8 - ldr v1, [sp, #32] - sub a4, a4, v2 - rsb v2, v1, v1, lsl #3 - ldrh ip, [v2, lr]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - add a2, lr, #2 - str a2, [sp, #28] - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! - sub a2, a3, v3 - mov a2, a2, asr #20 - add a3, a3, v3 - and v3, ip, #255 - adds a2, a2, v3 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub v3, a4, v4 - mov v3, v3, asr #20 - adds v3, v3, ip, lsr #8 - movmi v3, #0 - cmp v3, #255 - movgt v3, #255 - orr a2, a2, v3, lsl #8 - add a4, a4, v4 - ldrh ip, [v2, -v1]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! 
- add a2, a3, v5 - mov a2, a2, asr #20 - sub a3, a3, v5 - and v3, ip, #255 - adds a2, a2, v3 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add v3, a4, v6 - mov v3, v3, asr #20 - adds v3, v3, ip, lsr #8 - movmi v3, #0 - cmp v3, #255 - movgt v3, #255 - orr a2, a2, v3, lsl #8 - sub a4, a4, v6 - ldrh ip, [v2, -v1]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! - add a2, a3, v7 - mov a2, a2, asr #20 - sub a3, a3, v7 - and v3, ip, #255 - adds a2, a2, v3 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add v3, a4, fp - mov v3, v3, asr #20 - adds v3, v3, ip, lsr #8 - movmi v3, #0 - cmp v3, #255 - movgt v3, #255 - orr a2, a2, v3, lsl #8 - sub a4, a4, fp - ldrh ip, [v2, -v1]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldr pc, [sp], #4 -endfunc - -function ff_simple_idct_armv5te, export=1 - stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} - - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} -endfunc - -function ff_simple_idct_add_armv5te, export=1 - stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} - - mov a1, a3 - - bl idct_row_armv5te - add 
a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - - add sp, sp, #8 - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} -endfunc - -function ff_simple_idct_put_armv5te, export=1 - stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} - - mov a1, a3 - - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - - add sp, sp, #8 - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv6.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv6.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,433 +0,0 @@ -/* - * Simple IDCT - * - * Copyright (c) 2001 Michael Niedermayer - * Copyright (c) 2007 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - -#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define ROW_SHIFT 11 -#define COL_SHIFT 20 - -#define W13 (W1 | (W3 << 16)) -#define W26 (W2 | (W6 << 16)) -#define W42 (W4 | (W2 << 16)) -#define W42n (-W4&0xffff | (-W2 << 16)) -#define W46 (W4 | (W6 << 16)) -#define W57 (W5 | (W7 << 16)) - - .text - .align -w13: .long W13 -w26: .long W26 -w42: .long W42 -w42n: .long W42n -w46: .long W46 -w57: .long W57 - -/* - Compute partial IDCT of single row. 
- shift = left-shift amount - r0 = source address - r2 = row[2,0] <= 2 cycles - r3 = row[3,1] - ip = w42 <= 2 cycles - - Output in registers r4--r11 -*/ - .macro idct_row shift - ldr lr, w46 /* lr = W4 | (W6 << 16) */ - mov r1, #(1<<(\shift-1)) - smlad r4, r2, ip, r1 - smlsd r7, r2, ip, r1 - ldr ip, w13 /* ip = W1 | (W3 << 16) */ - ldr r10,w57 /* r10 = W5 | (W7 << 16) */ - smlad r5, r2, lr, r1 - smlsd r6, r2, lr, r1 - - smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ - smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ - ldr lr, [r0, #12] /* lr = row[7,5] */ - pkhtb r2, ip, r10,asr #16 /* r3 = W7 | (W3 << 16) */ - pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ - smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ - smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */ - smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ - - ldr r3, w42n /* r3 = -W4 | (-W2 << 16) */ - smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */ - ldr r2, [r0, #4] /* r2 = row[6,4] */ - smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */ - ldr ip, w46 /* ip = W4 | (W6 << 16) */ - smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */ - - smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */ - smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */ - smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */ - smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */ - .endm - -/* - Compute partial IDCT of half row. 
- shift = left-shift amount - r2 = row[2,0] - r3 = row[3,1] - ip = w42 - - Output in registers r4--r11 -*/ - .macro idct_row4 shift - ldr lr, w46 /* lr = W4 | (W6 << 16) */ - ldr r10,w57 /* r10 = W5 | (W7 << 16) */ - mov r1, #(1<<(\shift-1)) - smlad r4, r2, ip, r1 - smlsd r7, r2, ip, r1 - ldr ip, w13 /* ip = W1 | (W3 << 16) */ - smlad r5, r2, lr, r1 - smlsd r6, r2, lr, r1 - smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ - smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ - pkhtb r2, ip, r10,asr #16 /* r3 = W7 | (W3 << 16) */ - pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ - smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ - smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ - .endm - -/* - Compute final part of IDCT single row without shift. - Input in registers r4--r11 - Output in registers ip, r4--r6, lr, r8--r10 -*/ - .macro idct_finish - add ip, r4, r8 /* r1 = A0 + B0 */ - sub lr, r4, r8 /* r2 = A0 - B0 */ - sub r4, r5, r9 /* r2 = A1 + B1 */ - add r8, r5, r9 /* r2 = A1 - B1 */ - add r5, r6, r10 /* r1 = A2 + B2 */ - sub r9, r6, r10 /* r1 = A2 - B2 */ - add r6, r7, r11 /* r2 = A3 + B3 */ - sub r10,r7, r11 /* r2 = A3 - B3 */ - .endm - -/* - Compute final part of IDCT single row. - shift = right-shift amount - Input/output in registers r4--r11 -*/ - .macro idct_finish_shift shift - add r3, r4, r8 /* r3 = A0 + B0 */ - sub r2, r4, r8 /* r2 = A0 - B0 */ - mov r4, r3, asr #\shift - mov r8, r2, asr #\shift - - sub r3, r5, r9 /* r3 = A1 + B1 */ - add r2, r5, r9 /* r2 = A1 - B1 */ - mov r5, r3, asr #\shift - mov r9, r2, asr #\shift - - add r3, r6, r10 /* r3 = A2 + B2 */ - sub r2, r6, r10 /* r2 = A2 - B2 */ - mov r6, r3, asr #\shift - mov r10,r2, asr #\shift - - add r3, r7, r11 /* r3 = A3 + B3 */ - sub r2, r7, r11 /* r2 = A3 - B3 */ - mov r7, r3, asr #\shift - mov r11,r2, asr #\shift - .endm - -/* - Compute final part of IDCT single row, saturating results at 8 bits. 
- shift = right-shift amount - Input/output in registers r4--r11 -*/ - .macro idct_finish_shift_sat shift - add r3, r4, r8 /* r3 = A0 + B0 */ - sub ip, r4, r8 /* ip = A0 - B0 */ - usat r4, #8, r3, asr #\shift - usat r8, #8, ip, asr #\shift - - sub r3, r5, r9 /* r3 = A1 + B1 */ - add ip, r5, r9 /* ip = A1 - B1 */ - usat r5, #8, r3, asr #\shift - usat r9, #8, ip, asr #\shift - - add r3, r6, r10 /* r3 = A2 + B2 */ - sub ip, r6, r10 /* ip = A2 - B2 */ - usat r6, #8, r3, asr #\shift - usat r10,#8, ip, asr #\shift - - add r3, r7, r11 /* r3 = A3 + B3 */ - sub ip, r7, r11 /* ip = A3 - B3 */ - usat r7, #8, r3, asr #\shift - usat r11,#8, ip, asr #\shift - .endm - -/* - Compute IDCT of single row, storing as column. - r0 = source - r1 = dest -*/ -function idct_row_armv6 - push {lr} - - ldr lr, [r0, #12] /* lr = row[7,5] */ - ldr ip, [r0, #4] /* ip = row[6,4] */ - ldr r3, [r0, #8] /* r3 = row[3,1] */ - ldr r2, [r0] /* r2 = row[2,0] */ - orrs lr, lr, ip - cmpeq lr, r3 - cmpeq lr, r2, lsr #16 - beq 1f - push {r1} - ldr ip, w42 /* ip = W4 | (W2 << 16) */ - cmp lr, #0 - beq 2f - - idct_row ROW_SHIFT - b 3f - -2: idct_row4 ROW_SHIFT - -3: pop {r1} - idct_finish_shift ROW_SHIFT - - strh r4, [r1] - strh r5, [r1, #(16*2)] - strh r6, [r1, #(16*4)] - strh r7, [r1, #(16*6)] - strh r11,[r1, #(16*1)] - strh r10,[r1, #(16*3)] - strh r9, [r1, #(16*5)] - strh r8, [r1, #(16*7)] - - pop {pc} - -1: mov r2, r2, lsl #3 - strh r2, [r1] - strh r2, [r1, #(16*2)] - strh r2, [r1, #(16*4)] - strh r2, [r1, #(16*6)] - strh r2, [r1, #(16*1)] - strh r2, [r1, #(16*3)] - strh r2, [r1, #(16*5)] - strh r2, [r1, #(16*7)] - pop {pc} -endfunc - -/* - Compute IDCT of single column, read as row. 
- r0 = source - r1 = dest -*/ -function idct_col_armv6 - push {r1, lr} - - ldr r2, [r0] /* r2 = row[2,0] */ - ldr ip, w42 /* ip = W4 | (W2 << 16) */ - ldr r3, [r0, #8] /* r3 = row[3,1] */ - idct_row COL_SHIFT - pop {r1} - idct_finish_shift COL_SHIFT - - strh r4, [r1] - strh r5, [r1, #(16*1)] - strh r6, [r1, #(16*2)] - strh r7, [r1, #(16*3)] - strh r11,[r1, #(16*4)] - strh r10,[r1, #(16*5)] - strh r9, [r1, #(16*6)] - strh r8, [r1, #(16*7)] - - pop {pc} -endfunc - -/* - Compute IDCT of single column, read as row, store saturated 8-bit. - r0 = source - r1 = dest - r2 = line size -*/ -function idct_col_put_armv6 - push {r1, r2, lr} - - ldr r2, [r0] /* r2 = row[2,0] */ - ldr ip, w42 /* ip = W4 | (W2 << 16) */ - ldr r3, [r0, #8] /* r3 = row[3,1] */ - idct_row COL_SHIFT - pop {r1, r2} - idct_finish_shift_sat COL_SHIFT - - strb r4, [r1], r2 - strb r5, [r1], r2 - strb r6, [r1], r2 - strb r7, [r1], r2 - strb r11,[r1], r2 - strb r10,[r1], r2 - strb r9, [r1], r2 - strb r8, [r1], r2 - - sub r1, r1, r2, lsl #3 - - pop {pc} -endfunc - -/* - Compute IDCT of single column, read as row, add/store saturated 8-bit. 
- r0 = source - r1 = dest - r2 = line size -*/ -function idct_col_add_armv6 - push {r1, r2, lr} - - ldr r2, [r0] /* r2 = row[2,0] */ - ldr ip, w42 /* ip = W4 | (W2 << 16) */ - ldr r3, [r0, #8] /* r3 = row[3,1] */ - idct_row COL_SHIFT - pop {r1, r2} - idct_finish - - ldrb r3, [r1] - ldrb r7, [r1, r2] - ldrb r11,[r1, r2, lsl #2] - add ip, r3, ip, asr #COL_SHIFT - usat ip, #8, ip - add r4, r7, r4, asr #COL_SHIFT - strb ip, [r1], r2 - ldrb ip, [r1, r2] - usat r4, #8, r4 - ldrb r11,[r1, r2, lsl #2] - add r5, ip, r5, asr #COL_SHIFT - usat r5, #8, r5 - strb r4, [r1], r2 - ldrb r3, [r1, r2] - ldrb ip, [r1, r2, lsl #2] - strb r5, [r1], r2 - ldrb r7, [r1, r2] - ldrb r4, [r1, r2, lsl #2] - add r6, r3, r6, asr #COL_SHIFT - usat r6, #8, r6 - add r10,r7, r10,asr #COL_SHIFT - usat r10,#8, r10 - add r9, r11,r9, asr #COL_SHIFT - usat r9, #8, r9 - add r8, ip, r8, asr #COL_SHIFT - usat r8, #8, r8 - add lr, r4, lr, asr #COL_SHIFT - usat lr, #8, lr - strb r6, [r1], r2 - strb r10,[r1], r2 - strb r9, [r1], r2 - strb r8, [r1], r2 - strb lr, [r1], r2 - - sub r1, r1, r2, lsl #3 - - pop {pc} -endfunc - -/* - Compute 8 IDCT row transforms. 
- func = IDCT row->col function - width = width of columns in bytes -*/ - .macro idct_rows func width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - sub r0, r0, #(16*5) - add r1, r1, #\width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - - sub r0, r0, #(16*7) - .endm - -/* void ff_simple_idct_armv6(DCTELEM *data); */ -function ff_simple_idct_armv6, export=1 - push {r4-r11, lr} - sub sp, sp, #128 - - mov r1, sp - idct_rows idct_row_armv6, 2 - mov r1, r0 - mov r0, sp - idct_rows idct_col_armv6, 2 - - add sp, sp, #128 - pop {r4-r11, pc} -endfunc - -/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ -function ff_simple_idct_add_armv6, export=1 - push {r0, r1, r4-r11, lr} - sub sp, sp, #128 - - mov r0, r2 - mov r1, sp - idct_rows idct_row_armv6, 2 - mov r0, sp - ldr r1, [sp, #128] - ldr r2, [sp, #(128+4)] - idct_rows idct_col_add_armv6, 1 - - add sp, sp, #(128+8) - pop {r4-r11, pc} -endfunc - -/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ -function ff_simple_idct_put_armv6, export=1 - push {r0, r1, r4-r11, lr} - sub sp, sp, #128 - - mov r0, r2 - mov r1, sp - idct_rows idct_row_armv6, 2 - mov r0, sp - ldr r1, [sp, #128] - ldr r2, [sp, #(128+4)] - idct_rows idct_col_put_armv6, 1 - - add sp, sp, #(128+8) - pop {r4-r11, pc} -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_neon.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_neon.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,373 +0,0 @@ -/* - * ARM NEON IDCT - * - * Copyright (c) 2008 Mans Rullgard - * - * Based on Simple IDCT - * Copyright (c) 2001 Michael Niedermayer - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - -#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W4c ((1<<(COL_SHIFT-1))/W4) -#define ROW_SHIFT 11 -#define COL_SHIFT 20 - -#define w1 d0[0] -#define w2 d0[1] -#define w3 d0[2] -#define w4 d0[3] -#define w5 d1[0] -#define w6 d1[1] -#define w7 d1[2] -#define w4c d1[3] - - .macro idct_col4_top - vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */ - vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */ - vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ - vadd.i32 q11, q15, q7 - vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ - vadd.i32 q12, q15, q8 - vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ - vsub.i32 q13, q15, q8 - vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ - vsub.i32 q14, q15, q7 - - vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ - vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ - vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ - vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ - .endm - - .text 
- .align 6 - -function idct_row4_pld_neon - pld [r0] - add r3, r0, r1, lsl #2 - pld [r0, r1] - pld [r0, r1, lsl #1] - pld [r3, -r1] - pld [r3] - pld [r3, r1] - add r3, r3, r1, lsl #1 - pld [r3] - pld [r3, r1] -endfunc - -function idct_row4_neon - vmov.i32 q15, #(1<<(ROW_SHIFT-1)) - vld1.64 {d2-d5}, [r2,:128]! - vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ - vld1.64 {d6,d7}, [r2,:128]! - vorr d10, d3, d5 - vld1.64 {d8,d9}, [r2,:128]! - add r2, r2, #-64 - - vorr d11, d7, d9 - vorr d10, d10, d11 - vmov r3, r4, d10 - - idct_col4_top - - orrs r3, r3, r4 - beq 1f - - vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ - vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ - vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ - vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ - vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ - vadd.i32 q11, q11, q7 - vsub.i32 q12, q12, q7 - vsub.i32 q13, q13, q7 - vadd.i32 q14, q14, q7 - vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ - vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ - vmlal.s16 q9, d9, w7 - vmlsl.s16 q10, d9, w5 - vmlal.s16 q5, d9, w3 - vmlsl.s16 q6, d9, w1 - vadd.i32 q11, q11, q7 - vsub.i32 q12, q12, q8 - vadd.i32 q13, q13, q8 - vsub.i32 q14, q14, q7 - -1: vadd.i32 q3, q11, q9 - vadd.i32 q4, q12, q10 - vshrn.i32 d2, q3, #ROW_SHIFT - vshrn.i32 d4, q4, #ROW_SHIFT - vadd.i32 q7, q13, q5 - vadd.i32 q8, q14, q6 - vtrn.16 d2, d4 - vshrn.i32 d6, q7, #ROW_SHIFT - vshrn.i32 d8, q8, #ROW_SHIFT - vsub.i32 q14, q14, q6 - vsub.i32 q11, q11, q9 - vtrn.16 d6, d8 - vsub.i32 q13, q13, q5 - vshrn.i32 d3, q14, #ROW_SHIFT - vtrn.32 d2, d6 - vsub.i32 q12, q12, q10 - vtrn.32 d4, d8 - vshrn.i32 d5, q13, #ROW_SHIFT - vshrn.i32 d7, q12, #ROW_SHIFT - vshrn.i32 d9, q11, #ROW_SHIFT - - vtrn.16 d3, d5 - vtrn.16 d7, d9 - vtrn.32 d3, d7 - vtrn.32 d5, d9 - - vst1.64 {d2-d5}, [r2,:128]! - vst1.64 {d6-d9}, [r2,:128]! 
- - bx lr -endfunc - -function idct_col4_neon - mov ip, #16 - vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ - vdup.16 d30, w4c - vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */ - vadd.i16 d30, d30, d2 - vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */ - vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1< - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - - preserve8 - -function ff_synth_filter_float_neon, export=1 - push {r3-r11,lr} - - ldr r4, [r2] @ synth_buf_offset - add r1, r1, r4, lsl #2 @ synth_buf - sub r12, r4, #32 - bfc r12, #9, #23 - bic r4, r4, #63 - str r12, [r2] - - ldr r2, [sp, #12*4] @ in - mov r9, r1 @ synth_buf - -VFP vpush {d0} - bl ff_imdct_half_neon -VFP vpop {d0} - pop {r3} - - ldr r5, [sp, #9*4] @ window - ldr r2, [sp, #10*4] @ out -NOVFP vldr d0, [sp, #12*4] @ scale, bias - add r8, r9, #12*4 - - mov lr, #64*4 - mov r1, #4 -1: - add r10, r9, #16*4 @ synth_buf - add r11, r8, #16*4 - add r0, r5, #16*4 @ window - add r6, r5, #32*4 - add r7, r5, #48*4 - - vld1.32 {q10}, [r3,:128] @ a - add r3, r3, #16*4 - vld1.32 {q1}, [r3,:128] @ b - vmov.f32 q2, #0.0 @ c - vmov.f32 q3, #0.0 @ d - - mov r12, #512 -2: - vld1.32 {q9}, [r8, :128], lr - vrev64.32 q9, q9 - vld1.32 {q8}, [r5, :128], lr - vmls.f32 d20, d16, d19 - vld1.32 {q11}, 
[r0, :128], lr - vmls.f32 d21, d17, d18 - vld1.32 {q12}, [r9, :128], lr - vmla.f32 d2, d22, d24 - vld1.32 {q8}, [r6, :128], lr - vmla.f32 d3, d23, d25 - vld1.32 {q9}, [r10,:128], lr - vmla.f32 d4, d16, d18 - vld1.32 {q12}, [r11,:128], lr - vmla.f32 d5, d17, d19 - vrev64.32 q12, q12 - vld1.32 {q11}, [r7, :128], lr - vmla.f32 d6, d22, d25 - vmla.f32 d7, d23, d24 - subs r12, r12, #64 - beq 3f - cmp r12, r4 - bne 2b - sub r8, r8, #512*4 - sub r9, r9, #512*4 - sub r10, r10, #512*4 - sub r11, r11, #512*4 - b 2b -3: - vdup.32 q8, d0[1] - vdup.32 q9, d0[1] - vmla.f32 q8, q10, d0[0] - vmla.f32 q9, q1, d0[0] - vst1.32 {q3}, [r3,:128] - sub r3, r3, #16*4 - vst1.32 {q2}, [r3,:128] - vst1.32 {q8}, [r2,:128] - add r2, r2, #16*4 - vst1.32 {q9}, [r2,:128] - - subs r1, r1, #1 - popeq {r4-r11,pc} - - cmp r4, #0 - subeq r8, r8, #512*4 - subeq r9, r9, #512*4 - sub r5, r5, #512*4 - sub r2, r2, #12*4 @ out - add r3, r3, #4*4 @ synth_buf2 - add r5, r5, #4*4 @ window - add r9, r9, #4*4 @ synth_buf - sub r8, r8, #4*4 @ synth_buf - b 1b -endfunc diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/arm/vp3dsp_neon.S --- a/ffmpeg_smp/h264dec/libavcodec/arm/vp3dsp_neon.S Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,420 +0,0 @@ -/* - * Copyright (c) 2009 David Conrad - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "asm.S" - -.section .rodata -.align 4 - -vp3_idct_constants: -.short 64277, 60547, 54491, 46341, 36410, 25080, 12785 - -#define xC1S7 d0[0] -#define xC2S6 d0[1] -#define xC3S5 d0[2] -#define xC4S4 d0[3] -#define xC5S3 d1[0] -#define xC6S2 d1[1] -#define xC7S1 d1[2] - -.text - -.macro vp3_loop_filter - vsubl.u8 q3, d18, d17 - vsubl.u8 q2, d16, d19 - vadd.i16 q1, q3, q3 - vadd.i16 q2, q2, q3 - vadd.i16 q0, q1, q2 - vrshr.s16 q0, q0, #3 - vmovl.u8 q9, d18 - vdup.u16 q15, r2 - - vabs.s16 q1, q0 - vshr.s16 q0, q0, #15 - vqsub.u16 q2, q15, q1 - vqsub.u16 q3, q2, q1 - vsub.i16 q1, q2, q3 - veor q1, q1, q0 - vsub.i16 q0, q1, q0 - - vaddw.u8 q2, q0, d17 - vsub.i16 q3, q9, q0 - vqmovun.s16 d0, q2 - vqmovun.s16 d1, q3 -.endm - -function ff_vp3_v_loop_filter_neon, export=1 - sub ip, r0, r1 - sub r0, r0, r1, lsl #1 - vld1.64 {d16}, [r0,:64], r1 - vld1.64 {d17}, [r0,:64], r1 - vld1.64 {d18}, [r0,:64], r1 - vld1.64 {d19}, [r0,:64], r1 - ldrb r2, [r2, #129*4] - - vp3_loop_filter - - vst1.64 {d0}, [ip,:64], r1 - vst1.64 {d1}, [ip,:64], r1 - bx lr -endfunc - -function ff_vp3_h_loop_filter_neon, export=1 - sub ip, r0, #1 - sub r0, r0, #2 - vld1.32 {d16[]}, [r0], r1 - vld1.32 {d17[]}, [r0], r1 - vld1.32 {d18[]}, [r0], r1 - vld1.32 {d19[]}, [r0], r1 - vld1.32 {d16[1]}, [r0], r1 - vld1.32 {d17[1]}, [r0], r1 - vld1.32 {d18[1]}, [r0], r1 - vld1.32 {d19[1]}, [r0], r1 - ldrb r2, [r2, #129*4] - - vtrn.8 d16, d17 - vtrn.8 d18, d19 - vtrn.16 d16, d18 - vtrn.16 d17, d19 - - vp3_loop_filter - - vtrn.8 d0, d1 - - vst1.16 {d0[0]}, [ip], r1 - vst1.16 {d1[0]}, [ip], r1 - vst1.16 {d0[1]}, [ip], r1 - vst1.16 {d1[1]}, [ip], r1 - vst1.16 {d0[2]}, [ip], r1 - vst1.16 {d1[2]}, [ip], r1 - vst1.16 {d0[3]}, [ip], r1 - vst1.16 {d1[3]}, [ip], r1 - bx lr -endfunc - - -function 
vp3_idct_start_neon - vpush {d8-d15} - movrel r3, vp3_idct_constants - vld1.64 {d0-d1}, [r3,:128] - vld1.64 {d16-d19}, [r2,:128]! - vld1.64 {d20-d23}, [r2,:128]! - vld1.64 {d24-d27}, [r2,:128]! - vadd.s16 q1, q8, q12 - vsub.s16 q8, q8, q12 - vld1.64 {d28-d31}, [r2,:128]! -endfunc - -function vp3_idct_core_neon - vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16 - vmull.s16 q3, d19, xC1S7 - vmull.s16 q4, d2, xC4S4 // ((ip[0] + ip[4]) * C4) << 16 - vmull.s16 q5, d3, xC4S4 - vmull.s16 q6, d16, xC4S4 // ((ip[0] - ip[4]) * C4) << 16 - vmull.s16 q7, d17, xC4S4 - vshrn.s32 d4, q2, #16 - vshrn.s32 d5, q3, #16 - vshrn.s32 d6, q4, #16 - vshrn.s32 d7, q5, #16 - vshrn.s32 d8, q6, #16 - vshrn.s32 d9, q7, #16 - vadd.s16 q12, q1, q3 // E = (ip[0] + ip[4]) * C4 - vadd.s16 q8, q8, q4 // F = (ip[0] - ip[4]) * C4 - vadd.s16 q1, q2, q9 // ip[1] * C1 - - vmull.s16 q2, d30, xC1S7 // (ip[7] * C1) << 16 - vmull.s16 q3, d31, xC1S7 - vmull.s16 q4, d30, xC7S1 // (ip[7] * C7) << 16 - vmull.s16 q5, d31, xC7S1 - vmull.s16 q6, d18, xC7S1 // (ip[1] * C7) << 16 - vmull.s16 q7, d19, xC7S1 - vshrn.s32 d4, q2, #16 - vshrn.s32 d5, q3, #16 - vshrn.s32 d6, q4, #16 // ip[7] * C7 - vshrn.s32 d7, q5, #16 - vshrn.s32 d8, q6, #16 // ip[1] * C7 - vshrn.s32 d9, q7, #16 - vadd.s16 q2, q2, q15 // ip[7] * C1 - vadd.s16 q9, q1, q3 // A = ip[1] * C1 + ip[7] * C7 - vsub.s16 q15, q4, q2 // B = ip[1] * C7 - ip[7] * C1 - - vmull.s16 q2, d22, xC5S3 // (ip[3] * C5) << 16 - vmull.s16 q3, d23, xC5S3 - vmull.s16 q4, d22, xC3S5 // (ip[3] * C3) << 16 - vmull.s16 q5, d23, xC3S5 - vmull.s16 q6, d26, xC5S3 // (ip[5] * C5) << 16 - vmull.s16 q7, d27, xC5S3 - vshrn.s32 d4, q2, #16 - vshrn.s32 d5, q3, #16 - vshrn.s32 d6, q4, #16 - vshrn.s32 d7, q5, #16 - vshrn.s32 d8, q6, #16 - vshrn.s32 d9, q7, #16 - vadd.s16 q3, q3, q11 // ip[3] * C3 - vadd.s16 q4, q4, q13 // ip[5] * C5 - vadd.s16 q1, q2, q11 // ip[3] * C5 - vadd.s16 q11, q3, q4 // C = ip[3] * C3 + ip[5] * C5 - - vmull.s16 q2, d26, xC3S5 // (ip[5] * C3) << 16 - vmull.s16 q3, d27, 
xC3S5 - vmull.s16 q4, d20, xC2S6 // (ip[2] * C2) << 16 - vmull.s16 q5, d21, xC2S6 - vmull.s16 q6, d28, xC6S2 // (ip[6] * C6) << 16 - vmull.s16 q7, d29, xC6S2 - vshrn.s32 d4, q2, #16 - vshrn.s32 d5, q3, #16 - vshrn.s32 d6, q4, #16 - vshrn.s32 d7, q5, #16 - vshrn.s32 d8, q6, #16 // ip[6] * C6 - vshrn.s32 d9, q7, #16 - vadd.s16 q2, q2, q13 // ip[5] * C3 - vadd.s16 q3, q3, q10 // ip[2] * C2 - vsub.s16 q13, q2, q1 // D = ip[5] * C3 - ip[3] * C5 - vsub.s16 q1, q9, q11 // (A - C) - vadd.s16 q11, q9, q11 // Cd = A + C - vsub.s16 q9, q15, q13 // (B - D) - vadd.s16 q13, q15, q13 // Dd = B + D - vadd.s16 q15, q3, q4 // G = ip[2] * C2 + ip[6] * C6 - - vmull.s16 q2, d2, xC4S4 // ((A - C) * C4) << 16 - vmull.s16 q3, d3, xC4S4 - vmull.s16 q4, d28, xC2S6 // (ip[6] * C2) << 16 - vmull.s16 q5, d29, xC2S6 - vmull.s16 q6, d20, xC6S2 // (ip[2] * C6) << 16 - vmull.s16 q7, d21, xC6S2 - vshrn.s32 d4, q2, #16 - vshrn.s32 d5, q3, #16 - vshrn.s32 d6, q4, #16 - vshrn.s32 d7, q5, #16 - vshrn.s32 d8, q6, #16 // ip[2] * C6 - vmull.s16 q5, d18, xC4S4 // ((B - D) * C4) << 16 - vmull.s16 q6, d19, xC4S4 - vshrn.s32 d9, q7, #16 - vadd.s16 q3, q3, q14 // ip[6] * C2 - vadd.s16 q10, q1, q2 // Ad = (A - C) * C4 - vsub.s16 q14, q4, q3 // H = ip[2] * C6 - ip[6] * C2 - bx lr -endfunc - -.macro VP3_IDCT_END type -function vp3_idct_end_\type\()_neon -.ifc \type, col - vdup.16 q0, r3 - vadd.s16 q12, q12, q0 - vadd.s16 q8, q8, q0 -.endif - - vshrn.s32 d2, q5, #16 - vshrn.s32 d3, q6, #16 - vadd.s16 q2, q12, q15 // Gd = E + G - vadd.s16 q9, q1, q9 // (B - D) * C4 - vsub.s16 q12, q12, q15 // Ed = E - G - vsub.s16 q3, q8, q10 // Fd = F - Ad - vadd.s16 q10, q8, q10 // Add = F + Ad - vadd.s16 q4, q9, q14 // Hd = Bd + H - vsub.s16 q14, q9, q14 // Bdd = Bd - H - vadd.s16 q8, q2, q11 // [0] = Gd + Cd - vsub.s16 q15, q2, q11 // [7] = Gd - Cd - vadd.s16 q9, q10, q4 // [1] = Add + Hd - vsub.s16 q10, q10, q4 // [2] = Add - Hd - vadd.s16 q11, q12, q13 // [3] = Ed + Dd - vsub.s16 q12, q12, q13 // [4] = Ed - Dd -.ifc \type, 
row - vtrn.16 q8, q9 -.endif - vadd.s16 q13, q3, q14 // [5] = Fd + Bdd - vsub.s16 q14, q3, q14 // [6] = Fd - Bdd - -.ifc \type, row - // 8x8 transpose - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vswp d17, d24 - vswp d19, d26 - vadd.s16 q1, q8, q12 - vswp d21, d28 - vsub.s16 q8, q8, q12 - vswp d23, d30 -.endif - bx lr -endfunc -.endm - -VP3_IDCT_END row -VP3_IDCT_END col - -function ff_vp3_idct_neon, export=1 - mov ip, lr - mov r2, r0 - bl vp3_idct_start_neon - bl vp3_idct_end_row_neon - mov r3, #8 - bl vp3_idct_core_neon - bl vp3_idct_end_col_neon - mov lr, ip - vpop {d8-d15} - - vshr.s16 q8, q8, #4 - vshr.s16 q9, q9, #4 - vshr.s16 q10, q10, #4 - vshr.s16 q11, q11, #4 - vshr.s16 q12, q12, #4 - vst1.64 {d16-d19}, [r0,:128]! - vshr.s16 q13, q13, #4 - vshr.s16 q14, q14, #4 - vst1.64 {d20-d23}, [r0,:128]! - vshr.s16 q15, q15, #4 - vst1.64 {d24-d27}, [r0,:128]! - vst1.64 {d28-d31}, [r0,:128]! - bx lr -endfunc - -function ff_vp3_idct_put_neon, export=1 - mov ip, lr - bl vp3_idct_start_neon - bl vp3_idct_end_row_neon - mov r3, #8 - add r3, r3, #2048 // convert signed pixel to unsigned - bl vp3_idct_core_neon - bl vp3_idct_end_col_neon - mov lr, ip - vpop {d8-d15} - - vqshrun.s16 d0, q8, #4 - vqshrun.s16 d1, q9, #4 - vqshrun.s16 d2, q10, #4 - vqshrun.s16 d3, q11, #4 - vst1.64 {d0}, [r0,:64], r1 - vqshrun.s16 d4, q12, #4 - vst1.64 {d1}, [r0,:64], r1 - vqshrun.s16 d5, q13, #4 - vst1.64 {d2}, [r0,:64], r1 - vqshrun.s16 d6, q14, #4 - vst1.64 {d3}, [r0,:64], r1 - vqshrun.s16 d7, q15, #4 - vst1.64 {d4}, [r0,:64], r1 - vst1.64 {d5}, [r0,:64], r1 - vst1.64 {d6}, [r0,:64], r1 - vst1.64 {d7}, [r0,:64], r1 - bx lr -endfunc - -function ff_vp3_idct_add_neon, export=1 - mov ip, lr - bl vp3_idct_start_neon - bl vp3_idct_end_row_neon - mov r3, #8 - bl vp3_idct_core_neon - bl vp3_idct_end_col_neon - mov lr, ip - vpop {d8-d15} - mov r2, r0 - - vld1.64 {d0}, [r0,:64], r1 - vshr.s16 q8, q8, #4 - vld1.64 
{d1}, [r0,:64], r1 - vshr.s16 q9, q9, #4 - vld1.64 {d2}, [r0,:64], r1 - vaddw.u8 q8, q8, d0 - vld1.64 {d3}, [r0,:64], r1 - vaddw.u8 q9, q9, d1 - vld1.64 {d4}, [r0,:64], r1 - vshr.s16 q10, q10, #4 - vld1.64 {d5}, [r0,:64], r1 - vshr.s16 q11, q11, #4 - vld1.64 {d6}, [r0,:64], r1 - vqmovun.s16 d0, q8 - vld1.64 {d7}, [r0,:64], r1 - vqmovun.s16 d1, q9 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vshr.s16 q12, q12, #4 - vshr.s16 q13, q13, #4 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vshr.s16 q14, q14, #4 - vshr.s16 q15, q15, #4 - vst1.64 {d0}, [r2,:64], r1 - vqmovun.s16 d4, q12 - vst1.64 {d1}, [r2,:64], r1 - vqmovun.s16 d5, q13 - vst1.64 {d2}, [r2,:64], r1 - vaddw.u8 q14, q14, d6 - vst1.64 {d3}, [r2,:64], r1 - vaddw.u8 q15, q15, d7 - vst1.64 {d4}, [r2,:64], r1 - vqmovun.s16 d6, q14 - vst1.64 {d5}, [r2,:64], r1 - vqmovun.s16 d7, q15 - vst1.64 {d6}, [r2,:64], r1 - vst1.64 {d7}, [r2,:64], r1 - bx lr -endfunc - -function ff_vp3_idct_dc_add_neon, export=1 - ldrsh r2, [r2] - movw r3, #46341 - mul r2, r3, r2 - smulwt r2, r3, r2 - mov r3, r0 - vdup.16 q15, r2 - vrshr.s16 q15, q15, #4 - - vld1.8 {d0}, [r0,:64], r1 - vld1.8 {d1}, [r0,:64], r1 - vld1.8 {d2}, [r0,:64], r1 - vaddw.u8 q8, q15, d0 - vld1.8 {d3}, [r0,:64], r1 - vaddw.u8 q9, q15, d1 - vld1.8 {d4}, [r0,:64], r1 - vaddw.u8 q10, q15, d2 - vld1.8 {d5}, [r0,:64], r1 - vaddw.u8 q11, q15, d3 - vld1.8 {d6}, [r0,:64], r1 - vaddw.u8 q12, q15, d4 - vld1.8 {d7}, [r0,:64], r1 - vaddw.u8 q13, q15, d5 - vqmovun.s16 d0, q8 - vaddw.u8 q14, q15, d6 - vqmovun.s16 d1, q9 - vaddw.u8 q15, q15, d7 - vqmovun.s16 d2, q10 - vst1.8 {d0}, [r3,:64], r1 - vqmovun.s16 d3, q11 - vst1.8 {d1}, [r3,:64], r1 - vqmovun.s16 d4, q12 - vst1.8 {d2}, [r3,:64], r1 - vqmovun.s16 d5, q13 - vst1.8 {d3}, [r3,:64], r1 - vqmovun.s16 d6, q14 - vst1.8 {d4}, [r3,:64], r1 - vqmovun.s16 d7, q15 - vst1.8 {d5}, [r3,:64], r1 - vst1.8 {d6}, [r3,:64], r1 - vst1.8 {d7}, [r3,:64], r1 - bx lr -endfunc diff -r 11d15c47beaf -r 
897f711a7157 ffmpeg_smp/h264dec/libavcodec/avcodec.h --- a/ffmpeg_smp/h264dec/libavcodec/avcodec.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,407 +0,0 @@ -#ifndef AVCODEC_AVCODEC_H -#define AVCODEC_AVCODEC_H - -#include -#include -#include "config.h" - -#include "libavutil/mem.h" - -#define MAX_SPS_COUNT 32 -#define MAX_PPS_COUNT 256 - - -#ifndef CABAC -#define CABAC h->pps.cabac -#endif - -#define EXTENDED_SAR 255 - -#define MB_TYPE_REF0 MB_TYPE_ACPRED //dirty but it fits in 16 bit -#define MB_TYPE_8x8DCT 0x01000000 -#define IS_REF0(a) ((a) & MB_TYPE_REF0) -#define IS_8x8DCT(a) ((a) & MB_TYPE_8x8DCT) - -#define LIST_NOT_USED -1 -#define PART_NOT_AVAILABLE -2 - -/* dct code */ -typedef short DCTELEM; - -/** -* Required number of additionally allocated bytes at the end of the input bitstream for decoding. -* This is mainly needed because some optimized bitstream readers read -* 32 or 64 bit at once and could read over the end.
-* Note: If the first 23 bits of the additional bytes are not 0, then damaged -* MPEG bitstreams could cause overread and segfault. -*/ -#define FF_INPUT_BUFFER_PADDING_SIZE 8 - -enum AVColorPrimaries{ - AVCOL_PRI_BT709 =1, ///< also ITU-R BT1361 / IEC 61966-2-4 / SMPTE RP177 Annex B - AVCOL_PRI_UNSPECIFIED=2, - AVCOL_PRI_BT470M =4, - AVCOL_PRI_BT470BG =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM - AVCOL_PRI_SMPTE170M =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC - AVCOL_PRI_SMPTE240M =7, ///< functionally identical to above - AVCOL_PRI_FILM =8, - AVCOL_PRI_NB , ///< Not part of ABI -}; - -enum AVColorTransferCharacteristic{ - AVCOL_TRC_BT709 =1, ///< also ITU-R BT1361 - AVCOL_TRC_UNSPECIFIED=2, - AVCOL_TRC_GAMMA22 =4, ///< also ITU-R BT470M / ITU-R BT1700 625 PAL & SECAM - AVCOL_TRC_GAMMA28 =5, ///< also ITU-R BT470BG - AVCOL_TRC_NB , ///< Not part of ABI -}; - -enum AVColorSpace{ - AVCOL_SPC_RGB =0, - AVCOL_SPC_BT709 =1, ///< also ITU-R BT1361 / IEC 61966-2-4 xvYCC709 / SMPTE RP177 Annex B - AVCOL_SPC_UNSPECIFIED=2, - AVCOL_SPC_FCC =4, - AVCOL_SPC_BT470BG =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM / IEC 61966-2-4 xvYCC601 - AVCOL_SPC_SMPTE170M =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC / functionally identical to above - AVCOL_SPC_SMPTE240M =7, - AVCOL_SPC_NB , ///< Not part of ABI -}; - -enum AVColorRange{ - AVCOL_RANGE_UNSPECIFIED=0, - AVCOL_RANGE_MPEG =1, ///< the normal 219*2^(n-8) "MPEG" YUV ranges - AVCOL_RANGE_JPEG =2, ///< the normal 2^n-1 "JPEG" YUV ranges - AVCOL_RANGE_NB , ///< Not part of ABI -}; - -#define MAX_MMCO_COUNT 66 -/** -* Memory management control operation opcode. 
-*/ -typedef enum MMCOOpcode{ - MMCO_END=0, - MMCO_SHORT2UNUSED, - MMCO_LONG2UNUSED, - MMCO_SHORT2LONG, - MMCO_SET_MAX_LONG, - MMCO_RESET, - MMCO_LONG, -} MMCOOpcode; - -/* NAL unit types */ -enum { - NAL_SLICE=1, - NAL_DPA, - NAL_DPB, - NAL_DPC, - NAL_IDR_SLICE, - NAL_SEI, - NAL_SPS, - NAL_PPS, - NAL_AUD, - NAL_END_SEQUENCE, - NAL_END_STREAM, - NAL_FILLER_DATA, - NAL_SPS_EXT, - NAL_AUXILIARY_SLICE=19 -}; - -/** -* SEI message types -*/ -typedef enum { - SEI_BUFFERING_PERIOD = 0, ///< buffering period (H.264, D.1.1) - SEI_TYPE_PIC_TIMING = 1, ///< picture timing - SEI_TYPE_USER_DATA_UNREGISTERED = 5, ///< unregistered user data - SEI_TYPE_RECOVERY_POINT = 6 ///< recovery point (frame # to decoder sync) -} SEI_Type; - -/** -* pic_struct in picture timing SEI message -*/ -typedef enum { - SEI_PIC_STRUCT_FRAME = 0, ///< 0: %frame - SEI_PIC_STRUCT_TOP_FIELD = 1, ///< 1: top field - SEI_PIC_STRUCT_BOTTOM_FIELD = 2, ///< 2: bottom field - SEI_PIC_STRUCT_TOP_BOTTOM = 3, ///< 3: top field, bottom field, in that order - SEI_PIC_STRUCT_BOTTOM_TOP = 4, ///< 4: bottom field, top field, in that order - SEI_PIC_STRUCT_TOP_BOTTOM_TOP = 5, ///< 5: top field, bottom field, top field repeated, in that order - SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM = 6, ///< 6: bottom field, top field, bottom field repeated, in that order - SEI_PIC_STRUCT_FRAME_DOUBLING = 7, ///< 7: %frame doubling - SEI_PIC_STRUCT_FRAME_TRIPLING = 8 ///< 8: %frame tripling -} SEI_PicStructType; - -#define FF_MAX_B_FRAMES 16 - - -//The following defines may change, don't expect compatibility if you use them. 
-#define MB_TYPE_INTRA4x4 0x0001 -#define MB_TYPE_INTRA16x16 0x0002 //FIXME H.264-specific -#define MB_TYPE_INTRA_PCM 0x0004 //FIXME H.264-specific -#define MB_TYPE_16x16 0x0008 -#define MB_TYPE_16x8 0x0010 -#define MB_TYPE_8x16 0x0020 -#define MB_TYPE_8x8 0x0040 -#define MB_TYPE_INTERLACED 0x0080 -#define MB_TYPE_DIRECT2 0x0100 //FIXME -#define MB_TYPE_ACPRED 0x0200 -#define MB_TYPE_GMC 0x0400 -#define MB_TYPE_SKIP 0x0800 -#define MB_TYPE_P0L0 0x1000 -#define MB_TYPE_P1L0 0x2000 -#define MB_TYPE_P0L1 0x4000 -#define MB_TYPE_P1L1 0x8000 -#define MB_TYPE_L0 (MB_TYPE_P0L0 | MB_TYPE_P1L0) -#define MB_TYPE_L1 (MB_TYPE_P0L1 | MB_TYPE_P1L1) -#define MB_TYPE_L0L1 (MB_TYPE_L0 | MB_TYPE_L1) -#define MB_TYPE_QUANT 0x00010000 -#define MB_TYPE_CBP 0x00020000 -//Note bits 24-31 are reserved for codec specific use (h264 ref0, mpeg1 0mv, ...) - -#define FF_BUFFER_TYPE_INTERNAL 1 -#define FF_BUFFER_TYPE_USER 2 ///< direct rendering buffers (image is (de)allocated by user) -#define FF_BUFFER_TYPE_SHARED 4 ///< Buffer from somewhere else; don't deallocate image (data/base), all other tables are not shared. -#define FF_BUFFER_TYPE_COPY 8 ///< Just a (modified) copy of some other buffer, don't deallocate anything. 
- - -#define FF_I_TYPE 1 ///< Intra -#define FF_P_TYPE 2 ///< Predicted -#define FF_B_TYPE 3 ///< Bi-dir predicted -#define FF_S_TYPE 4 ///< S(GMC)-VOP MPEG4 -#define FF_SI_TYPE 5 ///< Switching Intra -#define FF_SP_TYPE 6 ///< Switching Predicted -#define FF_BI_TYPE 7 - -#define MB_TYPE_INTRA MB_TYPE_INTRA4x4 //default mb_type if there is just one type -#define IS_INTRA4x4(a) ((a)&MB_TYPE_INTRA4x4) -#define IS_INTRA16x16(a) ((a)&MB_TYPE_INTRA16x16) -#define IS_PCM(a) ((a)&MB_TYPE_INTRA_PCM) -#define IS_INTRA(a) ((a)&7) -#define IS_INTER(a) ((a)&(MB_TYPE_16x16|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8)) -#define IS_SKIP(a) ((a)&MB_TYPE_SKIP) -#define IS_INTRA_PCM(a) ((a)&MB_TYPE_INTRA_PCM) -#define IS_INTERLACED(a) ((a)&MB_TYPE_INTERLACED) -#define IS_DIRECT(a) ((a)&MB_TYPE_DIRECT2) -#define IS_GMC(a) ((a)&MB_TYPE_GMC) -#define IS_16X16(a) ((a)&MB_TYPE_16x16) -#define IS_16X8(a) ((a)&MB_TYPE_16x8) -#define IS_8X16(a) ((a)&MB_TYPE_8x16) -#define IS_8X8(a) ((a)&MB_TYPE_8x8) -#define IS_SUB_8X8(a) ((a)&MB_TYPE_16x16) //note reused -#define IS_SUB_8X4(a) ((a)&MB_TYPE_16x8) //note reused -#define IS_SUB_4X8(a) ((a)&MB_TYPE_8x16) //note reused -#define IS_SUB_4X4(a) ((a)&MB_TYPE_8x8) //note reused -#define IS_ACPRED(a) ((a)&MB_TYPE_ACPRED) -#define IS_QUANT(a) ((a)&MB_TYPE_QUANT) -#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list)))) -#define USES_LIST(a, list) ((a) & ((MB_TYPE_P0L0|MB_TYPE_P1L0)<<(2*(list)))) ///< does this mb use listX, note does not work if subMBs -#define HAS_CBP(a) ((a)&MB_TYPE_CBP) - - -#define FF_MM_FORCE 0x80000000 /* Force usage of selected flags (OR) */ - /* lower 16 bits - CPU features */ -#define FF_MM_MMX 0x0001 ///< standard MMX -#define FF_MM_3DNOW 0x0004 ///< AMD 3DNOW -#define FF_MM_MMX2 0x0002 ///< SSE integer functions or AMD MMX ext -#define FF_MM_SSE 0x0008 ///< SSE functions -#define FF_MM_SSE2 0x0010 ///< PIV SSE2 functions -#define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt -#define FF_MM_SSE3 0x0040 ///< Prescott SSE3 
functions -#define FF_MM_SSSE3 0x0080 ///< Conroe SSSE3 functions -#define FF_MM_SSE4 0x0100 ///< Penryn SSE4.1 functions -#define FF_MM_SSE42 0x0200 ///< Nehalem SSE4.2 functions -#define FF_MM_IWMMXT 0x0100 ///< XScale IWMMXT -#define FF_MM_ALTIVEC 0x0001 ///< standard AltiVec - - -/** -* Sequence parameter set -*/ -typedef struct SPS{ - - int profile_idc; - int level_idc; - int chroma_format_idc; - int transform_bypass; ///< qpprime_y_zero_transform_bypass_flag - int log2_max_frame_num; ///< log2_max_frame_num_minus4 + 4 - int poc_type; ///< pic_order_cnt_type - int log2_max_poc_lsb; ///< log2_max_pic_order_cnt_lsb_minus4 - int delta_pic_order_always_zero_flag; - int offset_for_non_ref_pic; - int offset_for_top_to_bottom_field; - int poc_cycle_length; ///< num_ref_frames_in_pic_order_cnt_cycle - int ref_frame_count; ///< num_ref_frames - int gaps_in_frame_num_allowed_flag; - int mb_width; ///< pic_width_in_mbs_minus1 + 1 - int mb_height; ///< pic_height_in_map_units_minus1 + 1 - int frame_mbs_only_flag; - int mb_aff; /// free, 1 -> needs to be displayed, 2 -> needed for reference, 3 -> 1 && 2 - int key_frame; - int mmco_reset; ///< h264 MMCO_RESET set this 1. Reordering code must not mix pictures before and after MMCO_RESET. - -} DecodedPicture; - - -#endif /* AVCODEC_AVCODEC_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cabac.c --- a/ffmpeg_smp/h264dec/libavcodec/cabac.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,242 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * Context Adaptive Binary Arithmetic Coder. - */ - -#include - -#include "libavutil/common.h" -//#include "get_bits.h" -#include "cabac.h" - -static const uint8_t lps_range[64][4]= { -{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205}, -{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166}, -{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135}, -{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110}, -{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89}, -{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72}, -{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59}, -{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48}, -{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39}, -{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31}, -{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25}, -{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21}, -{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17}, -{ 10, 12, 14, 16}, { 9, 11, 13, 15}, { 9, 11, 12, 14}, { 8, 10, 12, 14}, -{ 8, 9, 11, 13}, { 7, 9, 11, 12}, { 7, 9, 10, 12}, { 7, 8, 10, 11}, -{ 6, 8, 9, 11}, { 6, 7, 9, 10}, { 6, 7, 8, 9}, { 2, 2, 2, 2}, -}; - -uint8_t ff_h264_mlps_state[4*64]; -uint8_t ff_h264_lps_range[4*2*64]; -uint8_t ff_h264_lps_state[2*64]; 
-uint8_t ff_h264_mps_state[2*64]; - -static const uint8_t mps_state[64]= { - 1, 2, 3, 4, 5, 6, 7, 8, - 9,10,11,12,13,14,15,16, - 17,18,19,20,21,22,23,24, - 25,26,27,28,29,30,31,32, - 33,34,35,36,37,38,39,40, - 41,42,43,44,45,46,47,48, - 49,50,51,52,53,54,55,56, - 57,58,59,60,61,62,62,63, -}; - -static const uint8_t lps_state[64]= { - 0, 0, 1, 2, 2, 4, 4, 5, - 6, 7, 8, 9, 9,11,11,12, - 13,13,15,15,16,16,18,18, - 19,19,21,21,22,22,23,24, - 24,25,26,26,27,27,28,29, - 29,30,30,30,31,32,32,33, - 33,33,34,34,35,35,35,36, - 36,36,37,37,37,38,38,63, -}; - -const uint8_t ff_h264_norm_shift[512]= { - 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5, - 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -}; - -/** - * - * @param buf_size size of buf in bits - */ -void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){ - c->bytestream_start= - c->bytestream= buf; - c->bytestream_end= buf + buf_size; - -#if CABAC_BITS == 16 - c->low = (*c->bytestream++)<<18; - c->low+= (*c->bytestream++)<<10; 
-#else - c->low = (*c->bytestream++)<<10; -#endif - c->low+= ((*c->bytestream++)<<2) + 2; - c->range= 0x1FE; -} - -void ff_init_cabac_states(){ - int i, j; - - for(i=0; i<64; i++){ - for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save - ff_h264_lps_range[j*2*64+2*i+0]= - ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j]; - } - - ff_h264_mlps_state[128+2*i+0]= - ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0; - ff_h264_mlps_state[128+2*i+1]= - ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1; - - if( i ){ -#ifdef BRANCHLESS_CABAC_DECODER - ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0; - ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1; - }else{ - ff_h264_mlps_state[128-2*i-1]= 1; - ff_h264_mlps_state[128-2*i-2]= 0; -#else - ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0; - ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1; - }else{ - ff_h264_lps_state[2*i+0]= 1; - ff_h264_lps_state[2*i+1]= 0; -#endif - } - } -} - -#ifdef TEST -#define SIZE 10240 -#define START_TIMER -#define STOP_TIMER(...) -#define av_log(...) -// #include "libavutil/lfg.h" -#include "avcodec.h" -#include "cabac.h" - -int main(void){ - CABACContext c; - uint8_t b[9*SIZE]; - uint8_t r[9*SIZE]; - int i; - uint8_t state[10]= {0}; -// AVLFG prng; - -// // av_lfg_init(&prng, 1); -// ff_init_cabac_encoder(&c, b, SIZE); -// ff_init_cabac_states(); -// -// for(i=0; i - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * Context Adaptive Binary Arithmetic Coder. - */ - -#ifndef AVCODEC_CABAC_H -#define AVCODEC_CABAC_H - -//#undef NDEBUG -#include -#include "libavutil/x86_cpu.h" -#include "libavutil/attributes.h" - -#define CABAC_BITS 16 -#define CABAC_MASK ((1<low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); -#else - c->low+= c->bytestream[0]<<1; -#endif - c->low -= CABAC_MASK; - c->bytestream+= CABAC_BITS/8; -} - -static void refill2(CABACContext *c){ - int i, x; - - x= c->low ^ (c->low-1); - i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; - - x= -CABAC_MASK; - -#if CABAC_BITS == 16 - x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); -#else - x+= c->bytestream[0]<<1; -#endif - - c->low += x<bytestream+= CABAC_BITS/8; -} - -static inline void renorm_cabac_decoder(CABACContext *c){ - while(c->range < 0x100){ - c->range+= c->range; - c->low+= c->low; - if(!(c->low & CABAC_MASK)) - refill(c); - } -} - -static inline void renorm_cabac_decoder_once(CABACContext *c){ - - int shift= (uint32_t)(c->range - 0x100)>>31; - c->range<<= shift; - c->low <<= shift; - - if(!(c->low & CABAC_MASK)) - refill(c); -} - -static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){ - - int s = *state; - int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; - int bit, lps_mask av_unused; - - c->range -= RangeLPS; -#ifndef BRANCHLESS_CABAC_DECODER - if(c->low < (c->range<<(CABAC_BITS+1))){ - bit= s&1; - *state= ff_h264_mps_state[s]; - renorm_cabac_decoder_once(c); - }else{ - bit= ff_h264_norm_shift[RangeLPS]; - c->low -= (c->range<<(CABAC_BITS+1)); - *state= ff_h264_lps_state[s]; - c->range = RangeLPS<low <<= bit; - bit= (s&1)^1; - - if(!(c->low & CABAC_MASK)){ - refill2(c); - } - } -#else /* BRANCHLESS_CABAC_DECODER */ - lps_mask= 
((c->range<<(CABAC_BITS+1)) - c->low)>>31; - - c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask; - c->range += (RangeLPS - c->range) & lps_mask; - - s^=lps_mask; - *state= (ff_h264_mlps_state+128)[s]; - bit= s&1; - - lps_mask= ff_h264_norm_shift[c->range]; - c->range<<= lps_mask; - c->low <<= lps_mask; - if(!(c->low & CABAC_MASK)) - refill2(c); -#endif /* BRANCHLESS_CABAC_DECODER */ - - return bit; -} - -static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){ - return get_cabac_inline(c, state); -} - -static int av_unused get_cabac(CABACContext *c, uint8_t * const state){ - return get_cabac_inline(c, state); -} - -static int av_unused get_cabac_bypass(CABACContext *c){ - - int range; - c->low += c->low; - - if(!(c->low & CABAC_MASK)) - refill(c); - - range= c->range<<(CABAC_BITS+1); - if(c->low < range){ - return 0; - }else{ - c->low -= range; - return 1; - } -} - -static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ - int range, mask; - c->low += c->low; - - if(!(c->low & CABAC_MASK)) - refill(c); - - range= c->range<<(CABAC_BITS+1); - c->low -= range; - mask= c->low >> 31; - range &= mask; - c->low += range; - return (val^mask)-mask; -} - -/** - * - * @return the number of bytes read or 0 if no end - */ -static int av_unused get_cabac_terminate(CABACContext *c){ - c->range -= 2; - if(c->low < c->range<<(CABAC_BITS+1)){ - renorm_cabac_decoder_once(c); - return 0; - }else{ - return c->bytestream - c->bytestream_start; - } -} - -#endif /* AVCODEC_CABAC_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,140 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * Context Adaptive Binary Arithmetic Coder. - */ - -#include - -#include "libavutil/common.h" -//#include "get_bits.h" -#include "cabac_spu.h" -#define av_log(...) - -int bytecount =0; -static const uint8_t lps_range[64][4]= { -{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205}, -{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166}, -{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135}, -{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110}, -{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89}, -{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72}, -{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59}, -{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48}, -{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39}, -{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31}, -{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25}, -{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21}, -{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17}, -{ 10, 12, 
14, 16}, { 9, 11, 13, 15}, { 9, 11, 12, 14}, { 8, 10, 12, 14}, -{ 8, 9, 11, 13}, { 7, 9, 11, 12}, { 7, 9, 10, 12}, { 7, 8, 10, 11}, -{ 6, 8, 9, 11}, { 6, 7, 9, 10}, { 6, 7, 8, 9}, { 2, 2, 2, 2}, -}; - -uint8_t ff_h264_mlps_state[4*64]; -uint8_t ff_h264_lps_range[4*2*64]; -uint8_t ff_h264_lps_state[2*64]; -uint8_t ff_h264_mps_state[2*64]; - -static const uint8_t mps_state[64]= { - 1, 2, 3, 4, 5, 6, 7, 8, - 9,10,11,12,13,14,15,16, - 17,18,19,20,21,22,23,24, - 25,26,27,28,29,30,31,32, - 33,34,35,36,37,38,39,40, - 41,42,43,44,45,46,47,48, - 49,50,51,52,53,54,55,56, - 57,58,59,60,61,62,62,63, -}; - -static const uint8_t lps_state[64]= { - 0, 0, 1, 2, 2, 4, 4, 5, - 6, 7, 8, 9, 9,11,11,12, - 13,13,15,15,16,16,18,18, - 19,19,21,21,22,22,23,24, - 24,25,26,26,27,27,28,29, - 29,30,30,30,31,32,32,33, - 33,33,34,34,35,35,35,36, - 36,36,37,37,37,38,38,63, -}; - -const uint8_t ff_h264_norm_shift[512]= { - 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5, - 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -}; - -/** - 
* - * @param buf_size size of buf in bits - */ - -void ff_init_cabac_states(){ - int i, j; - - for(i=0; i<64; i++){ - for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save - ff_h264_lps_range[j*2*64+2*i+0]= - ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j]; - } - - ff_h264_mlps_state[128+2*i+0]= - ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0; - ff_h264_mlps_state[128+2*i+1]= - ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1; - - if( i ){ -#ifdef BRANCHLESS_CABAC_DECODER - ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0; - ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1; - }else{ - ff_h264_mlps_state[128-2*i-1]= 1; - ff_h264_mlps_state[128-2*i-2]= 0; -#else - ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0; - ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1; - }else{ - ff_h264_lps_state[2*i+0]= 1; - ff_h264_lps_state[2*i+1]= 0; -#endif - } - } -} - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,233 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * Context Adaptive Binary Arithmetic Coder. - */ - -#ifndef AVCODEC_CABAC_H -#define AVCODEC_CABAC_H - -//#undef NDEBUG -#include -#include "h264_dma.h" -#include "libavutil/x86_cpu.h" -#include "libavutil/attributes.h" - -#define CABAC_BITS 16 -#define CABAC_MASK ((1<bytestream == c->bytestream_end){ - if (c->bufsize>0){ - int size = (c->bufsize > sizeof(bytestream_ls)) ? sizeof(bytestream_ls) : c->bufsize; - int align = size &0xF; - int dma_size = size + (align? 16-align : 0); - - spu_dma_get(bytestream_ls, (unsigned) c->bytestream_ea, dma_size, ED_raw); - wait_dma_id(ED_raw); - c->bytestream = bytestream_ls; - c->bytestream_end = &bytestream_ls[size]; - c->bytestream_ea += dma_size; - c->bufsize -= size; - } - bytecount =0; - }else if((unsigned)c->bytestream > (unsigned)c->bytestream_end +2){ - //fprintf(stderr, "Read beyond end of frame %d\n", c->bufsize); - bytecount =0; - } -} - -static void refill(CABACContext *c){ - dma_cabac(c); - - c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); - - c->low -= CABAC_MASK; - c->bytestream+= CABAC_BITS/8; -} - -static void refill2(CABACContext *c){ - int i, x; - - dma_cabac(c); - - x= c->low ^ (c->low-1); - i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; - - x= -CABAC_MASK; - - x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); - - c->low += x<bytestream+= CABAC_BITS/8; -} - -static inline void renorm_cabac_decoder(CABACContext *c){ - while(c->range < 0x100){ - c->range+= c->range; - c->low+= c->low; - if(!(c->low & CABAC_MASK)) - refill(c); - } -} - -static inline void renorm_cabac_decoder_once(CABACContext *c){ - - int shift= (uint32_t)(c->range - 0x100)>>31; - c->range<<= shift; - c->low <<= shift; - - if(!(c->low & CABAC_MASK)) - refill(c); -} - -static av_always_inline int 
get_cabac_inline(CABACContext *c, uint8_t * const state){ - - int s = *state; - int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; - int bit, lps_mask av_unused; - - c->range -= RangeLPS; -#ifndef BRANCHLESS_CABAC_DECODER - if(c->low < (c->range<<(CABAC_BITS+1))){ - bit= s&1; - *state= ff_h264_mps_state[s]; - renorm_cabac_decoder_once(c); - }else{ - bit= ff_h264_norm_shift[RangeLPS]; - c->low -= (c->range<<(CABAC_BITS+1)); - *state= ff_h264_lps_state[s]; - c->range = RangeLPS<low <<= bit; - bit= (s&1)^1; - - if(!(c->low & CABAC_MASK)){ - refill2(c); - } - } -#else /* BRANCHLESS_CABAC_DECODER */ - lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31; - - c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask; - c->range += (RangeLPS - c->range) & lps_mask; - - s^=lps_mask; - *state= (ff_h264_mlps_state+128)[s]; - bit= s&1; - - lps_mask= ff_h264_norm_shift[c->range]; - c->range<<= lps_mask; - c->low <<= lps_mask; - if(!(c->low & CABAC_MASK)) - refill2(c); -#endif /* BRANCHLESS_CABAC_DECODER */ - - return bit; -} - -static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){ - return get_cabac_inline(c, state); -} - -static int av_unused get_cabac(CABACContext *c, uint8_t * const state){ - return get_cabac_inline(c, state); -} - -static int av_unused get_cabac_bypass(CABACContext *c){ - - int range; - c->low += c->low; - - if(!(c->low & CABAC_MASK)) - refill(c); - - range= c->range<<(CABAC_BITS+1); - if(c->low < range){ - return 0; - }else{ - c->low -= range; - return 1; - } -} - -static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ - int range, mask; - c->low += c->low; - - if(!(c->low & CABAC_MASK)) - refill(c); - - range= c->range<<(CABAC_BITS+1); - c->low -= range; - mask= c->low >> 31; - range &= mask; - c->low += range; - return (val^mask)-mask; -} - -/** - * - * @return the number of bytes read or 0 if no end - */ -static int av_unused get_cabac_terminate(CABACContext *c){ - c->range -= 2; - if(c->low < 
c->range<<(CABAC_BITS+1)){ - renorm_cabac_decoder_once(c); - return 0; - }else{ - return c->bytestream - c->bytestream_start; - } -} - -#endif /* AVCODEC_CABAC_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1147 +0,0 @@ -/* - * Copyright (c) 2009 TUDelft - * - * Cell Parallel SPU - 2DWave Macroblock Decoding. - */ - -/** - * @file libavcodec/cell/spu/h264_main_spu.c - * Cell Parallel SPU - 2DWave Macroblock Decoding - * @author C C Chi - * - * SIMD SPU kernels - * H.264/AVC motion compensation - * @author Mauricio Alvarez - * @author Albert Paradis - */ - - -#include "dsputil_spu.h" -#include "h264_idct_spu.h" -#include "h264_deblock_spu.h" -#include "types_spu.h" -#include "libavutil/intreadwrite.h" - -#include -#include -#include -#include - -//Luma interpolation -#define PUT_OP_U8_SPU(d, s, dst) (void) dst; d = s -#define AVG_OP_U8_SPU(d, s, dst) d = spu_avg(dst, s) - -#define OP_U8_SPU PUT_OP_U8_SPU -#define PREFIX_h264_qpel16_h_lowpass_spu put_h264_qpel16_h_lowpass_spu -#define PREFIX_h264_qpel16_v_lowpass_spu put_h264_qpel16_v_lowpass_spu -#define PREFIX_h264_qpel16_hv_lowpass_spu put_h264_qpel16_hv_lowpass_spu -#define PREFIX_h264_qpel8_h_lowpass_spu put_h264_qpel8_h_lowpass_spu -#define PREFIX_h264_qpel8_v_lowpass_spu put_h264_qpel8_v_lowpass_spu -#define PREFIX_h264_qpel8_hv_lowpass_spu put_h264_qpel8_hv_lowpass_spu -#define PREFIX_h264_qpel4_h_lowpass_spu put_h264_qpel4_h_lowpass_spu -#define PREFIX_h264_qpel4_v_lowpass_spu put_h264_qpel4_v_lowpass_spu -#define PREFIX_h264_qpel4_hv_lowpass_spu put_h264_qpel4_hv_lowpass_spu -#include "h264_luma_template_spu.c" -#undef OP_U8_SPU -#undef PREFIX_h264_qpel16_h_lowpass_spu -#undef PREFIX_h264_qpel16_v_lowpass_spu -#undef PREFIX_h264_qpel16_hv_lowpass_spu -#undef PREFIX_h264_qpel8_h_lowpass_spu -#undef PREFIX_h264_qpel8_v_lowpass_spu 
-#undef PREFIX_h264_qpel8_hv_lowpass_spu -#undef PREFIX_h264_qpel4_h_lowpass_spu -#undef PREFIX_h264_qpel4_v_lowpass_spu -#undef PREFIX_h264_qpel4_hv_lowpass_spu - -#define OP_U8_SPU AVG_OP_U8_SPU -#define PREFIX_h264_qpel16_h_lowpass_spu avg_h264_qpel16_h_lowpass_spu -#define PREFIX_h264_qpel16_v_lowpass_spu avg_h264_qpel16_v_lowpass_spu -#define PREFIX_h264_qpel16_hv_lowpass_spu avg_h264_qpel16_hv_lowpass_spu -#define PREFIX_h264_qpel8_h_lowpass_spu avg_h264_qpel8_h_lowpass_spu -#define PREFIX_h264_qpel8_v_lowpass_spu avg_h264_qpel8_v_lowpass_spu -#define PREFIX_h264_qpel8_hv_lowpass_spu avg_h264_qpel8_hv_lowpass_spu -#define PREFIX_h264_qpel4_h_lowpass_spu avg_h264_qpel4_h_lowpass_spu -#define PREFIX_h264_qpel4_v_lowpass_spu avg_h264_qpel4_v_lowpass_spu -#define PREFIX_h264_qpel4_hv_lowpass_spu avg_h264_qpel4_hv_lowpass_spu -#include "h264_luma_template_spu.c" -#undef OP_U8_SPU -#undef PREFIX_h264_qpel16_h_lowpass_spu -#undef PREFIX_h264_qpel16_v_lowpass_spu -#undef PREFIX_h264_qpel16_hv_lowpass_spu -#undef PREFIX_h264_qpel8_h_lowpass_spu -#undef PREFIX_h264_qpel8_v_lowpass_spu -#undef PREFIX_h264_qpel8_hv_lowpass_spu -#undef PREFIX_h264_qpel4_h_lowpass_spu -#undef PREFIX_h264_qpel4_v_lowpass_spu -#undef PREFIX_h264_qpel4_hv_lowpass_spu - -#define H264_MC(OPNAME, SIZE, CODETYPE) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, dst_stride, STRIDE_Y, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){ \ - DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ 
## CODETYPE(dst, src, dst_stride, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, dst_stride, STRIDE_Y, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+STRIDE_Y, half, dst_stride, STRIDE_Y, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\ - put_h264_qpel ## 
SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ - OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, dst_stride, 16, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ - DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ - DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 
- DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ - DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ - DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ - DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ - DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ - DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ - DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\ -}\ - - -/**************************/ -/* put pixels functions */ -/*************************/ - -static void put_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1, - const uint8_t * src2, int dst_stride, - int src_stride1, int h) -{ - int i; - - const int perm_src1 = (unsigned int) src1 & 15; - - for (i=0; i> log2_denom ) -#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) -#define H264_WEIGHT(W,H) \ -static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \ - int y; \ - offset <<= log2_denom; \ - 
if(log2_denom) offset += 1<<(log2_denom-1); \ - for(y=0; y> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); - tc++; - } - if( FFABS( q2 - q0 ) < beta ) { - pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); - tc++; - } - - i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */ - pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ - } - pix += ystride; - } - } -} -static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0); -} -static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0); -} - -static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) -{ - int d; - for( d = 0; d < 16; d++ ) { - const int p2 = pix[-3*xstride]; - const int p1 = pix[-2*xstride]; - const int p0 = pix[-1*xstride]; - - const int q0 = pix[ 0*xstride]; - const int q1 = pix[ 1*xstride]; - const int q2 = pix[ 2*xstride]; - - if( FFABS( p0 - q0 ) < alpha && - FFABS( p1 - p0 ) < beta && - FFABS( q1 - q0 ) < beta ) { - - if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ - if( FFABS( p2 - p0 ) < beta) - { - const int p3 = pix[-4*xstride]; - /* p0', p1', p2' */ - pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; - pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; - pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; - } else { - /* p0' */ - pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; - } - if( FFABS( q2 - q0 ) < beta) - { - const int q3 = pix[3*xstride]; - /* q0', q1', q2' */ - pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; - pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; - pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; - } else { - /* q0' */ - pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; - } - 
}else{ - /* p0', q0' */ - pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; - pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; - } - } - pix += ystride; - } -} -static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) -{ - h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta); -} -static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) -{ - h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta); -} - -static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) -{ - int i, d; - for( i = 0; i < 4; i++ ) { - const int tc = tc0[i]; - if( tc <= 0 ) { - pix += 2*ystride; - continue; - } - for( d = 0; d < 2; d++ ) { - const int p0 = pix[-1*xstride]; - const int p1 = pix[-2*xstride]; - const int q0 = pix[0]; - const int q1 = pix[1*xstride]; - - if( FFABS( p0 - q0 ) < alpha && - FFABS( p1 - p0 ) < beta && - FFABS( q1 - q0 ) < beta ) { - - int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - - pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */ - pix[0] = av_clip_uint8( q0 - delta ); /* q0' */ - } - pix += ystride; - } - } -} -static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0); -} -static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0); -} - -static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) -{ - int d; - for( d = 0; d < 8; d++ ) { - const int p0 = pix[-1*xstride]; - const int p1 = pix[-2*xstride]; - const int q0 = pix[0]; - const int q1 = pix[1*xstride]; - - if( FFABS( p0 - q0 ) < alpha && - FFABS( p1 - p0 ) < beta && - FFABS( q1 - q0 ) < beta ) { - - pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ - pix[0] = ( 2*q1 + q0 + p1 + 2 ) 
>> 2; /* q0' */ - } - pix += ystride; - } -} -static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) -{ - h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta); -} -static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) -{ - h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta); -} - - -void dsputil_h264_init_cell(DSPContext_spu* c) { - - c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; - c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c; - c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c; - c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c; - c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c; - c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; - c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; - c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; - - c->h264_idct_add[0] = h264_idct8_add_spu; - c->h264_idct_add[1] = h264_idct4_add_spu; - - - c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_spu; - c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_spu; - c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_spu; - c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_spu; - c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_spu; - c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_spu; - - c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; - c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; - c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c; - c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c; - c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c; - c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c; - c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c; - c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c; - c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c; - c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c; - 
c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; - c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c; - c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c; - c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c; - c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c; - c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c; - c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c; - c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c; - c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c; - c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c; - - -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_spu; \ - c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_spu; \ - c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_spu; \ - c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_spu; \ - c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_spu; \ - c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_spu; \ - c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_spu; \ - c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_spu; \ - c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_spu; \ - c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_spu; \ - c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_spu; \ - c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_spu; \ - c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_spu; \ - c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_spu; \ - c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_spu; \ - c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_spu - - dspfunc(put_h264_qpel, 0, 16); - dspfunc(put_h264_qpel, 1, 8); - dspfunc(put_h264_qpel, 2, 4); - - dspfunc(avg_h264_qpel, 0, 16); - dspfunc(avg_h264_qpel, 1, 8); - dspfunc(avg_h264_qpel, 2, 4); - -#undef dspfunc - - -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.h --- 
a/ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ -#ifndef DSPUTIL_CELL_H -#define DSPUTIL_CELL_H - -#include "types_spu.h" - -typedef struct DSPContext_spu { - - void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); - void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0); - /* v/h_loop_filter_luma_intra: align 16 */ - void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); - void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); - void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0); - void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0); - void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); - void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); - - qpel_mc_func put_h264_qpel_pixels_tab[3][16]; - qpel_mc_func avg_h264_qpel_pixels_tab[3][16]; - - h264_chroma_mc_func put_h264_chroma_pixels_tab[3]; - h264_chroma_mc_func avg_h264_chroma_pixels_tab[3]; - - h264_idct_func h264_idct_add[2]; - - h264_weight_func weight_h264_pixels_tab[10]; - h264_biweight_func biweight_h264_pixels_tab[10]; - -} DSPContext_spu; - - -void dsputil_h264_init_cell(DSPContext_spu* c); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2633 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 cabac decoding. - * @author Michael Niedermayer - */ -#define CELL_SPE -#include -#include -#include "libavutil/intreadwrite.h" -#include "libavutil/mem.h" -#include "libavcodec/avcodec.h" -#include "h264_deblock_spu.h" -#include "h264_pred_spu.h" -#include "h264_direct_spu.h" -#include "h264_tables.h" -#include "mathops_spu.h" -//#include "libavcodec/h264_data.h" -#include "cabac_spu.h" -#include "rectangle_spu.h" -#include "libavutil/log.h" - -//#undef NDEBUG -#include -#define INT_BIT (sizeof(int) * 8) -/* Cabac pre state table */ -typedef struct IMbInfo{ - uint16_t type; - uint8_t pred_mode; - uint8_t cbp; -} IMbInfo; - -extern int bytecount; - -static const IMbInfo i_mb_type_info[26]={ -{MB_TYPE_INTRA4x4 , -1, -1}, -{MB_TYPE_INTRA16x16, 2, 0}, -{MB_TYPE_INTRA16x16, 1, 0}, -{MB_TYPE_INTRA16x16, 0, 0}, -{MB_TYPE_INTRA16x16, 3, 0}, -{MB_TYPE_INTRA16x16, 2, 16}, -{MB_TYPE_INTRA16x16, 1, 16}, -{MB_TYPE_INTRA16x16, 0, 16}, -{MB_TYPE_INTRA16x16, 3, 16}, -{MB_TYPE_INTRA16x16, 2, 32}, -{MB_TYPE_INTRA16x16, 1, 32}, -{MB_TYPE_INTRA16x16, 0, 32}, -{MB_TYPE_INTRA16x16, 3, 32}, -{MB_TYPE_INTRA16x16, 2, 15+0}, -{MB_TYPE_INTRA16x16, 1, 15+0}, -{MB_TYPE_INTRA16x16, 0, 15+0}, -{MB_TYPE_INTRA16x16, 3, 
15+0}, -{MB_TYPE_INTRA16x16, 2, 15+16}, -{MB_TYPE_INTRA16x16, 1, 15+16}, -{MB_TYPE_INTRA16x16, 0, 15+16}, -{MB_TYPE_INTRA16x16, 3, 15+16}, -{MB_TYPE_INTRA16x16, 2, 15+32}, -{MB_TYPE_INTRA16x16, 1, 15+32}, -{MB_TYPE_INTRA16x16, 0, 15+32}, -{MB_TYPE_INTRA16x16, 3, 15+32}, -{MB_TYPE_INTRA_PCM , -1, -1}, -}; - -typedef struct PMbInfo{ - uint16_t type; - uint8_t partition_count; -} PMbInfo; - -static const PMbInfo p_mb_type_info[5]={ -{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, -{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, -{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, -{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 4}, -{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4}, -}; - -static const PMbInfo p_sub_mb_type_info[4]={ -{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, -{MB_TYPE_16x8 |MB_TYPE_P0L0 , 2}, -{MB_TYPE_8x16 |MB_TYPE_P0L0 , 2}, -{MB_TYPE_8x8 |MB_TYPE_P0L0 , 4}, -}; - -static const PMbInfo b_mb_type_info[23]={ -{MB_TYPE_DIRECT2|MB_TYPE_L0L1 , 1, }, -{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, -{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, -{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 
|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, -}; - -static const PMbInfo b_sub_mb_type_info[13]={ -{MB_TYPE_DIRECT2 , 1, }, -{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, -{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, -{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 4, }, -{MB_TYPE_8x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 4, }, -{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, -}; - -static const int8_t cabac_context_init_I[460][2] = -{ - /* 0 - 10 */ - { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, - { 2, 54 }, { 3, 74 }, { -28,127 }, { -23, 104 }, - { -6, 53 }, { -1, 54 }, { 7, 51 }, - - /* 11 - 23 unsused for I */ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, - - /* 24- 39 */ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - - /* 40 - 53 */ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, - - /* 54 - 59 */ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, - - /* 60 - 69 */ - { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, - { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, - { 13, 41 }, { 3, 62 }, - - /* 70 -> 87 */ - { 0, 11 }, { 1, 55 }, { 0, 69 }, { -17, 127 }, - { -13, 102 },{ 0, 82 }, { -7, 74 }, { 
-21, 107 }, - { -27, 127 },{ -31, 127 },{ -24, 127 }, { -18, 95 }, - { -27, 127 },{ -21, 114 },{ -30, 127 }, { -17, 123 }, - { -12, 115 },{ -16, 122 }, - - /* 88 -> 104 */ - { -11, 115 },{ -12, 63 }, { -2, 68 }, { -15, 84 }, - { -13, 104 },{ -3, 70 }, { -8, 93 }, { -10, 90 }, - { -30, 127 },{ -1, 74 }, { -6, 97 }, { -7, 91 }, - { -20, 127 },{ -4, 56 }, { -5, 82 }, { -7, 76 }, - { -22, 125 }, - - /* 105 -> 135 */ - { -7, 93 }, { -11, 87 }, { -3, 77 }, { -5, 71 }, - { -4, 63 }, { -4, 68 }, { -12, 84 }, { -7, 62 }, - { -7, 65 }, { 8, 61 }, { 5, 56 }, { -2, 66 }, - { 1, 64 }, { 0, 61 }, { -2, 78 }, { 1, 50 }, - { 7, 52 }, { 10, 35 }, { 0, 44 }, { 11, 38 }, - { 1, 45 }, { 0, 46 }, { 5, 44 }, { 31, 17 }, - { 1, 51 }, { 7, 50 }, { 28, 19 }, { 16, 33 }, - { 14, 62 }, { -13, 108 },{ -15, 100 }, - - /* 136 -> 165 */ - { -13, 101 },{ -13, 91 }, { -12, 94 }, { -10, 88 }, - { -16, 84 }, { -10, 86 }, { -7, 83 }, { -13, 87 }, - { -19, 94 }, { 1, 70 }, { 0, 72 }, { -5, 74 }, - { 18, 59 }, { -8, 102 }, { -15, 100 }, { 0, 95 }, - { -4, 75 }, { 2, 72 }, { -11, 75 }, { -3, 71 }, - { 15, 46 }, { -13, 69 }, { 0, 62 }, { 0, 65 }, - { 21, 37 }, { -15, 72 }, { 9, 57 }, { 16, 54 }, - { 0, 62 }, { 12, 72 }, - - /* 166 -> 196 */ - { 24, 0 }, { 15, 9 }, { 8, 25 }, { 13, 18 }, - { 15, 9 }, { 13, 19 }, { 10, 37 }, { 12, 18 }, - { 6, 29 }, { 20, 33 }, { 15, 30 }, { 4, 45 }, - { 1, 58 }, { 0, 62 }, { 7, 61 }, { 12, 38 }, - { 11, 45 }, { 15, 39 }, { 11, 42 }, { 13, 44 }, - { 16, 45 }, { 12, 41 }, { 10, 49 }, { 30, 34 }, - { 18, 42 }, { 10, 55 }, { 17, 51 }, { 17, 46 }, - { 0, 89 }, { 26, -19 }, { 22, -17 }, - - /* 197 -> 226 */ - { 26, -17 }, { 30, -25 }, { 28, -20 }, { 33, -23 }, - { 37, -27 }, { 33, -23 }, { 40, -28 }, { 38, -17 }, - { 33, -11 }, { 40, -15 }, { 41, -6 }, { 38, 1 }, - { 41, 17 }, { 30, -6 }, { 27, 3 }, { 26, 22 }, - { 37, -16 }, { 35, -4 }, { 38, -8 }, { 38, -3 }, - { 37, 3 }, { 38, 5 }, { 42, 0 }, { 35, 16 }, - { 39, 22 }, { 14, 48 }, { 27, 37 }, { 21, 60 }, - { 12, 68 }, { 2, 97 
}, - - /* 227 -> 251 */ - { -3, 71 }, { -6, 42 }, { -5, 50 }, { -3, 54 }, - { -2, 62 }, { 0, 58 }, { 1, 63 }, { -2, 72 }, - { -1, 74 }, { -9, 91 }, { -5, 67 }, { -5, 27 }, - { -3, 39 }, { -2, 44 }, { 0, 46 }, { -16, 64 }, - { -8, 68 }, { -10, 78 }, { -6, 77 }, { -10, 86 }, - { -12, 92 }, { -15, 55 }, { -10, 60 }, { -6, 62 }, - { -4, 65 }, - - /* 252 -> 275 */ - { -12, 73 }, { -8, 76 }, { -7, 80 }, { -9, 88 }, - { -17, 110 },{ -11, 97 }, { -20, 84 }, { -11, 79 }, - { -6, 73 }, { -4, 74 }, { -13, 86 }, { -13, 96 }, - { -11, 97 }, { -19, 117 },{ -8, 78 }, { -5, 33 }, - { -4, 48 }, { -2, 53 }, { -3, 62 }, { -13, 71 }, - { -10, 79 }, { -12, 86 }, { -13, 90 }, { -14, 97 }, - - /* 276 a bit special (not used, bypass is used instead) */ - { 0, 0 }, - - /* 277 -> 307 */ - { -6, 93 }, { -6, 84 }, { -8, 79 }, { 0, 66 }, - { -1, 71 }, { 0, 62 }, { -2, 60 }, { -2, 59 }, - { -5, 75 }, { -3, 62 }, { -4, 58 }, { -9, 66 }, - { -1, 79 }, { 0, 71 }, { 3, 68 }, { 10, 44 }, - { -7, 62 }, { 15, 36 }, { 14, 40 }, { 16, 27 }, - { 12, 29 }, { 1, 44 }, { 20, 36 }, { 18, 32 }, - { 5, 42 }, { 1, 48 }, { 10, 62 }, { 17, 46 }, - { 9, 64 }, { -12, 104 },{ -11, 97 }, - - /* 308 -> 337 */ - { -16, 96 }, { -7, 88 }, { -8, 85 }, { -7, 85 }, - { -9, 85 }, { -13, 88 }, { 4, 66 }, { -3, 77 }, - { -3, 76 }, { -6, 76 }, { 10, 58 }, { -1, 76 }, - { -1, 83 }, { -7, 99 }, { -14, 95 }, { 2, 95 }, - { 0, 76 }, { -5, 74 }, { 0, 70 }, { -11, 75 }, - { 1, 68 }, { 0, 65 }, { -14, 73 }, { 3, 62 }, - { 4, 62 }, { -1, 68 }, { -13, 75 }, { 11, 55 }, - { 5, 64 }, { 12, 70 }, - - /* 338 -> 368 */ - { 15, 6 }, { 6, 19 }, { 7, 16 }, { 12, 14 }, - { 18, 13 }, { 13, 11 }, { 13, 15 }, { 15, 16 }, - { 12, 23 }, { 13, 23 }, { 15, 20 }, { 14, 26 }, - { 14, 44 }, { 17, 40 }, { 17, 47 }, { 24, 17 }, - { 21, 21 }, { 25, 22 }, { 31, 27 }, { 22, 29 }, - { 19, 35 }, { 14, 50 }, { 10, 57 }, { 7, 63 }, - { -2, 77 }, { -4, 82 }, { -3, 94 }, { 9, 69 }, - { -12, 109 },{ 36, -35 }, { 36, -34 }, - - /* 369 -> 398 */ - { 32, -26 }, { 37, 
-30 }, { 44, -32 }, { 34, -18 }, - { 34, -15 }, { 40, -15 }, { 33, -7 }, { 35, -5 }, - { 33, 0 }, { 38, 2 }, { 33, 13 }, { 23, 35 }, - { 13, 58 }, { 29, -3 }, { 26, 0 }, { 22, 30 }, - { 31, -7 }, { 35, -15 }, { 34, -3 }, { 34, 3 }, - { 36, -1 }, { 34, 5 }, { 32, 11 }, { 35, 5 }, - { 34, 12 }, { 39, 11 }, { 30, 29 }, { 34, 26 }, - { 29, 39 }, { 19, 66 }, - - /* 399 -> 435 */ - { 31, 21 }, { 31, 31 }, { 25, 50 }, - { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11, 85 }, - { -15, 92 }, { -14, 89 }, { -26, 71 }, { -15, 81 }, - { -14, 80 }, { 0, 68 }, { -14, 70 }, { -24, 56 }, - { -23, 68 }, { -24, 50 }, { -11, 74 }, { 23, -13 }, - { 26, -13 }, { 40, -15 }, { 49, -14 }, { 44, 3 }, - { 45, 6 }, { 44, 34 }, { 33, 54 }, { 19, 82 }, - { -3, 75 }, { -1, 23 }, { 1, 34 }, { 1, 43 }, - { 0, 54 }, { -2, 55 }, { 0, 61 }, { 1, 64 }, - { 0, 68 }, { -9, 92 }, - - /* 436 -> 459 */ - { -14, 106 }, { -13, 97 }, { -15, 90 }, { -12, 90 }, - { -18, 88 }, { -10, 73 }, { -9, 79 }, { -14, 86 }, - { -10, 73 }, { -10, 70 }, { -10, 69 }, { -5, 66 }, - { -9, 64 }, { -5, 58 }, { 2, 59 }, { 21, -10 }, - { 24, -11 }, { 28, -8 }, { 28, -1 }, { 29, 3 }, - { 29, 9 }, { 35, 20 }, { 29, 36 }, { 14, 67 } -}; - -static const int8_t cabac_context_init_PB[3][460][2] = -{ - /* i_cabac_init_idc == 0 */ - { - /* 0 - 10 */ - { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, - { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, - { -6, 53 }, { -1, 54 }, { 7, 51 }, - - /* 11 - 23 */ - { 23, 33 }, { 23, 2 }, { 21, 0 }, { 1, 9 }, - { 0, 49 }, { -37, 118 }, { 5, 57 }, { -13, 78 }, - { -11, 65 }, { 1, 62 }, { 12, 49 }, { -4, 73 }, - { 17, 50 }, - - /* 24 - 39 */ - { 18, 64 }, { 9, 43 }, { 29, 0 }, { 26, 67 }, - { 16, 90 }, { 9, 104 }, { -46, 127 }, { -20, 104 }, - { 1, 67 }, { -13, 78 }, { -11, 65 }, { 1, 62 }, - { -6, 86 }, { -17, 95 }, { -6, 61 }, { 9, 45 }, - - /* 40 - 53 */ - { -3, 69 }, { -6, 81 }, { -11, 96 }, { 6, 55 }, - { 7, 67 }, { -5, 86 }, { 2, 88 }, { 0, 58 }, - { -3, 76 }, { -10, 94 }, { 5, 54 }, { 4, 69 }, 
- { -3, 81 }, { 0, 88 }, - - /* 54 - 59 */ - { -7, 67 }, { -5, 74 }, { -4, 74 }, { -5, 80 }, - { -7, 72 }, { 1, 58 }, - - /* 60 - 69 */ - { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, - { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, - { 13, 41 }, { 3, 62 }, - - /* 70 - 87 */ - { 0, 45 }, { -4, 78 }, { -3, 96 }, { -27, 126 }, - { -28, 98 }, { -25, 101 }, { -23, 67 }, { -28, 82 }, - { -20, 94 }, { -16, 83 }, { -22, 110 }, { -21, 91 }, - { -18, 102 }, { -13, 93 }, { -29, 127 }, { -7, 92 }, - { -5, 89 }, { -7, 96 }, { -13, 108 }, { -3, 46 }, - { -1, 65 }, { -1, 57 }, { -9, 93 }, { -3, 74 }, - { -9, 92 }, { -8, 87 }, { -23, 126 }, { 5, 54 }, - { 6, 60 }, { 6, 59 }, { 6, 69 }, { -1, 48 }, - { 0, 68 }, { -4, 69 }, { -8, 88 }, - - /* 105 -> 165 */ - { -2, 85 }, { -6, 78 }, { -1, 75 }, { -7, 77 }, - { 2, 54 }, { 5, 50 }, { -3, 68 }, { 1, 50 }, - { 6, 42 }, { -4, 81 }, { 1, 63 }, { -4, 70 }, - { 0, 67 }, { 2, 57 }, { -2, 76 }, { 11, 35 }, - { 4, 64 }, { 1, 61 }, { 11, 35 }, { 18, 25 }, - { 12, 24 }, { 13, 29 }, { 13, 36 }, { -10, 93 }, - { -7, 73 }, { -2, 73 }, { 13, 46 }, { 9, 49 }, - { -7, 100 }, { 9, 53 }, { 2, 53 }, { 5, 53 }, - { -2, 61 }, { 0, 56 }, { 0, 56 }, { -13, 63 }, - { -5, 60 }, { -1, 62 }, { 4, 57 }, { -6, 69 }, - { 4, 57 }, { 14, 39 }, { 4, 51 }, { 13, 68 }, - { 3, 64 }, { 1, 61 }, { 9, 63 }, { 7, 50 }, - { 16, 39 }, { 5, 44 }, { 4, 52 }, { 11, 48 }, - { -5, 60 }, { -1, 59 }, { 0, 59 }, { 22, 33 }, - { 5, 44 }, { 14, 43 }, { -1, 78 }, { 0, 60 }, - { 9, 69 }, - - /* 166 - 226 */ - { 11, 28 }, { 2, 40 }, { 3, 44 }, { 0, 49 }, - { 0, 46 }, { 2, 44 }, { 2, 51 }, { 0, 47 }, - { 4, 39 }, { 2, 62 }, { 6, 46 }, { 0, 54 }, - { 3, 54 }, { 2, 58 }, { 4, 63 }, { 6, 51 }, - { 6, 57 }, { 7, 53 }, { 6, 52 }, { 6, 55 }, - { 11, 45 }, { 14, 36 }, { 8, 53 }, { -1, 82 }, - { 7, 55 }, { -3, 78 }, { 15, 46 }, { 22, 31 }, - { -1, 84 }, { 25, 7 }, { 30, -7 }, { 28, 3 }, - { 28, 4 }, { 32, 0 }, { 34, -1 }, { 30, 6 }, - { 30, 6 }, { 32, 9 }, { 31, 19 }, { 26, 27 }, - { 26, 30 }, { 37, 
20 }, { 28, 34 }, { 17, 70 }, - { 1, 67 }, { 5, 59 }, { 9, 67 }, { 16, 30 }, - { 18, 32 }, { 18, 35 }, { 22, 29 }, { 24, 31 }, - { 23, 38 }, { 18, 43 }, { 20, 41 }, { 11, 63 }, - { 9, 59 }, { 9, 64 }, { -1, 94 }, { -2, 89 }, - { -9, 108 }, - - /* 227 - 275 */ - { -6, 76 }, { -2, 44 }, { 0, 45 }, { 0, 52 }, - { -3, 64 }, { -2, 59 }, { -4, 70 }, { -4, 75 }, - { -8, 82 }, { -17, 102 }, { -9, 77 }, { 3, 24 }, - { 0, 42 }, { 0, 48 }, { 0, 55 }, { -6, 59 }, - { -7, 71 }, { -12, 83 }, { -11, 87 }, { -30, 119 }, - { 1, 58 }, { -3, 29 }, { -1, 36 }, { 1, 38 }, - { 2, 43 }, { -6, 55 }, { 0, 58 }, { 0, 64 }, - { -3, 74 }, { -10, 90 }, { 0, 70 }, { -4, 29 }, - { 5, 31 }, { 7, 42 }, { 1, 59 }, { -2, 58 }, - { -3, 72 }, { -3, 81 }, { -11, 97 }, { 0, 58 }, - { 8, 5 }, { 10, 14 }, { 14, 18 }, { 13, 27 }, - { 2, 40 }, { 0, 58 }, { -3, 70 }, { -6, 79 }, - { -8, 85 }, - - /* 276 a bit special (not used, bypass is used instead) */ - { 0, 0 }, - - /* 277 - 337 */ - { -13, 106 }, { -16, 106 }, { -10, 87 }, { -21, 114 }, - { -18, 110 }, { -14, 98 }, { -22, 110 }, { -21, 106 }, - { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 }, - { -10, 96 }, { -12, 95 }, { -5, 91 }, { -9, 93 }, - { -22, 94 }, { -5, 86 }, { 9, 67 }, { -4, 80 }, - { -10, 85 }, { -1, 70 }, { 7, 60 }, { 9, 58 }, - { 5, 61 }, { 12, 50 }, { 15, 50 }, { 18, 49 }, - { 17, 54 }, { 10, 41 }, { 7, 46 }, { -1, 51 }, - { 7, 49 }, { 8, 52 }, { 9, 41 }, { 6, 47 }, - { 2, 55 }, { 13, 41 }, { 10, 44 }, { 6, 50 }, - { 5, 53 }, { 13, 49 }, { 4, 63 }, { 6, 64 }, - { -2, 69 }, { -2, 59 }, { 6, 70 }, { 10, 44 }, - { 9, 31 }, { 12, 43 }, { 3, 53 }, { 14, 34 }, - { 10, 38 }, { -3, 52 }, { 13, 40 }, { 17, 32 }, - { 7, 44 }, { 7, 38 }, { 13, 50 }, { 10, 57 }, - { 26, 43 }, - - /* 338 - 398 */ - { 14, 11 }, { 11, 14 }, { 9, 11 }, { 18, 11 }, - { 21, 9 }, { 23, -2 }, { 32, -15 }, { 32, -15 }, - { 34, -21 }, { 39, -23 }, { 42, -33 }, { 41, -31 }, - { 46, -28 }, { 38, -12 }, { 21, 29 }, { 45, -24 }, - { 53, -45 }, { 48, -26 }, { 65, -43 }, { 
43, -19 }, - { 39, -10 }, { 30, 9 }, { 18, 26 }, { 20, 27 }, - { 0, 57 }, { -14, 82 }, { -5, 75 }, { -19, 97 }, - { -35, 125 }, { 27, 0 }, { 28, 0 }, { 31, -4 }, - { 27, 6 }, { 34, 8 }, { 30, 10 }, { 24, 22 }, - { 33, 19 }, { 22, 32 }, { 26, 31 }, { 21, 41 }, - { 26, 44 }, { 23, 47 }, { 16, 65 }, { 14, 71 }, - { 8, 60 }, { 6, 63 }, { 17, 65 }, { 21, 24 }, - { 23, 20 }, { 26, 23 }, { 27, 32 }, { 28, 23 }, - { 28, 24 }, { 23, 40 }, { 24, 32 }, { 28, 29 }, - { 23, 42 }, { 19, 57 }, { 22, 53 }, { 22, 61 }, - { 11, 86 }, - - /* 399 - 435 */ - { 12, 40 }, { 11, 51 }, { 14, 59 }, - { -4, 79 }, { -7, 71 }, { -5, 69 }, { -9, 70 }, - { -8, 66 }, { -10, 68 }, { -19, 73 }, { -12, 69 }, - { -16, 70 }, { -15, 67 }, { -20, 62 }, { -19, 70 }, - { -16, 66 }, { -22, 65 }, { -20, 63 }, { 9, -2 }, - { 26, -9 }, { 33, -9 }, { 39, -7 }, { 41, -2 }, - { 45, 3 }, { 49, 9 }, { 45, 27 }, { 36, 59 }, - { -6, 66 }, { -7, 35 }, { -7, 42 }, { -8, 45 }, - { -5, 48 }, { -12, 56 }, { -6, 60 }, { -5, 62 }, - { -8, 66 }, { -8, 76 }, - - /* 436 - 459 */ - { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, - { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, - { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, - { -14, 66 }, { 0, 59 }, { 2, 59 }, { 21, -13 }, - { 33, -14 }, { 39, -7 }, { 46, -2 }, { 51, 2 }, - { 60, 6 }, { 61, 17 }, { 55, 34 }, { 42, 62 }, - }, - - /* i_cabac_init_idc == 1 */ - { - /* 0 - 10 */ - { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, - { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, - { -6, 53 }, { -1, 54 }, { 7, 51 }, - - /* 11 - 23 */ - { 22, 25 }, { 34, 0 }, { 16, 0 }, { -2, 9 }, - { 4, 41 }, { -29, 118 }, { 2, 65 }, { -6, 71 }, - { -13, 79 }, { 5, 52 }, { 9, 50 }, { -3, 70 }, - { 10, 54 }, - - /* 24 - 39 */ - { 26, 34 }, { 19, 22 }, { 40, 0 }, { 57, 2 }, - { 41, 36 }, { 26, 69 }, { -45, 127 }, { -15, 101 }, - { -4, 76 }, { -6, 71 }, { -13, 79 }, { 5, 52 }, - { 6, 69 }, { -13, 90 }, { 0, 52 }, { 8, 43 }, - - /* 40 - 53 */ - { -2, 69 },{ -5, 82 },{ -10, 96 },{ 2, 59 }, - { 2, 
75 },{ -3, 87 },{ -3, 100 },{ 1, 56 }, - { -3, 74 },{ -6, 85 },{ 0, 59 },{ -3, 81 }, - { -7, 86 },{ -5, 95 }, - - /* 54 - 59 */ - { -1, 66 },{ -1, 77 },{ 1, 70 },{ -2, 86 }, - { -5, 72 },{ 0, 61 }, - - /* 60 - 69 */ - { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, - { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, - { 13, 41 }, { 3, 62 }, - - /* 70 - 104 */ - { 13, 15 }, { 7, 51 }, { 2, 80 }, { -39, 127 }, - { -18, 91 }, { -17, 96 }, { -26, 81 }, { -35, 98 }, - { -24, 102 }, { -23, 97 }, { -27, 119 }, { -24, 99 }, - { -21, 110 }, { -18, 102 }, { -36, 127 }, { 0, 80 }, - { -5, 89 }, { -7, 94 }, { -4, 92 }, { 0, 39 }, - { 0, 65 }, { -15, 84 }, { -35, 127 }, { -2, 73 }, - { -12, 104 }, { -9, 91 }, { -31, 127 }, { 3, 55 }, - { 7, 56 }, { 7, 55 }, { 8, 61 }, { -3, 53 }, - { 0, 68 }, { -7, 74 }, { -9, 88 }, - - /* 105 -> 165 */ - { -13, 103 }, { -13, 91 }, { -9, 89 }, { -14, 92 }, - { -8, 76 }, { -12, 87 }, { -23, 110 }, { -24, 105 }, - { -10, 78 }, { -20, 112 }, { -17, 99 }, { -78, 127 }, - { -70, 127 }, { -50, 127 }, { -46, 127 }, { -4, 66 }, - { -5, 78 }, { -4, 71 }, { -8, 72 }, { 2, 59 }, - { -1, 55 }, { -7, 70 }, { -6, 75 }, { -8, 89 }, - { -34, 119 }, { -3, 75 }, { 32, 20 }, { 30, 22 }, - { -44, 127 }, { 0, 54 }, { -5, 61 }, { 0, 58 }, - { -1, 60 }, { -3, 61 }, { -8, 67 }, { -25, 84 }, - { -14, 74 }, { -5, 65 }, { 5, 52 }, { 2, 57 }, - { 0, 61 }, { -9, 69 }, { -11, 70 }, { 18, 55 }, - { -4, 71 }, { 0, 58 }, { 7, 61 }, { 9, 41 }, - { 18, 25 }, { 9, 32 }, { 5, 43 }, { 9, 47 }, - { 0, 44 }, { 0, 51 }, { 2, 46 }, { 19, 38 }, - { -4, 66 }, { 15, 38 }, { 12, 42 }, { 9, 34 }, - { 0, 89 }, - - /* 166 - 226 */ - { 4, 45 }, { 10, 28 }, { 10, 31 }, { 33, -11 }, - { 52, -43 }, { 18, 15 }, { 28, 0 }, { 35, -22 }, - { 38, -25 }, { 34, 0 }, { 39, -18 }, { 32, -12 }, - { 102, -94 }, { 0, 0 }, { 56, -15 }, { 33, -4 }, - { 29, 10 }, { 37, -5 }, { 51, -29 }, { 39, -9 }, - { 52, -34 }, { 69, -58 }, { 67, -63 }, { 44, -5 }, - { 32, 7 }, { 55, -29 }, { 32, 1 }, { 0, 0 }, - { 27, 36 }, { 33, 
-25 }, { 34, -30 }, { 36, -28 }, - { 38, -28 }, { 38, -27 }, { 34, -18 }, { 35, -16 }, - { 34, -14 }, { 32, -8 }, { 37, -6 }, { 35, 0 }, - { 30, 10 }, { 28, 18 }, { 26, 25 }, { 29, 41 }, - { 0, 75 }, { 2, 72 }, { 8, 77 }, { 14, 35 }, - { 18, 31 }, { 17, 35 }, { 21, 30 }, { 17, 45 }, - { 20, 42 }, { 18, 45 }, { 27, 26 }, { 16, 54 }, - { 7, 66 }, { 16, 56 }, { 11, 73 }, { 10, 67 }, - { -10, 116 }, - - /* 227 - 275 */ - { -23, 112 }, { -15, 71 }, { -7, 61 }, { 0, 53 }, - { -5, 66 }, { -11, 77 }, { -9, 80 }, { -9, 84 }, - { -10, 87 }, { -34, 127 }, { -21, 101 }, { -3, 39 }, - { -5, 53 }, { -7, 61 }, { -11, 75 }, { -15, 77 }, - { -17, 91 }, { -25, 107 }, { -25, 111 }, { -28, 122 }, - { -11, 76 }, { -10, 44 }, { -10, 52 }, { -10, 57 }, - { -9, 58 }, { -16, 72 }, { -7, 69 }, { -4, 69 }, - { -5, 74 }, { -9, 86 }, { 2, 66 }, { -9, 34 }, - { 1, 32 }, { 11, 31 }, { 5, 52 }, { -2, 55 }, - { -2, 67 }, { 0, 73 }, { -8, 89 }, { 3, 52 }, - { 7, 4 }, { 10, 8 }, { 17, 8 }, { 16, 19 }, - { 3, 37 }, { -1, 61 }, { -5, 73 }, { -1, 70 }, - { -4, 78 }, - - /* 276 a bit special (not used, bypass is used instead) */ - { 0, 0 }, - - /* 277 - 337 */ - { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 }, - { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 }, - { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 }, - { -10, 95 }, { -14, 100 }, { -8, 95 }, { -17, 111 }, - { -28, 114 }, { -6, 89 }, { -2, 80 }, { -4, 82 }, - { -9, 85 }, { -8, 81 }, { -1, 72 }, { 5, 64 }, - { 1, 67 }, { 9, 56 }, { 0, 69 }, { 1, 69 }, - { 7, 69 }, { -7, 69 }, { -6, 67 }, { -16, 77 }, - { -2, 64 }, { 2, 61 }, { -6, 67 }, { -3, 64 }, - { 2, 57 }, { -3, 65 }, { -3, 66 }, { 0, 62 }, - { 9, 51 }, { -1, 66 }, { -2, 71 }, { -2, 75 }, - { -1, 70 }, { -9, 72 }, { 14, 60 }, { 16, 37 }, - { 0, 47 }, { 18, 35 }, { 11, 37 }, { 12, 41 }, - { 10, 41 }, { 2, 48 }, { 12, 41 }, { 13, 41 }, - { 0, 59 }, { 3, 50 }, { 19, 40 }, { 3, 66 }, - { 18, 50 }, - - /* 338 - 398 */ - { 19, -6 }, { 18, -6 }, { 14, 0 }, { 26, -12 }, - { 31, 
-16 }, { 33, -25 }, { 33, -22 }, { 37, -28 }, - { 39, -30 }, { 42, -30 }, { 47, -42 }, { 45, -36 }, - { 49, -34 }, { 41, -17 }, { 32, 9 }, { 69, -71 }, - { 63, -63 }, { 66, -64 }, { 77, -74 }, { 54, -39 }, - { 52, -35 }, { 41, -10 }, { 36, 0 }, { 40, -1 }, - { 30, 14 }, { 28, 26 }, { 23, 37 }, { 12, 55 }, - { 11, 65 }, { 37, -33 }, { 39, -36 }, { 40, -37 }, - { 38, -30 }, { 46, -33 }, { 42, -30 }, { 40, -24 }, - { 49, -29 }, { 38, -12 }, { 40, -10 }, { 38, -3 }, - { 46, -5 }, { 31, 20 }, { 29, 30 }, { 25, 44 }, - { 12, 48 }, { 11, 49 }, { 26, 45 }, { 22, 22 }, - { 23, 22 }, { 27, 21 }, { 33, 20 }, { 26, 28 }, - { 30, 24 }, { 27, 34 }, { 18, 42 }, { 25, 39 }, - { 18, 50 }, { 12, 70 }, { 21, 54 }, { 14, 71 }, - { 11, 83 }, - - /* 399 - 435 */ - { 25, 32 }, { 21, 49 }, { 21, 54 }, - { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, - { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, - { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, - { -14, 66 }, { 0, 59 }, { 2, 59 }, { 17, -10 }, - { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, - { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, - { -5, 71 }, { 0, 24 }, { -1, 36 }, { -2, 42 }, - { -2, 52 }, { -9, 57 }, { -6, 63 }, { -4, 65 }, - { -4, 67 }, { -7, 82 }, - - /* 436 - 459 */ - { -3, 81 }, { -3, 76 }, { -7, 72 }, { -6, 78 }, - { -12, 72 }, { -14, 68 }, { -3, 70 }, { -6, 76 }, - { -5, 66 }, { -5, 62 }, { 0, 57 }, { -4, 61 }, - { -9, 60 }, { 1, 54 }, { 2, 58 }, { 17, -10 }, - { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, - { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, - }, - - /* i_cabac_init_idc == 2 */ - { - /* 0 - 10 */ - { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, - { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, - { -6, 53 }, { -1, 54 }, { 7, 51 }, - - /* 11 - 23 */ - { 29, 16 }, { 25, 0 }, { 14, 0 }, { -10, 51 }, - { -3, 62 }, { -27, 99 }, { 26, 16 }, { -4, 85 }, - { -24, 102 }, { 5, 57 }, { 6, 57 }, { -17, 73 }, - { 14, 57 }, - - /* 24 - 39 */ - { 20, 40 }, { 20, 10 }, { 29, 0 }, { 54, 0 }, - { 37, 42 }, { 12, 
97 }, { -32, 127 }, { -22, 117 }, - { -2, 74 }, { -4, 85 }, { -24, 102 }, { 5, 57 }, - { -6, 93 }, { -14, 88 }, { -6, 44 }, { 4, 55 }, - - /* 40 - 53 */ - { -11, 89 },{ -15, 103 },{ -21, 116 },{ 19, 57 }, - { 20, 58 },{ 4, 84 },{ 6, 96 },{ 1, 63 }, - { -5, 85 },{ -13, 106 },{ 5, 63 },{ 6, 75 }, - { -3, 90 },{ -1, 101 }, - - /* 54 - 59 */ - { 3, 55 },{ -4, 79 },{ -2, 75 },{ -12, 97 }, - { -7, 50 },{ 1, 60 }, - - /* 60 - 69 */ - { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, - { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, - { 13, 41 }, { 3, 62 }, - - /* 70 - 104 */ - { 7, 34 }, { -9, 88 }, { -20, 127 }, { -36, 127 }, - { -17, 91 }, { -14, 95 }, { -25, 84 }, { -25, 86 }, - { -12, 89 }, { -17, 91 }, { -31, 127 }, { -14, 76 }, - { -18, 103 }, { -13, 90 }, { -37, 127 }, { 11, 80 }, - { 5, 76 }, { 2, 84 }, { 5, 78 }, { -6, 55 }, - { 4, 61 }, { -14, 83 }, { -37, 127 }, { -5, 79 }, - { -11, 104 }, { -11, 91 }, { -30, 127 }, { 0, 65 }, - { -2, 79 }, { 0, 72 }, { -4, 92 }, { -6, 56 }, - { 3, 68 }, { -8, 71 }, { -13, 98 }, - - /* 105 -> 165 */ - { -4, 86 }, { -12, 88 }, { -5, 82 }, { -3, 72 }, - { -4, 67 }, { -8, 72 }, { -16, 89 }, { -9, 69 }, - { -1, 59 }, { 5, 66 }, { 4, 57 }, { -4, 71 }, - { -2, 71 }, { 2, 58 }, { -1, 74 }, { -4, 44 }, - { -1, 69 }, { 0, 62 }, { -7, 51 }, { -4, 47 }, - { -6, 42 }, { -3, 41 }, { -6, 53 }, { 8, 76 }, - { -9, 78 }, { -11, 83 }, { 9, 52 }, { 0, 67 }, - { -5, 90 }, { 1, 67 }, { -15, 72 }, { -5, 75 }, - { -8, 80 }, { -21, 83 }, { -21, 64 }, { -13, 31 }, - { -25, 64 }, { -29, 94 }, { 9, 75 }, { 17, 63 }, - { -8, 74 }, { -5, 35 }, { -2, 27 }, { 13, 91 }, - { 3, 65 }, { -7, 69 }, { 8, 77 }, { -10, 66 }, - { 3, 62 }, { -3, 68 }, { -20, 81 }, { 0, 30 }, - { 1, 7 }, { -3, 23 }, { -21, 74 }, { 16, 66 }, - { -23, 124 }, { 17, 37 }, { 44, -18 }, { 50, -34 }, - { -22, 127 }, - - /* 166 - 226 */ - { 4, 39 }, { 0, 42 }, { 7, 34 }, { 11, 29 }, - { 8, 31 }, { 6, 37 }, { 7, 42 }, { 3, 40 }, - { 8, 33 }, { 13, 43 }, { 13, 36 }, { 4, 47 }, - { 3, 55 }, { 2, 58 }, { 
6, 60 }, { 8, 44 }, - { 11, 44 }, { 14, 42 }, { 7, 48 }, { 4, 56 }, - { 4, 52 }, { 13, 37 }, { 9, 49 }, { 19, 58 }, - { 10, 48 }, { 12, 45 }, { 0, 69 }, { 20, 33 }, - { 8, 63 }, { 35, -18 }, { 33, -25 }, { 28, -3 }, - { 24, 10 }, { 27, 0 }, { 34, -14 }, { 52, -44 }, - { 39, -24 }, { 19, 17 }, { 31, 25 }, { 36, 29 }, - { 24, 33 }, { 34, 15 }, { 30, 20 }, { 22, 73 }, - { 20, 34 }, { 19, 31 }, { 27, 44 }, { 19, 16 }, - { 15, 36 }, { 15, 36 }, { 21, 28 }, { 25, 21 }, - { 30, 20 }, { 31, 12 }, { 27, 16 }, { 24, 42 }, - { 0, 93 }, { 14, 56 }, { 15, 57 }, { 26, 38 }, - { -24, 127 }, - - /* 227 - 275 */ - { -24, 115 }, { -22, 82 }, { -9, 62 }, { 0, 53 }, - { 0, 59 }, { -14, 85 }, { -13, 89 }, { -13, 94 }, - { -11, 92 }, { -29, 127 }, { -21, 100 }, { -14, 57 }, - { -12, 67 }, { -11, 71 }, { -10, 77 }, { -21, 85 }, - { -16, 88 }, { -23, 104 }, { -15, 98 }, { -37, 127 }, - { -10, 82 }, { -8, 48 }, { -8, 61 }, { -8, 66 }, - { -7, 70 }, { -14, 75 }, { -10, 79 }, { -9, 83 }, - { -12, 92 }, { -18, 108 }, { -4, 79 }, { -22, 69 }, - { -16, 75 }, { -2, 58 }, { 1, 58 }, { -13, 78 }, - { -9, 83 }, { -4, 81 }, { -13, 99 }, { -13, 81 }, - { -6, 38 }, { -13, 62 }, { -6, 58 }, { -2, 59 }, - { -16, 73 }, { -10, 76 }, { -13, 86 }, { -9, 83 }, - { -10, 87 }, - - /* 276 a bit special (not used, bypass is used instead) */ - { 0, 0 }, - - /* 277 - 337 */ - { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 }, - { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 }, - { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 }, - { -10, 94 }, { -15, 102 }, { -10, 99 }, { -13, 106 }, - { -50, 127 }, { -5, 92 }, { 17, 57 }, { -5, 86 }, - { -13, 94 }, { -12, 91 }, { -2, 77 }, { 0, 71 }, - { -1, 73 }, { 4, 64 }, { -7, 81 }, { 5, 64 }, - { 15, 57 }, { 1, 67 }, { 0, 68 }, { -10, 67 }, - { 1, 68 }, { 0, 77 }, { 2, 64 }, { 0, 68 }, - { -5, 78 }, { 7, 55 }, { 5, 59 }, { 2, 65 }, - { 14, 54 }, { 15, 44 }, { 5, 60 }, { 2, 70 }, - { -2, 76 }, { -18, 86 }, { 12, 70 }, { 5, 64 }, - { -12, 70 }, { 11, 55 }, { 5, 
56 }, { 0, 69 }, - { 2, 65 }, { -6, 74 }, { 5, 54 }, { 7, 54 }, - { -6, 76 }, { -11, 82 }, { -2, 77 }, { -2, 77 }, - { 25, 42 }, - - /* 338 - 398 */ - { 17, -13 }, { 16, -9 }, { 17, -12 }, { 27, -21 }, - { 37, -30 }, { 41, -40 }, { 42, -41 }, { 48, -47 }, - { 39, -32 }, { 46, -40 }, { 52, -51 }, { 46, -41 }, - { 52, -39 }, { 43, -19 }, { 32, 11 }, { 61, -55 }, - { 56, -46 }, { 62, -50 }, { 81, -67 }, { 45, -20 }, - { 35, -2 }, { 28, 15 }, { 34, 1 }, { 39, 1 }, - { 30, 17 }, { 20, 38 }, { 18, 45 }, { 15, 54 }, - { 0, 79 }, { 36, -16 }, { 37, -14 }, { 37, -17 }, - { 32, 1 }, { 34, 15 }, { 29, 15 }, { 24, 25 }, - { 34, 22 }, { 31, 16 }, { 35, 18 }, { 31, 28 }, - { 33, 41 }, { 36, 28 }, { 27, 47 }, { 21, 62 }, - { 18, 31 }, { 19, 26 }, { 36, 24 }, { 24, 23 }, - { 27, 16 }, { 24, 30 }, { 31, 29 }, { 22, 41 }, - { 22, 42 }, { 16, 60 }, { 15, 52 }, { 14, 60 }, - { 3, 78 }, { -16, 123 }, { 21, 53 }, { 22, 56 }, - { 25, 61 }, - - /* 399 - 435 */ - { 21, 33 }, { 19, 50 }, { 17, 61 }, - { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, - { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, - { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, - { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, - { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, - { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, - { -9, 71 }, { -7, 37 }, { -8, 44 }, { -11, 49 }, - { -10, 56 }, { -12, 59 }, { -8, 63 }, { -9, 67 }, - { -6, 68 }, { -10, 79 }, - - /* 436 - 459 */ - { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, - { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, - { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, - { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, - { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, - { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, - } -}; - -static const uint8_t left_block_options[4][16]={ - {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, - {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, - {0,0,1,1,7,10,7,10,7+0*8, 
7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}, - {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8} -}; - -void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c) { - int i; - const int8_t (*tab)[2]; - - if( s->slice_type_nos == FF_I_TYPE ) tab = cabac_context_init_I; - else tab = cabac_context_init_PB[s->cabac_init_idc]; - - /* calculate pre-state */ - for( i= 0; i < 460; i++ ) { - int pre = 2*(((tab[i][0] * s->qscale) >>4 ) + tab[i][1]) - 127; - - pre^= pre>>31; - if(pre > 124) - pre= 124 + (pre&1); - - c->cabac_state[i] = pre; - } -} - -static void fill_decode_neighbors(H264Cabac_spu *hc, EDSlice_spu *s){ - H264Mb *m = s->m; - const int mb_x = m->mb_x; - const int mb_y = m->mb_y; - - m->top_type = hc->mb_type_top[mb_x]; - m->left_type = hc->mb_type[mb_x-1] ; - -} - -static void fill_decode_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){ - H264Mb *m = s->m; - int topleft_xy, top_xy, topright_xy, left_xy; - int topleft_type, top_type, topright_type, left_type; - const uint8_t * left_block= left_block_options[0]; - const int mb_x = m->mb_x; - const int mb_y = m->mb_y; - const int b_stride = hc->b_stride; - int i; - - topleft_type = hc->mb_type_top[mb_x-1] ; - top_type = m->top_type ; - topright_type= hc->mb_type_top[mb_x+1] ; - left_type = m->left_type ; - - if (s->slice_type_nos == FF_B_TYPE){ - get_list = get_list_buf; - for(int i=0; i<2; i++){ - get_dma_list(hc->list1_motion_val[i], s->list1.motion_val[i][4*mb_x + 4*mb_y*b_stride], 16, 4, b_stride*2*sizeof(int16_t), ED_get_mv, 0); - } - if (hc->blocking) wait_dma_id(ED_get_mv); - } - - if(!IS_SKIP(mb_type)){ - if(IS_INTRA(mb_type)){ - int type_mask= s->pps.constrained_intra_pred ? 
IS_INTRA(-1) : -1; - m->topleft_samples_available= - m->top_samples_available= - m->left_samples_available= 0xFFFF; - m->topright_samples_available= 0xEEEA; - - if(!(top_type & type_mask)){ - m->topleft_samples_available= 0xB3FF; - m->top_samples_available= 0x33FF; - m->topright_samples_available= 0x26EA; - } - if(!(left_type & type_mask)){ - m->topleft_samples_available&= 0xDF5F; - m->left_samples_available&= 0x5F5F; - } - - if(!(topleft_type & type_mask)) - m->topleft_samples_available&= 0x7FFF; - - if(!(topright_type & type_mask)) - m->topright_samples_available&= 0xFBFF; - - if(IS_INTRA4x4(mb_type)){ - if(IS_INTRA4x4(top_type)){ - AV_COPY32(m->intra4x4_pred_mode_cache+4+8*0, &hc->intra4x4_pred_mode_top[8*mb_x]); - }else{ - m->intra4x4_pred_mode_cache[4+8*0]= - m->intra4x4_pred_mode_cache[5+8*0]= - m->intra4x4_pred_mode_cache[6+8*0]= - m->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask); - } - for(i=0; i<2; i++){ - if(IS_INTRA4x4(left_type)){ - int8_t *mode= &hc->intra4x4_pred_mode[8*(mb_x-1)]; - m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= mode[6-left_block[0+2*i]]; - m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= mode[6-left_block[1+2*i]]; - }else{ - m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= - m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= 2 - 3*!(left_type & type_mask); - } - } - } - } - if(top_type){ - AV_COPY32(&m->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]); - m->non_zero_count_cache[1+8*0]= hc->non_zero_count_top[mb_x][1+1*8]; - m->non_zero_count_cache[2+8*0]= hc->non_zero_count_top[mb_x][2+1*8]; - m->non_zero_count_cache[1+8*3]= hc->non_zero_count_top[mb_x][1+2*8]; - m->non_zero_count_cache[2+8*3]= hc->non_zero_count_top[mb_x][2+2*8]; - }else { - m->non_zero_count_cache[1+8*0]= - m->non_zero_count_cache[2+8*0]= - m->non_zero_count_cache[1+8*3]= - m->non_zero_count_cache[2+8*3]= - AV_WN32A(&m->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 
0 : 0x40404040); - } - - for (i=0; i<2; i++) { - if(left_type){ - m->non_zero_count_cache[3+8*1 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+0+2*i]]; - m->non_zero_count_cache[3+8*2 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+1+2*i]]; - m->non_zero_count_cache[0+8*1 + 8*i]= hc->non_zero_count[mb_x-1][left_block[8+4+2*i]]; - m->non_zero_count_cache[0+8*4 + 8*i]= hc->non_zero_count[mb_x-1][left_block[8+5+2*i]]; - }else{ - m->non_zero_count_cache[3+8*1 + 2*8*i]= - m->non_zero_count_cache[3+8*2 + 2*8*i]= - m->non_zero_count_cache[0+8*1 + 8*i]= - m->non_zero_count_cache[0+8*4 + 8*i]= !IS_INTRA(mb_type) ? 0 : 64; - } - } - - - // top_cbp - if(top_type) { - hc->top_cbp = hc->cbp_top[mb_x]; - } else { - hc->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F; - } - // left_cbp - if (left_type) { - hc->left_cbp = (hc->cbp[mb_x-1] & 0x1f0) - | ((hc->cbp[mb_x-1]>>(left_block[0]&(~1)))&2) - | (((hc->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2); - } else { - hc->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F; - } - } - - if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){ - int list; - - m->ref_cache[0][scan8[5 ]+1] = m->ref_cache[0][scan8[7 ]+1] = m->ref_cache[0][scan8[13]+1] = - m->ref_cache[1][scan8[5 ]+1] = m->ref_cache[1][scan8[7 ]+1] = m->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE; - - for(list=0; listlist_count; list++){ - if(!USES_LIST(mb_type, list)){ - continue; - } - assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred)); - - if(USES_LIST(top_type, list)){ - const int b_xy= 4*mb_x + 3*hc->b_stride; - AV_COPY128(m->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]); - m->ref_cache[list][scan8[0] + 0 - 1*8]= - m->ref_cache[list][scan8[0] + 1 - 1*8]= hc->ref_index_top[list][4*mb_x + 2]; - m->ref_cache[list][scan8[0] + 2 - 1*8]= - m->ref_cache[list][scan8[0] + 3 - 1*8]= hc->ref_index_top[list][4*mb_x + 3]; - }else{ - AV_ZERO128(m->mv_cache[list][scan8[0] + 0 - 1*8]); - AV_WN32A(&m->ref_cache[list][scan8[0] + 0 - 1*8], 
((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101); - } - - if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){ - for(i=0; i<2; i++){ - int cache_idx = scan8[0] - 1 + i*2*8; - if(USES_LIST(left_type, list)){ - const int b_xy= 4*(mb_x-1) + 3; - const int b8_x= 4*(mb_x-1) + 1; - AV_COPY32(m->mv_cache[list][cache_idx ], hc->motion_val[list][b_xy + hc->b_stride*left_block[0+i*2]]); - AV_COPY32(m->mv_cache[list][cache_idx+8], hc->motion_val[list][b_xy + hc->b_stride*left_block[1+i*2]]); - m->ref_cache[list][cache_idx ]= hc->ref_index[list][b8_x + (left_block[0+i*2]&~1)]; - m->ref_cache[list][cache_idx+8]= hc->ref_index[list][b8_x + (left_block[1+i*2]&~1)]; - }else{ - AV_ZERO32(m->mv_cache [list][cache_idx ]); - AV_ZERO32(m->mv_cache [list][cache_idx+8]); - m->ref_cache[list][cache_idx ]= - m->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE); - } - } - }else{ - if(USES_LIST(left_type, list)){ - const int b_x = 4*(mb_x-1) + 3; - const int b8_x= 4*(mb_x-1) + 1; - AV_COPY32(m->mv_cache[list][scan8[0] - 1], hc->motion_val[list][b_x + hc->b_stride*left_block[0]]); - m->ref_cache[list][scan8[0] - 1]= hc->ref_index[list][b8_x + (left_block[0]&~1)]; - }else{ - AV_ZERO32(m->mv_cache [list][scan8[0] - 1]); - m->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; - } - } - - if(USES_LIST(topright_type, list)){ - const int b_xy= 4*(mb_x+1) + 3*hc->b_stride; - AV_COPY32(m->mv_cache[list][scan8[0] + 4 - 1*8], hc->motion_val_top[list][b_xy]); - m->ref_cache[list][scan8[0] + 4 - 1*8]= hc->ref_index_top[list][4*(mb_x+1) + 2]; - }else{ - AV_ZERO32(m->mv_cache [list][scan8[0] + 4 - 1*8]); - m->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE; - } - if(m->ref_cache[list][scan8[0] + 4 - 1*8] < 0){ - int topleft_partition= -1; - if(USES_LIST(topleft_type, list)){ - const int b_xy = 4*(mb_x-1) + 3 + hc->b_stride + (topleft_partition & 2*hc->b_stride); - const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2); - AV_COPY32(m->mv_cache[list][scan8[0] - 1 - 1*8], hc->motion_val_top[list][b_xy]); - m->ref_cache[list][scan8[0] - 1 - 1*8]= hc->ref_index_top[list][b8_x]; - }else{ - AV_ZERO32(m->mv_cache[list][scan8[0] - 1 - 1*8]); - m->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; - } - } - - if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) - continue; - - if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) { - m->ref_cache[list][scan8[4 ]] = - m->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE; - AV_ZERO32(m->mv_cache [list][scan8[4 ]]); - AV_ZERO32(m->mv_cache [list][scan8[12]]); - - - /* XXX beurk, Load mvd */ - if(USES_LIST(top_type, list)){ -// const int b_xy= hc->mb2br_top_xy; - AV_COPY64(hc->mvd_cache[list][scan8[0] + 0 - 1*8], hc->mvd_top[list][8*mb_x + 0]); - }else{ - AV_ZERO64(hc->mvd_cache[list][scan8[0] + 0 - 1*8]); - } - if(USES_LIST(left_type, list)){ -// const int b_xy= hc->mb2br_left_xy + 6; - AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 0*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[0]]); - AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 1*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[1]]); - }else{ - AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 0*8]); - AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 1*8]); - } - if(USES_LIST(left_type, list)){ -// const int b_xy= hc->mb2br_left_xy + 6; - AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 2*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[2]]); - AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 3*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[3]]); - }else{ - AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 2*8]); - AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 3*8]); - } - AV_ZERO16(hc->mvd_cache 
[list][scan8[4 ]]); - AV_ZERO16(hc->mvd_cache [list][scan8[12]]); - if(s->slice_type_nos == FF_B_TYPE){ - fill_rectangle(&hc->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1); - - if(IS_DIRECT(top_type)){ - AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1)); - }else if(IS_8X8(top_type)){ - int b8_x = 4*mb_x; - hc->direct_cache[scan8[0] + 0 - 1*8]= hc->direct_top[b8_x + 2]; - hc->direct_cache[scan8[0] + 2 - 1*8]= hc->direct_top[b8_x + 3]; - }else{ - AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1)); - } - - if(IS_DIRECT(left_type)) - hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1; - else if(IS_8X8(left_type)) - hc->direct_cache[scan8[0] - 1 + 0*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)]; - else - hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1; - - if(IS_DIRECT(left_type)) - hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1; - else if(IS_8X8(left_type)) - hc->direct_cache[scan8[0] - 1 + 2*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)]; - else - hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1; - } - } - } - } - hc->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type); - - if (s->slice_type_nos == FF_B_TYPE){ - wait_dma_id(ED_get_mv); - } -} - -static int check_mv(H264Cabac_spu *hc, EDSlice_spu *s, long b_idx, long bn_idx, int mvy_limit){ - int v; - - v= hc->ref_cache[0][b_idx] != hc->ref_cache[0][bn_idx]; - if(!v && hc->ref_cache[0][b_idx]!=-1) - // absolute value >= 7 | ... 
- v= ((unsigned) (hc->mv_cache[0][b_idx][0] - hc->mv_cache[0][bn_idx][0] + 3) >= 7U) | - ((FFABS( hc->mv_cache[0][b_idx][1] - hc->mv_cache[0][bn_idx][1] )) >= mvy_limit); - - if(s->list_count==2){ - if(!v) - v = (hc->ref_cache[1][b_idx] != hc->ref_cache[1][bn_idx]) | - ((unsigned) (hc->mv_cache[1][b_idx][0] - hc->mv_cache[1][bn_idx][0] + 3) >= 7U) | - ((FFABS( hc->mv_cache[1][b_idx][1] - hc->mv_cache[1][bn_idx][1] )) >= mvy_limit); - - if(v){ - if((hc->ref_cache[0][b_idx] != hc->ref_cache[1][bn_idx]) | - (hc->ref_cache[1][b_idx] != hc->ref_cache[0][bn_idx])) - return 1; - return - ((unsigned) (hc->mv_cache[0][b_idx][0] - hc->mv_cache[1][bn_idx][0] + 3) >= 7U) | - ((FFABS( hc->mv_cache[0][b_idx][1] - hc->mv_cache[1][bn_idx][1] )) >= mvy_limit) | - ((unsigned) (hc->mv_cache[1][b_idx][0] - hc->mv_cache[0][bn_idx][0] + 3) >= 7U) | - ((FFABS( hc->mv_cache[1][b_idx][1] - hc->mv_cache[0][bn_idx][1] )) >= mvy_limit); - } - } - - return v; -} - -static void calc_bS_values(H264Cabac_spu *hc, EDSlice_spu *s, int mvy_limit, int dir) { - H264Mb *m = s->m; - int mb_type = m->mb_type; - int edge; - const int mbm_type = dir == 0 ? m->left_type : m->top_type; - - // how often to recheck mv-based bS when iterating between edges - static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1}, - {0,3,1,1,3,3,3,3}}; - const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7]; - const int edges = mask_edge== 3 && !(m->cbp&15) ? 1 : 4; - // how often to recheck mv-based bS when iterating along each edge - const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); - - m->edges[dir]= edges; - - if(mbm_type){ - int16_t* bS=m->bS[dir][0]; - if( IS_INTRA(mb_type|mbm_type)) { - AV_WN64A(bS, 0x0004000400040004ULL); - } else { - int i; - int mv_done; - if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { - int b_idx= 8 + 4; - int bn_idx= b_idx - (dir ? 
8:1); - - bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, 8 + 4, bn_idx, mvy_limit); - mv_done = 1; - } - else - mv_done = 0; - - for( i = 0; i < 4; i++ ) { - int x = dir == 0 ? 0 : i; - int y = dir == 0 ? i : 0; - int b_idx= 8 + 4 + x + 8*y; - int bn_idx= b_idx - (dir ? 8:1); - - if( hc->non_zero_count_cache[b_idx] | - hc->non_zero_count_cache[bn_idx] ) { - bS[i] = 2; - } - else if(!mv_done) - { - bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit); - } - } - } - } - - /* Calculate bS */ - for( edge = 1; edge < edges; edge++ ) { - int16_t* bS=m->bS[dir][edge]; - - if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) - continue; - - if( IS_INTRA(mb_type)) { - AV_WN64A(bS, 0x0003000300030003ULL); - } else { - int i; - int mv_done; - - if( edge & mask_edge ) { - AV_ZERO64(bS); - mv_done = 1; - } - else if( mask_par0 ) { - int b_idx= 8 + 4 + edge * (dir ? 8:1); - int bn_idx= b_idx - (dir ? 8:1); - - bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, b_idx, bn_idx, mvy_limit); - mv_done = 1; - } - else - mv_done = 0; - - for( i = 0; i < 4; i++ ) { - int x = dir == 0 ? edge : i; - int y = dir == 0 ? i : edge; - int b_idx= 8 + 4 + x + 8*y; - int bn_idx= b_idx - (dir ? 8:1); - - if( hc->non_zero_count_cache[b_idx] | - hc->non_zero_count_cache[bn_idx] ) { - bS[i] = 2; - } - else if(!mv_done) - { - bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit); - } - } - - if(bS[0]+bS[1]+bS[2]+bS[3] == 0) - continue; - } - - } -} - -/** -* -* @return zero if the loop filter can be skiped -*/ -static int fill_filter_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){ - H264Mb *m = s->m; - const int mb_x = m->mb_x; - const int mb_y = m->mb_y; - int top_type, left_type; - int qp, top_qp, left_qp; - int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice - - m->dequant4_coeff_y = hc->dequant4_coeff[0][s->qscale][0]; - m->dequant4_coeff_cb = hc->dequant4_coeff[IS_INTRA(mb_type) ? 
1:4][s->chroma_qp[0]][0]; - m->dequant4_coeff_cr = hc->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][s->chroma_qp[1]][0]; - - m->qscale_mb_xy = qp = hc->qscale[mb_x]; - m->qscale_left_mb_xy = left_qp = hc->qscale[mb_x-1]; - m->qscale_top_mb_xy = top_qp = hc->qscale_top[mb_x]; - - //for sufficiently low qp, filtering wouldn't do anything - //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp - if(qp <= qp_thresh - && (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh) - && ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){ - m->deblock_mb = 0; - return 0; - } - - - m->deblock_mb = 1; - - top_type = hc->mb_type_top[mb_x] ; - left_type = hc->mb_type[mb_x -1]; - - m->top_type = top_type ; - m->left_type = left_type; - - if(IS_INTRA(mb_type)){ - calc_bS_values(hc, s, 4, 0); - calc_bS_values(hc, s, 4, 1); - return 1; - } - - AV_COPY64(&hc->non_zero_count_cache[0+8*1], &hc->non_zero_count[mb_x][ 0]); - AV_COPY64(&hc->non_zero_count_cache[0+8*2], &hc->non_zero_count[mb_x][ 8]); - AV_COPY32(&hc->non_zero_count_cache[0+8*5], &hc->non_zero_count[mb_x][16]); - AV_COPY32(&hc->non_zero_count_cache[4+8*3], &hc->non_zero_count[mb_x][20]); - AV_COPY64(&hc->non_zero_count_cache[0+8*4], &hc->non_zero_count[mb_x][24]); - - m->cbp= hc->cbp[mb_x]; - - { - int list; - for(list=0; listlist_count; list++){ - int8_t *ref; - int y, b_stride; - int16_t (*mv_dst)[2]; - int16_t (*mv_src)[2]; - - if(!USES_LIST(mb_type, list)){ - fill_rectangle( hc->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); - AV_WN32A(&hc->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u); - AV_WN32A(&hc->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u); - AV_WN32A(&hc->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u); - AV_WN32A(&hc->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u); - continue; - } - - ref = &hc->ref_index[list][4*mb_x]; - { - int (*ref2frm)[64] =(void *) (s->ref2frm[0] + 2); - 
AV_WN32A(&hc->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); - AV_WN32A(&hc->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); - ref += 2; - AV_WN32A(&hc->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); - AV_WN32A(&hc->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); - } - b_stride = hc->b_stride; - mv_dst = &hc->mv_cache[list][scan8[0]]; - mv_src = &hc->motion_val[list][4*mb_x]; - for(y=0; y<4; y++){ - AV_COPY128(mv_dst + 8*y, mv_src + y*b_stride); - } - - } - } - - /* - 0 . T T. T T T T - 1 L . .L . . . . - 2 L . .L . . . . - 3 . T TL . . . . - 4 L . .L . . . . - 5 L . .. . . . . - */ - //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) - if(top_type){ - AV_COPY32(&hc->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]); - } - - if(left_type){ - hc->non_zero_count_cache[3+8*1]= hc->non_zero_count[mb_x-1][7+0*8]; - hc->non_zero_count_cache[3+8*2]= hc->non_zero_count[mb_x-1][7+1*8]; - hc->non_zero_count_cache[3+8*3]= hc->non_zero_count[mb_x-1][7+2*8]; - hc->non_zero_count_cache[3+8*4]= hc->non_zero_count[mb_x-1][7+3*8]; - } - - if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){ - int list; - for(list=0; listlist_count; list++){ - if(USES_LIST(top_type, list)){ - const int b_xy= 4*mb_x + 3*hc->b_stride; - const int b8_x= 4*mb_x + 2; - int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); - AV_COPY128(hc->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]); - hc->ref_cache[list][scan8[0] + 0 - 1*8]= - hc->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 0]]; - hc->ref_cache[list][scan8[0] + 2 - 1*8]= - hc->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 1]]; - }else{ - 
AV_ZERO128(hc->mv_cache[list][scan8[0] + 0 - 1*8]); - AV_WN32A(&hc->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u); - } - - if(USES_LIST(left_type, list)){ - const int b_x = 4*(mb_x-1) + 3; - const int b8_x= 4*(mb_x-1) + 1; - int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); - AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 0 ], hc->motion_val[list][b_x + hc->b_stride*0]); - AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 8 ], hc->motion_val[list][b_x + hc->b_stride*1]); - AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +16 ], hc->motion_val[list][b_x + hc->b_stride*2]); - AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +24 ], hc->motion_val[list][b_x + hc->b_stride*3]); - hc->ref_cache[list][scan8[0] - 1 + 0 ]= - hc->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*0]]; - hc->ref_cache[list][scan8[0] - 1 +16 ]= - hc->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*1]]; - }else{ - AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 0 ]); - AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 8 ]); - AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +16 ]); - AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +24 ]); - hc->ref_cache[list][scan8[0] - 1 + 0 ]= - hc->ref_cache[list][scan8[0] - 1 + 8 ]= - hc->ref_cache[list][scan8[0] - 1 + 16 ]= - hc->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED; - } - } - } - calc_bS_values(hc, s, 4, 0); - calc_bS_values(hc, s, 4, 1); - return 1; -} - - -/** -* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks. 
-*/ -static int check_intra4x4_pred_mode(EDSlice_spu *s){ - H264Mb *m = s->m; - static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0}; - static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED}; - int i; - - if(!(m->top_samples_available&0x8000)){ - for(i=0; i<4; i++){ - int status= top[ m->intra4x4_pred_mode_cache[scan8[0] + i] ]; - if(status<0){ - fprintf(stderr, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y); - return -1; - } else if(status){ - m->intra4x4_pred_mode_cache[scan8[0] + i]= status; - } - } - } - - if((m->left_samples_available&0x8888)!=0x8888){ - static const int mask[4]={0x8000,0x2000,0x80,0x20}; - for(i=0; i<4; i++){ - if(!(m->left_samples_available&mask[i])){ - int status= left[ m->intra4x4_pred_mode_cache[scan8[0] + 8*i] ]; - if(status<0){ - fprintf(stderr, "left block unavailable for requested intra4x4 mode %d at %d %d, %x\n", status, m->mb_x, m->mb_y, m->left_samples_available); - return -1; - } else if(status){ - m->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status; - } - } - } - } - return 0; -} - -/** -* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks. 
-*/ -static int check_intra_pred_mode(EDSlice_spu *s, int mode){ - H264Mb *m = s->m; - static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1}; - static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8}; - - if(mode > 6) { - fprintf(stderr, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y); - return -1; - } - - if(!(m->top_samples_available&0x8000)){ - mode= top[ mode ]; - if(mode<0){ - fprintf(stderr, "top block unavailable for requested intra mode %d at %d %d\n", mode, m->mb_x, m->mb_y); - return -1; - } - } - - if((m->left_samples_available&0x8080) != 0x8080){ - mode= left[ mode ]; - if(m->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred - mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(m->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8); - } - if(mode<0){ - fprintf(stderr, "left block unavailable for requested intra mode %d at %d %d\n", mode, m->mb_x, m->mb_y); - return -1; - } - } - return mode; -} - -/** - * gets the predicted intra4x4 prediction mode. 
- */ -static inline int pred_intra_mode(EDSlice_spu *s, int n){ - H264Mb *m = s->m; - const int index8= scan8[n]; - const int left= m->intra4x4_pred_mode_cache[index8 - 1]; - const int top = m->intra4x4_pred_mode_cache[index8 - 8]; - const int min= FFMIN(left, top); - - if(min<0) return DC_PRED; - else return min; -} - -static void write_back_intra_pred_mode(H264Cabac_spu *hc, EDSlice_spu *s){ - H264Mb *m = s->m; - const int mb_x = m->mb_x; - int8_t *mode= &hc->intra4x4_pred_mode[8*mb_x]; - - AV_COPY32(mode, m->intra4x4_pred_mode_cache + 4 + 8*4); - mode[4]= m->intra4x4_pred_mode_cache[7+8*3]; - mode[5]= m->intra4x4_pred_mode_cache[7+8*2]; - mode[6]= m->intra4x4_pred_mode_cache[7+8*1]; -} - -static inline void write_back_non_zero_count(H264Cabac_spu *hc, EDSlice_spu *s){ - H264Mb *m = s->m; - const int mb_x= m->mb_x; - - AV_COPY64(&hc->non_zero_count[mb_x][ 0], &m->non_zero_count_cache[0+8*1]); - AV_COPY64(&hc->non_zero_count[mb_x][ 8], &m->non_zero_count_cache[0+8*2]); - AV_COPY32(&hc->non_zero_count[mb_x][16], &m->non_zero_count_cache[0+8*5]); - AV_COPY32(&hc->non_zero_count[mb_x][20], &m->non_zero_count_cache[4+8*3]); - AV_COPY64(&hc->non_zero_count[mb_x][24], &m->non_zero_count_cache[0+8*4]); -} - -static inline void write_back_motion(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){ - H264Mb *m = s->m; - const int mb_x = m->mb_x; - int b_stride = hc->b_stride; - const int b_x = 4*m->mb_x; //try mb2b(8)_xy - const int b8_x= 4*m->mb_x; - int list; - - if(!USES_LIST(mb_type, 0)) - fill_rectangle(&hc->ref_index[0][b8_x], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1); - - for(list=0; listlist_count; list++){ - int y; - int16_t (*mv_dst)[2]; - int16_t (*mv_src)[2]; - - if(!USES_LIST(mb_type, list)) - continue; - - mv_dst = &hc->motion_val[list][b_x]; - mv_src = &m->mv_cache[list][scan8[0]]; - for(y=0; y<4; y++){ - AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); - } - { - uint8_t (*mvd_dst)[2] = (void *) hc->mvd[list][8*mb_x]; - uint8_t (*mvd_src)[2] = 
&hc->mvd_cache[list][scan8[0]]; - if(IS_SKIP(mb_type)) - AV_ZERO128(mvd_dst); - else{ - AV_COPY64(mvd_dst, mvd_src + 8*3); - AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0); - AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1); - AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2); - } - } - - { - int8_t *ref_index = &hc->ref_index[list][b8_x]; - ref_index[0+0*2]= m->ref_cache[list][scan8[0]]; - ref_index[1+0*2]= m->ref_cache[list][scan8[4]]; - ref_index[0+1*2]= m->ref_cache[list][scan8[8]]; - ref_index[1+1*2]= m->ref_cache[list][scan8[12]]; - } - } - - if(s->slice_type_nos == FF_B_TYPE){ - if(IS_8X8(mb_type)){ - uint8_t *direct = &hc->direct[4*mb_x]; - direct[1] = m->sub_mb_type[1]>>1; - direct[2] = m->sub_mb_type[2]>>1; - direct[3] = m->sub_mb_type[3]>>1; - } - } -} - -static inline int get_dct8x8_allowed(EDSlice_spu *s){ - H264Mb *m = s->m; - if(s->direct_8x8_inference_flag) - return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL)); - else - return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL)); -} - -static inline int fetch_diagonal_mv(EDSlice_spu *s, const int16_t **C, int i, int list, int part_width){ - H264Mb *m = s->m; - const int topright_ref= m->ref_cache[list][ i - 8 + part_width ]; - - if(topright_ref != PART_NOT_AVAILABLE){ - *C= m->mv_cache[list][ i - 8 + part_width ]; - return topright_ref; - }else{ - *C= m->mv_cache[list][ i - 8 - 1 ]; - return m->ref_cache[list][ i - 8 - 1 ]; - } -} - -/** - * gets the predicted MV. 
- * @param n the block index - * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4) - * @param mx the x component of the predicted motion vector - * @param my the y component of the predicted motion vector - */ -static inline void pred_motion(EDSlice_spu *s, int n, int part_width, int list, int ref, int * const mx, int * const my){ - H264Mb *m = s->m; - const int index8= scan8[n]; - const int top_ref= m->ref_cache[list][ index8 - 8 ]; - const int left_ref= m->ref_cache[list][ index8 - 1 ]; - const int16_t * const A= m->mv_cache[list][ index8 - 1 ]; - const int16_t * const B= m->mv_cache[list][ index8 - 8 ]; - const int16_t * C; - int diagonal_ref, match_count; - - assert(part_width==1 || part_width==2 || part_width==4); - -/* mv_cache - B . . A T T T T - U . . L . . , . - U . . L . . . . - U . . L . . , . - . . . L . . . . -*/ - - diagonal_ref= fetch_diagonal_mv(s, &C, index8, list, part_width); - match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref); - - if(match_count > 1){ //most common - *mx= mid_pred(A[0], B[0], C[0]); - *my= mid_pred(A[1], B[1], C[1]); - }else if(match_count==1){ - if(left_ref==ref){ - *mx= A[0]; - *my= A[1]; - }else if(top_ref==ref){ - *mx= B[0]; - *my= B[1]; - }else{ - *mx= C[0]; - *my= C[1]; - } - }else{ - if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){ - *mx= A[0]; - *my= A[1]; - }else{ - *mx= mid_pred(A[0], B[0], C[0]); - *my= mid_pred(A[1], B[1], C[1]); - } - } - -} - -/** - * gets the directionally predicted 16x8 MV. 
- * @param n the block index - * @param mx the x component of the predicted motion vector - * @param my the y component of the predicted motion vector - */ -static inline void pred_16x8_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){ - H264Mb *m = s->m; - if(n==0){ - const int top_ref= m->ref_cache[list][ scan8[0] - 8 ]; - const int16_t * const B= m->mv_cache[list][ scan8[0] - 8 ]; - - if(top_ref == ref){ - *mx= B[0]; - *my= B[1]; - return; - } - }else{ - const int left_ref= m->ref_cache[list][ scan8[8] - 1 ]; - const int16_t * const A= m->mv_cache[list][ scan8[8] - 1 ]; - - if(left_ref == ref){ - *mx= A[0]; - *my= A[1]; - return; - } - } - - //RARE - pred_motion(s, n, 4, list, ref, mx, my); -} - -/** - * gets the directionally predicted 8x16 MV. - * @param n the block index - * @param mx the x component of the predicted motion vector - * @param my the y component of the predicted motion vector - */ -static inline void pred_8x16_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){ - H264Mb *m = s->m; - if(n==0){ - const int left_ref= m->ref_cache[list][ scan8[0] - 1 ]; - const int16_t * const A= m->mv_cache[list][ scan8[0] - 1 ]; - - if(left_ref == ref){ - *mx= A[0]; - *my= A[1]; - return; - } - }else{ - const int16_t * C; - int diagonal_ref; - - diagonal_ref= fetch_diagonal_mv(s, &C, scan8[4], list, 2); - if(diagonal_ref == ref){ - *mx= C[0]; - *my= C[1]; - return; - } - } - - //RARE - pred_motion(s, n, 2, list, ref, mx, my); -} - -static inline void pred_pskip_motion(EDSlice_spu *s, int * const mx, int * const my){ - H264Mb *m = s->m; - const int top_ref = m->ref_cache[0][ scan8[0] - 8 ]; - const int left_ref= m->ref_cache[0][ scan8[0] - 1 ]; - - if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE - || !( top_ref | AV_RN32A(m->mv_cache[0][ scan8[0] - 8 ])) - || !(left_ref | AV_RN32A(m->mv_cache[0][ scan8[0] - 1 ]))){ - - *mx = *my = 0; - return; - } - - pred_motion(s, 0, 4, 0, 0, mx, 
my); - - return; -} - -/** - * decodes a P_SKIP or B_SKIP macroblock - */ -static void decode_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s){ - H264Mb *m = s->m; - const int mb_x = m->mb_x; - int mb_type=0; - - memset(hc->non_zero_count[mb_x], 0, 32); - memset(m->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui - - if( s->slice_type_nos == FF_B_TYPE ) - { - // just for fill_caches. pred_direct_motion will set the real mb_type - mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP; - fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ... - - ff_h264_pred_direct_motion(hc, s, &mb_type); - mb_type|= MB_TYPE_SKIP; - } - else - { - int mx, my; - mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; - - fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ... - pred_pskip_motion(s, &mx, &my); - fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); - fill_rectangle( m->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4); - } - - write_back_motion(hc, s, mb_type); - hc->mb_type[mb_x]= mb_type; - m->mb_type = mb_type; - hc->qscale[mb_x]= s->qscale; - fill_filter_caches(hc, s, mb_type); -} - -static int decode_cabac_intra_mb_type(EDSlice_spu *s, CABACContext *c, int ctx_base, int intra_slice) { - H264Mb *m =s->m; - uint8_t *state= &c->cabac_state[ctx_base]; - int mb_type; - - if(intra_slice){ - int ctx=0; - if( m->left_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) - ctx++; - if( m->top_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) - ctx++; - if( get_cabac_noinline( c, &state[ctx] ) == 0 ) - return 0; /* I4x4 */ - state += 2; - }else{ - if( get_cabac_noinline( c, state ) == 0 ) - return 0; /* I4x4 */ - } - - if( get_cabac_terminate( c ) ) - return 25; /* PCM */ - - mb_type = 1; /* I16x16 */ - mb_type += 12 * get_cabac_noinline( c, &state[1] ); /* cbp_luma != 0 */ - if( get_cabac_noinline(c, &state[2] ) ) /* cbp_chroma */ - mb_type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice] ); - mb_type 
+= 2 * get_cabac_noinline(c, &state[3+intra_slice] ); - mb_type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice] ); - return mb_type; -} - -static int decode_cabac_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s, H264Mb *m, CABACContext *c) { - int ctx = 0; - const int mb_x = m->mb_x; - - if( m->mb_x>0 && !IS_SKIP( hc->mb_type[mb_x-1] )) - ctx++; - if( m->mb_y>0 && !IS_SKIP( hc->mb_type_top[mb_x] )) - ctx++; - - if( s->slice_type_nos == FF_B_TYPE ) - ctx += 13; - return get_cabac_noinline(c, &c->cabac_state[11+ctx] ); -} - -static int decode_cabac_mb_intra4x4_pred_mode( CABACContext *c, int pred_mode ) { - int mode = 0; - - if( get_cabac(c, &c->cabac_state[68] ) ) - return pred_mode; - - mode += 1 * get_cabac(c, &c->cabac_state[69] ); - mode += 2 * get_cabac(c, &c->cabac_state[69] ); - mode += 4 * get_cabac(c, &c->cabac_state[69] ); - - return mode + ( mode >= pred_mode ); -} - -static int decode_cabac_mb_chroma_pre_mode(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) { - H264Mb *m = s->m; - const int mb_x = m->mb_x; - - int ctx = 0; - - /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */ - if( m->left_type && hc->chroma_pred_mode[mb_x-1] != 0 ) - ctx++; - - if( m->top_type && hc->chroma_pred_mode_top[mb_x] != 0 ) - ctx++; - - if( get_cabac_noinline(c, &c->cabac_state[64+ctx] ) == 0 ) - return 0; - - if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) - return 1; - if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) - return 2; - else - return 3; -} - -static int decode_cabac_mb_cbp_luma(H264Cabac_spu *hc, CABACContext *c) { - int cbp_b, cbp_a, ctx, cbp = 0; - - cbp_a = hc->left_cbp; - cbp_b = hc->top_cbp; - - ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04); - cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]); - ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08); - cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 1; - ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01); - cbp += get_cabac_noinline(c, &c->cabac_state[73 + 
ctx]) << 2; - ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02); - cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 3; - return cbp; -} -static int decode_cabac_mb_cbp_chroma(H264Cabac_spu *hc, CABACContext *c) { - int ctx; - int cbp_a, cbp_b; - - cbp_a = (hc->left_cbp>>4)&0x03; - cbp_b = (hc-> top_cbp>>4)&0x03; - - ctx = 0; - if( cbp_a > 0 ) ctx++; - if( cbp_b > 0 ) ctx += 2; - if( get_cabac_noinline(c, &c->cabac_state[77 + ctx] ) == 0 ) - return 0; - - ctx = 4; - if( cbp_a == 2 ) ctx++; - if( cbp_b == 2 ) ctx += 2; - return 1 + get_cabac_noinline(c, &c->cabac_state[77 + ctx] ); -} - -static int decode_cabac_p_mb_sub_type( CABACContext *c) { - if( get_cabac(c, &c->cabac_state[21] ) ) - return 0; /* 8x8 */ - if( !get_cabac(c, &c->cabac_state[22] ) ) - return 1; /* 8x4 */ - if( get_cabac(c, &c->cabac_state[23] ) ) - return 2; /* 4x8 */ - return 3; /* 4x4 */ -} -static int decode_cabac_b_mb_sub_type(CABACContext *c) { - int type; - if( !get_cabac(c, &c->cabac_state[36] ) ) - return 0; /* B_Direct_8x8 */ - if( !get_cabac(c, &c->cabac_state[37] ) ) - return 1 + get_cabac(c, &c->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */ - type = 3; - if( get_cabac(c, &c->cabac_state[38] ) ) { - if( get_cabac(c, &c->cabac_state[39] ) ) - return 11 + get_cabac(c, &c->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */ - type += 4; - } - type += 2*get_cabac(c, &c->cabac_state[39] ); - type += get_cabac(c, &c->cabac_state[39] ); - return type; -} - -static int decode_cabac_mb_ref(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, int list, int n ) { - H264Mb *m = s->m; - int refa = m->ref_cache[list][scan8[n] - 1]; - int refb = m->ref_cache[list][scan8[n] - 8]; - int ref = 0; - int ctx = 0; - - if( s->slice_type_nos == FF_B_TYPE) { - if( refa > 0 && !(hc->direct_cache[scan8[n] - 1]&(MB_TYPE_DIRECT2>>1)) ) - ctx++; - if( refb > 0 && !(hc->direct_cache[scan8[n] - 8]&(MB_TYPE_DIRECT2>>1)) ) - ctx += 2; - } else { - if( refa > 0 ) - ctx++; - if( refb > 0 ) - ctx += 2; - } - - while( get_cabac(c, 
&c->cabac_state[54+ctx] ) ) { - ref++; - ctx = (ctx>>2)+4; - if(ref >= 32 /*h->ref_list[list]*/){ - fprintf(stderr, "refcount %d\n", ref); - return -1; - } - } - return ref; -} - -static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) { - int mvd; - - if(!get_cabac(c, &c->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){ -// if(!get_cabac(&h->cabac, &c->cabac_state[ctxbase+(amvd>2)+(amvd>32)])){ - *mvda= 0; - return 0; - } - - mvd= 1; - ctxbase+= 3; - while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) { - if( mvd < 4 ) - ctxbase++; - mvd++; - } - - if( mvd >= 9 ) { - int k = 3; - while( get_cabac_bypass(c ) ) { - mvd += 1 << k; - k++; - if(k>24){ - fprintf(stderr, "overflow in decode_cabac_mb_mvd\n"); - return INT_MIN; - } - } - while( k-- ) { - mvd += get_cabac_bypass(c )<mvd_cache[list][scan8[n] - 1][0] +\ - hc->mvd_cache[list][scan8[n] - 8][0];\ - int amvd1 = hc->mvd_cache[list][scan8[n] - 1][1] +\ - hc->mvd_cache[list][scan8[n] - 8][1];\ -\ - mx += decode_cabac_mb_mvd( c, 40, amvd0, &mpx );\ - my += decode_cabac_mb_mvd( c, 47, amvd1, &mpy );\ -} - -static av_always_inline int get_cabac_cbf_ctx(H264Cabac_spu *hc, EDSlice_spu *s, int cat, int idx, int is_dc ) { - H264Mb *m = s->m; - int nza, nzb; - int ctx = 0; - - if( is_dc ) { - if( cat == 0 ) { - nza = hc->left_cbp&0x100; - nzb = hc-> top_cbp&0x100; - } else { - nza = (hc->left_cbp>>(6+idx))&0x01; - nzb = (hc-> top_cbp>>(6+idx))&0x01; - } - } else { - assert(cat == 1 || cat == 2 || cat == 4); - nza = m->non_zero_count_cache[scan8[idx] - 1]; - nzb = m->non_zero_count_cache[scan8[idx] - 8]; - } - - if( nza > 0 ) - ctx++; - - if( nzb > 0 ) - ctx += 2; - - return ctx + 4 * cat; -} - - uint8_t last_coeff_flag_offset_8x8[63] = { - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 -}; - -static const int 
significant_coeff_flag_offset[2][6] = { - { 105+0, 105+15, 105+29, 105+44, 105+47, 402 }, - { 277+0, 277+15, 277+29, 277+44, 277+47, 436 } -}; -static const int last_coeff_flag_offset[2][6] = { - { 166+0, 166+15, 166+29, 166+44, 166+47, 417 }, - { 338+0, 338+15, 338+29, 338+44, 338+47, 451 } -}; -static const int coeff_abs_level_m1_offset[6] = { - 227+0, 227+10, 227+20, 227+30, 227+39, 426 -}; -static const uint8_t significant_coeff_flag_offset_8x8[2][63] = { - { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, - 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, - 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11, - 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 }, - { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5, - 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11, - 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9, - 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 } -}; -/* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). -* 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). -* map node ctx => cabac ctx for level=1 */ -static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; -/* map node ctx => cabac ctx for level>1 */ -static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; -static const uint8_t coeff_abs_level_transition[2][8] = { - /* update node ctx after decoding a level=1 */ - { 1, 2, 3, 3, 4, 5, 6, 7 }, - /* update node ctx after decoding a level>1 */ - { 4, 4, 4, 4, 5, 6, 7, 7 } -}; - -static av_always_inline void decode_cabac_residual_internal(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) { - H264Mb *m = s->m; - const int mb_x = m->mb_x; - int index[64]; - - int av_unused last; - int coeff_count = 0; - int node_ctx = 0; - - uint8_t *significant_coeff_ctx_base; - uint8_t *last_coeff_ctx_base; - uint8_t *abs_level_m1_ctx_base; - - /* read coded block flag */ - if( is_dc || cat != 5 ) { - if( get_cabac( c, 
&c->cabac_state[85 + get_cabac_cbf_ctx( hc, s, cat, n, is_dc ) ] ) == 0 ) { - if( !is_dc ) - m->non_zero_count_cache[scan8[n]] = 0; - return; - } - } - - significant_coeff_ctx_base = c->cabac_state - + significant_coeff_flag_offset[0][cat]; - last_coeff_ctx_base = c->cabac_state - + last_coeff_flag_offset[0][cat]; - abs_level_m1_ctx_base = c->cabac_state - + coeff_abs_level_m1_offset[cat]; - - if( !is_dc && cat == 5 ) { -#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \ - for(last= 0; last < coefs; last++) { \ - uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \ - if( get_cabac( c, sig_ctx )) { \ - uint8_t *last_ctx = last_coeff_ctx_base + last_off; \ - index[coeff_count++] = last; \ - if( get_cabac( c, last_ctx ) ) { \ - last= max_coeff; \ - break; \ - } \ - } \ - }\ - if( last == max_coeff -1 ) {\ - index[coeff_count++] = last;\ - }\ - - const uint8_t *sig_off = significant_coeff_flag_offset_8x8[0]; - DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] ); - } else { - DECODE_SIGNIFICANCE( max_coeff - 1, last, last ); - } - assert(coeff_count > 0); - - if( is_dc ) { - if( cat == 0 ) - hc->cbp[mb_x] |= 0x100; - else - hc->cbp[mb_x] |= 0x40 << n; - } else { - if( cat == 5 ) - fill_rectangle(&m->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1); - else { - assert( cat == 1 || cat == 2 || cat == 4 ); - m->non_zero_count_cache[scan8[n]] = coeff_count; - } - } - - do { - uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base; - int j= scantable[index[--coeff_count]]; - - if( get_cabac( c, ctx ) == 0 ) { - node_ctx = coeff_abs_level_transition[0][node_ctx]; - if( is_dc ) { - block[j] = get_cabac_bypass_sign( c, -1); - }else{ - block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6; - } - } else { - int coeff_abs = 2; - ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; - node_ctx = coeff_abs_level_transition[1][node_ctx]; - - while( coeff_abs < 15 && get_cabac( c, ctx ) ) { - coeff_abs++; - } - - 
if( coeff_abs >= 15 ) { - int j = 0; - while( get_cabac_bypass( c ) ) { - j++; - } - - coeff_abs=1; - while( j-- ) { - coeff_abs += coeff_abs + get_cabac_bypass( c ); - } - coeff_abs+= 14; - } - - if( is_dc ) { - block[j] = get_cabac_bypass_sign( c, -coeff_abs ); - }else{ - block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6; - } - } - } while( coeff_count ); - -} - -static void decode_cabac_residual_dc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) { - decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, NULL, max_coeff, 1); -} - -static void decode_cabac_residual_nondc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { - decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, qmul, max_coeff, 0); -} - -/** - * decodes a macroblock - * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed - */ -int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) { - H264Mb *m = s->m; - int mb_x = m->mb_x; - int mb_type, partition_count, cbp = 0; - int dct8x8_allowed= s->pps.transform_8x8_mode; - - fill_decode_neighbors(hc, s); - memset(m->mb, 0 , sizeof(m->mb)); - - if( s->slice_type_nos != FF_I_TYPE ) { - int skip; - /* a skipped mb needs the aff flag from the following mb */ - skip = decode_cabac_mb_skip( hc, s, m, c); - - /* read skip flags */ - if( skip ) { - decode_mb_skip(hc, s); - hc->cbp[mb_x] = m->cbp = 0; - hc->chroma_pred_mode[mb_x] = 0; - s->last_qscale_diff = 0; - return 0; - } - } - - if( s->slice_type_nos == FF_B_TYPE ) { - int ctx = 0; - - if( !IS_DIRECT( m->left_type-1 ) ) - ctx++; - if( !IS_DIRECT( m->top_type-1 ) ) - ctx++; - - if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){ - mb_type= 0; /* B_Direct_16x16 */ - }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) { - mb_type= 1 + 
get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */ - }else{ - int bits; - bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3; - bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2; - bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1; - bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ); - if( bits < 8 ){ - mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */ - }else if( bits == 13 ){ - mb_type= decode_cabac_intra_mb_type(s, c, 32, 0); - goto decode_intra_mb; - }else if( bits == 14 ){ - mb_type= 11; /* B_L1_L0_8x16 */ - }else if( bits == 15 ){ - mb_type= 22; /* B_8x8 */ - }else{ - bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] ); - mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */ - } - } - partition_count= b_mb_type_info[mb_type].partition_count; - mb_type= b_mb_type_info[mb_type].type; - } else if( s->slice_type_nos == FF_P_TYPE ) { - if( get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) { - /* P-type */ - if( get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) { - /* P_L0_D16x16, P_8x8 */ - mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] ); - } else { - /* P_L0_D8x16, P_L0_D16x8 */ - mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] ); - } - partition_count= p_mb_type_info[mb_type].partition_count; - mb_type= p_mb_type_info[mb_type].type; - } else { - mb_type= decode_cabac_intra_mb_type(s, c, 17, 0); - goto decode_intra_mb; - } - } else { - mb_type= decode_cabac_intra_mb_type(s ,c, 3, 1); - if(s->slice_type == FF_SI_TYPE && mb_type) - mb_type--; - assert(s->slice_type_nos == FF_I_TYPE); -decode_intra_mb: - partition_count = 0; - cbp= i_mb_type_info[mb_type].cbp; - m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode; - mb_type= i_mb_type_info[mb_type].type; - } - - if(IS_INTRA_PCM(mb_type)) { - uint8_t *ptr; - // We assume these blocks are very rare so we do not optimize it. 
- // FIXME The two following lines get the bitstream position in the cabac - // decode, I think it should be done by a function in cabac.h (or cabac.c). - ptr=c->bytestream; - if(c->low&0x1) ptr--; - if(CABAC_BITS==16){ - if(c->low&0x1FF) ptr--; - } - if ((unsigned) (ptr + 384) >= (unsigned) c->bytestream_end){ - fprintf(stderr, "Intra PCM mb crossed bytestream buffer\n Known issue."); - } - - // The pixels are stored in the same order as levels in h->mb array. - memcpy(m->mb, ptr, 256); ptr+=256; - memcpy(m->mb+128, ptr, 128); ptr+=128; - - c->bytestream = ptr; - #if CABAC_BITS == 16 - c->low = (*c->bytestream++)<<18; - c->low+= (*c->bytestream++)<<10; - #else - c->low = (*c->bytestream++)<<10; - #endif - c->low+= ((*c->bytestream++)<<2) + 2; - c->range= 0x1FE; - - // All blocks are present - hc->cbp[mb_x] = 0x1ef; - hc->chroma_pred_mode[mb_x] = 0; - // In deblocking, the quantizer is 0 - hc->qscale[mb_x]= 0; - // All coeffs are present - memset(hc->non_zero_count[mb_x], 16, 32); - hc->mb_type[mb_x]= m->mb_type = mb_type; - s->last_qscale_diff = 0; - fill_filter_caches(hc, s, mb_type); - return 0; - } - fill_decode_caches(hc, s, mb_type); - - if( IS_INTRA( mb_type ) ) { - int i, pred_mode; - if( IS_INTRA4x4( mb_type ) ) { - if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] ) ) { - mb_type |= MB_TYPE_8x8DCT; - for( i = 0; i < 16; i+=4 ) { - int pred = pred_intra_mode( s, i ); - int mode = decode_cabac_mb_intra4x4_pred_mode(c, pred ); - fill_rectangle( &m->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 ); - } - } else { - for( i = 0; i < 16; i++ ) { - int pred = pred_intra_mode( s, i ); - m->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode(c, pred ); - } - } - write_back_intra_pred_mode(hc, s); - if( check_intra4x4_pred_mode(s) < 0 ) return -1; - } else { - m->intra16x16_pred_mode= check_intra_pred_mode(s, m->intra16x16_pred_mode ); - if( m->intra16x16_pred_mode < 0 ) return -1; - } - - 
hc->chroma_pred_mode[mb_x] = - pred_mode = decode_cabac_mb_chroma_pre_mode( hc, s, c ); - - pred_mode= check_intra_pred_mode( s, pred_mode ); - if( pred_mode < 0 ) return -1; - m->chroma_pred_mode= pred_mode; - - } else if( partition_count == 4 ) { - int i, j, sub_partition_count[4], list, ref[2][4]; - - if( s->slice_type_nos == FF_B_TYPE ) { - for( i = 0; i < 4; i++ ) { - m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c ); - sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count; - m->sub_mb_type[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].type; - } - if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] | - m->sub_mb_type[2] | m->sub_mb_type[3]) ) { - ff_h264_pred_direct_motion(hc, s, &mb_type); - m->ref_cache[0][scan8[4]] = - m->ref_cache[1][scan8[4]] = - m->ref_cache[0][scan8[12]] = - m->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE; - for( i = 0; i < 4; i++ ) - fill_rectangle( &hc->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 ); - } - } else { - for( i = 0; i < 4; i++ ) { - m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c ); - sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count; - m->sub_mb_type[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].type; - } - } - - for( list = 0; list < s->list_count; list++ ) { - for( i = 0; i < 4; i++ ) { - if(IS_DIRECT(m->sub_mb_type[i])) continue; - if(IS_DIR(m->sub_mb_type[i], 0, list)){ - if( s->ref_count[list] > 1 ){ - ref[list][i] = decode_cabac_mb_ref(hc, s, c, list, 4*i ); - if(ref[list][i] >= s->ref_count[list]){ - fprintf(stderr, "Reference %d >= %d\n", ref[list][i], s->ref_count[list]); - return -1; - } - }else - ref[list][i] = 0; - } else { - ref[list][i] = -1; - } - m->ref_cache[list][ scan8[4*i]+1 ]= - m->ref_cache[list][ scan8[4*i]+8 ]=m->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i]; - } - } - - if(dct8x8_allowed) - dct8x8_allowed = get_dct8x8_allowed(s); - - for(list=0; listlist_count; list++){ - for(i=0; i<4; i++){ - m->ref_cache[list][ 
scan8[4*i] ]=m->ref_cache[list][ scan8[4*i]+1 ]; - if(IS_DIRECT(m->sub_mb_type[i])){ - fill_rectangle(hc->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2); - continue; - } - - if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){ - const int sub_mb_type= m->sub_mb_type[i]; - const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1; - for(j=0; jmv_cache[list][ scan8[index]]; - uint8_t (* mvd_cache)[2]= &hc->mvd_cache[list][ scan8[index]]; - pred_motion(s, index, block_width, list, m->ref_cache[list][ scan8[index] ], &mx, &my); - DECODE_CABAC_MB_MVD( hc, c, list, index) - - if(IS_SUB_8X8(sub_mb_type)){ - mv_cache[ 1 ][0]= - mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx; - mv_cache[ 1 ][1]= - mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my; - - mvd_cache[ 1 ][0]= - mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx; - mvd_cache[ 1 ][1]= - mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy; - }else if(IS_SUB_8X4(sub_mb_type)){ - mv_cache[ 1 ][0]= mx; - mv_cache[ 1 ][1]= my; - - mvd_cache[ 1 ][0]= mpx; - mvd_cache[ 1 ][1]= mpy; - }else if(IS_SUB_4X8(sub_mb_type)){ - mv_cache[ 8 ][0]= mx; - mv_cache[ 8 ][1]= my; - - mvd_cache[ 8 ][0]= mpx; - mvd_cache[ 8 ][1]= mpy; - } - mv_cache[ 0 ][0]= mx; - mv_cache[ 0 ][1]= my; - - mvd_cache[ 0 ][0]= mpx; - mvd_cache[ 0 ][1]= mpy; - } - }else{ - fill_rectangle(m->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4); - fill_rectangle(hc->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2); - } - } - } - } else if( IS_DIRECT(mb_type) ) { - ff_h264_pred_direct_motion(hc, s, &mb_type); - fill_rectangle(hc->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2); - fill_rectangle(hc->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2); - dct8x8_allowed &= s->direct_8x8_inference_flag; - } else { - int list, i; - if(IS_16X16(mb_type)){ - for(list=0; listlist_count; list++){ - if(IS_DIR(mb_type, 0, list)){ - int ref; - if(s->ref_count[list] > 1){ - ref= decode_cabac_mb_ref(hc, s, c, list, 0); - if(ref >= s->ref_count[list]){ - fprintf(stderr, "Reference %d >= %d\n", ref, 
s->ref_count[list]); - return -1; - } - }else - ref=0; - fill_rectangle(&m->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1); - } - } - for(list=0; listlist_count; list++){ - if(IS_DIR(mb_type, 0, list)){ - int mx,my,mpx,mpy; - pred_motion(s, 0, 4, list, m->ref_cache[list][ scan8[0] ], &mx, &my); - DECODE_CABAC_MB_MVD( hc, c, list, 0) - - fill_rectangle(hc->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2); - fill_rectangle(m->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4); - } - - } - } - else if(IS_16X8(mb_type)){ - for(list=0; listlist_count; list++){ - for(i=0; i<2; i++){ - if(IS_DIR(mb_type, i, list)){ - int ref; - if(s->ref_count[list] > 1){ - ref= decode_cabac_mb_ref(hc, s, c, list, 8*i ); - if(ref >= s->ref_count[list]){ - fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]); - return -1; - } - }else - ref=0; - fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1); - }else - fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1); - } - } - for(list=0; listlist_count; list++){ - for(i=0; i<2; i++){ - if(IS_DIR(mb_type, i, list)){ - int mx,my,mpx,mpy; - pred_16x8_motion(s, 8*i, list, m->ref_cache[list][scan8[0] + 16*i], &mx, &my); - DECODE_CABAC_MB_MVD( hc, c, list, 8*i) - - fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2); - fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4); - }else{ - fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2); - fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4); - } - } - } - }else{ - assert(IS_8X16(mb_type)); - for(list=0; listlist_count; list++){ - for(i=0; i<2; i++){ - if(IS_DIR(mb_type, i, list)){ //FIXME optimize - int ref; - if(s->ref_count[list] > 1){ - ref= decode_cabac_mb_ref(hc, s, c, list, 4*i ); - if(ref >= s->ref_count[list]){ - fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]); - return -1; - } - }else - ref=0; - 
fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1); - }else - fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1); - } - } - for(list=0; listlist_count; list++){ - for(i=0; i<2; i++){ - if(IS_DIR(mb_type, i, list)){ - int mx,my,mpx,mpy; - pred_8x16_motion( s, i*4, list, m->ref_cache[list][ scan8[0] + 2*i ], &mx, &my); - DECODE_CABAC_MB_MVD( hc, c, list, 4*i) - - fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2); - fill_rectangle(m->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4); - }else{ - fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2); - fill_rectangle(m-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4); - } - } - } - } - } - - if( IS_INTER( mb_type ) ) { - hc->chroma_pred_mode[mb_x] = 0; - write_back_motion( hc, s, mb_type ); - } - - if( !IS_INTRA16x16( mb_type ) ) { - cbp = decode_cabac_mb_cbp_luma( hc, c); - cbp |= decode_cabac_mb_cbp_chroma( hc, c ) << 4; - } - - hc->cbp[mb_x] = m->cbp = cbp; - if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) { - mb_type |= MB_TYPE_8x8DCT * get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] ); - } - - if( cbp || IS_INTRA16x16( mb_type ) ) { - const uint8_t *scan, *scan8x8, *dc_scan; - const uint32_t *qmul; - - if (s->transform_bypass && s->qscale){ - scan8x8= ff_zigzag_direct; - scan= zigzag_scan; - }else{ - scan8x8= hc->zigzag_scan8x8; - scan= hc->zigzag_scan; - } - dc_scan= luma_dc_zigzag_scan; - - // decode_cabac_mb_dqp - if(get_cabac_noinline(c, &c->cabac_state[60 + (s->last_qscale_diff != 0)])){ - int val = 1; - int ctx= 2; - - while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) { - ctx= 3; - val++; - if(val > 102){ //prevent infinite loop - fprintf(stderr, "cabac decode of qscale diff failed at %d %d (%d)\n", m->mb_x, m->mb_y, val); - return -1; - } - } - - if( val&0x01 ) - val= (val + 1)>>1 ; - else - val= -((val + 1)>>1); - s->last_qscale_diff = val; - 
s->qscale += val; - if(((unsigned)s->qscale) > 51){ - if(s->qscale<0) s->qscale+= 52; - else s->qscale-= 52; - } - s->chroma_qp[0] = s->pps.chroma_qp_table[0][s->qscale]; - s->chroma_qp[1] = s->pps.chroma_qp_table[1][s->qscale]; - }else - s->last_qscale_diff=0; - - if( IS_INTRA16x16( mb_type ) ) { - int i; - decode_cabac_residual_dc( hc, s, c, m->mb, 0, 0, dc_scan, 16); - - if( cbp&15 ) { - qmul = hc->dequant4_coeff[0][s->qscale]; - for( i = 0; i < 16; i++ ) { - decode_cabac_residual_nondc( hc, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15); - } - } else { - fill_rectangle(&m->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1); - } - } else { - int i8x8, i4x4; - for( i8x8 = 0; i8x8 < 4; i8x8++ ) { - if( cbp & (1<mb + 64*i8x8, 5, 4*i8x8, - scan8x8, hc->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64); - } else { - qmul = hc->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale]; - for( i4x4 = 0; i4x4 < 4; i4x4++ ) { - const int index = 4*i8x8 + i4x4; -//START_TIMER - decode_cabac_residual_nondc(hc, s, c, m->mb + 16*index, 2, index, scan, qmul, 16); -//STOP_TIMER("decode_residual") - } - } - } else { - uint8_t * const nnz= &m->non_zero_count_cache[ scan8[4*i8x8] ]; - nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0; - } - } - } - - if( cbp&0x30 ){ - int i; - for( i = 0; i < 2; i++ ) { - decode_cabac_residual_dc(hc, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4); - } - } - - if( cbp&0x20 ) { - int i, j; - for( i = 0; i < 2; i++ ) { - qmul = hc->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 
0:3)][s->chroma_qp[i]]; - for( j = 0; j < 4; j++ ) { - const int index = 16 + 4 * i + j; - decode_cabac_residual_nondc( hc, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15); - } - } - } else { - uint8_t * const nnz= &m->non_zero_count_cache[0]; - nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = - nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; - } - } else { - uint8_t * const nnz= &m->non_zero_count_cache[0]; - fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1); - nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = - nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; - s->last_qscale_diff = 0; - } - hc->mb_type[mb_x]= m->mb_type = mb_type; - hc->qscale[mb_x]= s->qscale; - write_back_non_zero_count(hc, s); - fill_filter_caches(hc, s, mb_type); - - return 0; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,17 +0,0 @@ -#ifndef H264_CABAC_H -#define H264_CABAC_H - -#define CELL_SPE -#include "libavcodec/avcodec.h" -#include "h264_types_spu.h" -#include "cabac_spu.h" - - -/** - * decodes a CABAC coded macroblock - * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed - */ -int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c); -void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_chroma_template_spu.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_chroma_template_spu.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,355 +0,0 @@ -static void PREFIX_h264_chroma_mc8_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) { - - register int i; - - const int16_t i32ss= 32; - 
const int16_t imax = 255; - const int16_t iABCD1 = ((8 - x) * (8 - y)); - const int16_t iABCD2 = ((x) * (8 - y)); - const int16_t iABCD3 = ((8 - x) * (y)); - const int16_t iABCD4 = ((x) * (y)); - - const vsint16_t vA = spu_splats(iABCD1); - const vsint16_t vB = spu_splats(iABCD2); - const vsint16_t vC = spu_splats(iABCD3); - const vsint16_t vD = spu_splats(iABCD4); - const vsint32_t vzero = spu_splats(0); - const vsint16_t v32ss = spu_splats(i32ss); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const int shift_src =(unsigned int) src & 15; - const int shift_dst =(unsigned int) dst & 15; - const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; - vuint8_t dstmask; - - if(shift_dst==0) - dstmask=dstmask0; - else - dstmask=dstmask8; - - vuint8_t vsrc0uc1; - vuint8_t vsrc0uc2; - vuint8_t vsrc0uc; - vuint8_t vsrc1uc; - vsrc0uc1 = *(vuint8_t *)(src); - vsrc0uc2 = *(vuint8_t *)(src+16); - vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16)); - vsrc1uc = spu_slqwbyte(vsrc0uc, 1); - - vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh); - vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh); - - for (i = 0 ; i < h ; i++) { - - vuint8_t vsrc2uc1; - vuint8_t vsrc2uc2; - vuint8_t vsrc2uc; - vuint8_t vsrc3uc; - vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C); - vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16); - vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16)); - 
vsrc3uc = spu_slqwbyte(vsrc2uc, 1); - - vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh); - vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh); - - vsint16_t psum; - - vsint32_t psum1 = spu_mule(vsrc0ssH, vA); - vsint32_t psum2 = spu_mulo(vsrc0ssH, vA); - psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - - psum1 = spu_mule(vsrc1ssH, vB); - psum2 = spu_mulo(vsrc1ssH, vB); - vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - psum = spu_add(psum3, psum); - - psum1 = spu_mule(vsrc2ssH, vC); - psum2 = spu_mulo(vsrc2ssH, vC); - psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - psum = spu_add(psum3, psum); - - psum1 = spu_mule(vsrc3ssH, vD); - psum2 = spu_mulo(vsrc3ssH, vD); - psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - psum = spu_add(psum3, psum); - - psum = spu_add(v32ss, psum); - psum = spu_rlmask(psum, -6); - - //Saturation from 0 to 255 - sat = spu_cmpgt(psum,(vsint16_t)vzero); - psum = spu_and(psum,(vsint16_t)sat); - sat = spu_cmpgt(psum,vmax); - psum = spu_sel(psum,vmax,sat); - - const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu); - - const vuint8_t dst1 = *(vuint8_t *)dst; - - const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask); - vuint8_t fsum; - OP_U8_SPU(fsum, dsum, dst1); - - *(vuint8_t *)dst=fsum; - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += dst_stride; - //src += src_stride; - src += STRIDE_C; - } -} - -static void PREFIX_h264_chroma_mc4_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) { - - register int i; - - const int16_t i32ss= 32; - const int16_t imax = 255; - const int16_t iABCD1 = ((8 - x) * (8 - y)); - const int16_t iABCD2 = ((x) * (8 - y)); - const int16_t iABCD3 = ((8 - x) * (y)); - const int16_t iABCD4 = ((x) * (y)); - - const vsint16_t vA = spu_splats(iABCD1); - const vsint16_t vB = spu_splats(iABCD2); - const vsint16_t vC = 
spu_splats(iABCD3); - const vsint16_t vD = spu_splats(iABCD4); - const vsint32_t vzero = spu_splats(0); - const vsint16_t v32ss = spu_splats(i32ss); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - - const int shift_src = (unsigned int) src & 15; - const int shift_dst = (unsigned int) dst & 15; - vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; - - switch(shift_dst){ - case 0: dstmask = dstmask0; - break; - case 4: dstmask = dstmask4; - break; - case 8: dstmask = dstmask8; - break; - case 12: dstmask = dstmask12; - break; - } - - vuint8_t vsrc0uc1; - vuint8_t vsrc0uc2; - vuint8_t vsrc0uc; - vuint8_t vsrc1uc; - vsrc0uc1 = *(vuint8_t *)(src); - vsrc0uc2 = *(vuint8_t *)(src+16); - vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16)); - vsrc1uc = spu_slqwbyte(vsrc0uc, 1); - - vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh); - vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh); - - for (i = 0 ; i < h ; i++) { - - vuint8_t vsrc2uc1; - vuint8_t vsrc2uc2; - vuint8_t vsrc2uc; - vuint8_t vsrc3uc; - vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C); - vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16); - vsrc2uc = 
spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16)); - vsrc3uc = spu_slqwbyte(vsrc2uc, 1); - - vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh); - vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh); - - vsint16_t psum; - - vsint32_t psum1 = spu_mule(vsrc0ssH, vA); - vsint32_t psum2 = spu_mulo(vsrc0ssH, vA); - psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - - psum1 = spu_mule(vsrc1ssH, vB); - psum2 = spu_mulo(vsrc1ssH, vB); - vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - psum = spu_add(psum3, psum); - - psum1 = spu_mule(vsrc2ssH, vC); - psum2 = spu_mulo(vsrc2ssH, vC); - psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - psum = spu_add(psum3, psum); - - psum1 = spu_mule(vsrc3ssH, vD); - psum2 = spu_mulo(vsrc3ssH, vD); - psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - psum = spu_add(psum3, psum); - - psum = spu_add(v32ss, psum); - psum = spu_rlmask(psum, -6); - - //Saturation from 0 to 255 - sat = spu_cmpgt(psum,(vsint16_t)vzero); - psum = spu_and(psum,(vsint16_t)sat); - sat = spu_cmpgt(psum,vmax); - psum = spu_sel(psum,vmax,sat); - - const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu); - - const vuint8_t dst1 = *(vuint8_t *)dst; - - const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask); - vuint8_t fsum; - OP_U8_SPU(fsum, dsum, dst1); - - *(vuint8_t *)dst=fsum; - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += dst_stride; - src += STRIDE_C; - } -} - -static void PREFIX_h264_chroma_mc2_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) { - - register int i; - - const int16_t i32ss= 32; - const int16_t imax = 255; - const int16_t iABCD1 = ((8 - x) * (8 - y)); - const int16_t iABCD2 = ((x) * (8 - y)); - const int16_t iABCD3 = ((8 - x) * (y)); - const int16_t iABCD4 = ((x) * (y)); - - const vsint16_t vA = spu_splats(iABCD1); - const 
vsint16_t vB = spu_splats(iABCD2); - const vsint16_t vC = spu_splats(iABCD3); - const vsint16_t vD = spu_splats(iABCD4); - const vsint32_t vzero = spu_splats(0); - const vsint16_t v32ss = spu_splats(i32ss); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - - const int shift_src = (unsigned int) src & 15; - const int shift_dst = (unsigned int) dst & 15; - vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - const vuint8_t dstmask0= {0x10,0x11,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dstmask2= {0x00,0x01,0x10,0x11,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dstmask6= {0x00,0x01,0x02,0x03,0x04,0x05,0x10,0x11,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dstmask10= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x10,0x11,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x0E,0x0F}; - const vuint8_t dstmask14= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x10,0x11}; - - switch(shift_dst){ - case 0: dstmask = dstmask0; - break; - case 2: dstmask = dstmask2; - break; - case 4: dstmask = dstmask4; - break; - case 6: dstmask = dstmask6; - break; - case 8: dstmask = dstmask8; - break; - case 10: dstmask = dstmask10; - break; - case 12: dstmask = dstmask12; - break; - case 14: dstmask = dstmask14; - break; 
- } - - vuint8_t vsrc0uc1; - vuint8_t vsrc0uc2; - vuint8_t vsrc0uc; - vuint8_t vsrc1uc; - vsrc0uc1 = *(vuint8_t *)(src); - vsrc0uc2 = *(vuint8_t *)(src+16); - vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16)); - vsrc1uc = spu_slqwbyte(vsrc0uc, 1); - - vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh); - vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh); - - for (i = 0 ; i < h ; i++) { - - vuint8_t vsrc2uc1; - vuint8_t vsrc2uc2; - vuint8_t vsrc2uc; - vuint8_t vsrc3uc; - vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C); - vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16); - vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16)); - vsrc3uc = spu_slqwbyte(vsrc2uc, 1); - - vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh); - vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh); - - vsint16_t psum; - - vsint32_t psum1 = spu_mule(vsrc0ssH, vA); - vsint32_t psum2 = spu_mulo(vsrc0ssH, vA); - psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - - psum1 = spu_mule(vsrc1ssH, vB); - psum2 = spu_mulo(vsrc1ssH, vB); - vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - psum = spu_add(psum3, psum); - - psum1 = spu_mule(vsrc2ssH, vC); - psum2 = spu_mulo(vsrc2ssH, vC); - psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - psum = spu_add(psum3, psum); - - psum1 = spu_mule(vsrc3ssH, vD); - psum2 = spu_mulo(vsrc3ssH, vD); - psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); - psum = spu_add(psum3, psum); - - psum = spu_add(v32ss, psum); - psum = spu_rlmask(psum, -6); - - //Saturation from 0 to 255 - sat = spu_cmpgt(psum,(vsint16_t)vzero); - psum = spu_and(psum,(vsint16_t)sat); - sat = spu_cmpgt(psum,vmax); - psum = spu_sel(psum,vmax,sat); - - const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu); - - const vuint8_t dst1 = 
*(vuint8_t *)dst; - - const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask); - vuint8_t fsum; - OP_U8_SPU(fsum, dsum, dst1); - - *(vuint8_t *)dst=fsum; - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += dst_stride; - src += STRIDE_C; - } -} - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2009 TUDelft - * - * Cell Parallel SPU - 2DWave Macroblock Decoding. - */ - -/** - * @file libavcodec/cell/spu/h264_main_spu.c - * Cell Parallel SPU - 2DWave Macroblock Decoding - * @author C C Chi - * - * SIMD kernels - * H.264/AVC motion compensation - * @author Mauricio Alvarez - * @author Albert Paradis - */ - -#include "h264_deblock_spu.h" -#include "h264_decode_mb_spu.h" - -extern int print_debug; - -static void filter_mb_edgev( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { - H264slice *s= h->s; - const int index_a = qp + s->slice_alpha_c0_offset; - const int alpha = alpha_table[index_a]; - const int beta = beta_table[qp + s->slice_beta_offset]; - if (alpha ==0 || beta == 0) return; - - if( bS[0] < 4 ) { - int8_t tc[4]; - tc[0] = tc0_table[index_a][bS[0]]; - tc[1] = tc0_table[index_a][bS[1]]; - tc[2] = tc0_table[index_a][bS[2]]; - tc[3] = tc0_table[index_a][bS[3]]; - - h->dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc); - } else { - h->dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta); - } -} - -static void filter_mb_edgecv( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { - H264slice *s= h->s; - const int index_a = qp + s->slice_alpha_c0_offset; - const int alpha = alpha_table[index_a]; - const int beta = beta_table[qp + s->slice_beta_offset]; - if (alpha ==0 || beta == 0) return; - - if( bS[0] < 4 ) { - int8_t tc[4]; - - tc[0] = tc0_table[index_a][bS[0]]+1; - tc[1] = 
tc0_table[index_a][bS[1]]+1; - tc[2] = tc0_table[index_a][bS[2]]+1; - tc[3] = tc0_table[index_a][bS[3]]+1; - - h->dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc); - } else { - h->dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta); - } -} - -static void filter_mb_edgeh( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { - H264slice *s= h->s; - const int index_a = qp + s->slice_alpha_c0_offset; - const int alpha = alpha_table[index_a]; - const int beta = beta_table[qp + s->slice_beta_offset]; - if (alpha ==0 || beta == 0) return; - - if( bS[0] < 4 ) { - int8_t tc[4]; - - tc[0] = tc0_table[index_a][bS[0]]; - tc[1] = tc0_table[index_a][bS[1]]; - tc[2] = tc0_table[index_a][bS[2]]; - tc[3] = tc0_table[index_a][bS[3]]; - - h->dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc); - } else { - h->dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta); - } -} - -static void filter_mb_edgech( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { - H264slice *s= h->s; - const int index_a = qp + s->slice_alpha_c0_offset; - const int alpha = alpha_table[index_a]; - const int beta = beta_table[qp + s->slice_beta_offset]; - if (alpha ==0 || beta == 0) return; - - if( bS[0] < 4 ) { - int8_t tc[4]; - - tc[0] = tc0_table[index_a][bS[0]]+1; - tc[1] = tc0_table[index_a][bS[1]]+1; - tc[2] = tc0_table[index_a][bS[2]]+1; - tc[3] = tc0_table[index_a][bS[3]]+1; - - h->dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc); - } else { - h->dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta); - } -} - -static void filter_mb_dir(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int dir) { - H264Mb *mb = h->mb; - H264slice *s = h->s; - const int qp_xy= mb->qscale_mb_xy; - const int qp_dir = dir == 0 ? mb->qscale_left_mb_xy : mb->qscale_top_mb_xy; - const int mbm_type = dir == 0 ? 
mb->left_type : mb->top_type; - const int mb_type = mb->mb_type; - int edge; - const int edges = mb->edges[dir]; - //int (*ref2frm)[64] = s->ref2frm; - -// int start;//= h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0; -// -// const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP)) -// == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4; -// // how often to recheck mv-based bS when iterating between edges -// const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 : -// (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0; -// // how often to recheck mv-based bS when iterating along each edge -// const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); - -// if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0)) -// start =1; -// else -// start =0; -// -// /* Calculate bS */ -// for( edge = start; edge < edges; edge++ ) { -// const int mbn_type = edge > 0 ? mb_type : mbm_type; -// const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm; -// int (*ref2frmn)[64] = ref2frm;//edge > 0 ? ref2frm : ref2frmm; -// int16_t bS[4]; -// int qp; -// -// if( (edge&1) && IS_8x8DCT(mb_type) ) -// continue; -// -// if( IS_INTRA(mb_type) || -// IS_INTRA(mbn_type) ) { -// int value; -// -// if (edge == 0) { -// value = 4; -// } else { -// value = 3; -// } -// bS[0] = bS[1] = bS[2] = bS[3] = value; -// } else { -// int i, l; -// int mv_done; -// -// if( edge & mask_edge ) { -// -// bS[0] = bS[1] = bS[2] = bS[3] = 0; -// mv_done = 1; -// } -// else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { -// int b_idx= 8 + 4 + edge * (dir ? 8:1); -// int bn_idx= b_idx - (dir ? 
8:1); -// int v = 0; -// -// for( l = 0; !v && l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) { -// v |= ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] || -// FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || -// FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit; -// } -// bS[0] = bS[1] = bS[2] = bS[3] = v; -// -// mv_done = 1; -// } -// else -// mv_done = 0; -// -// for( i = 0; i < 4; i++ ) { -// int x = dir == 0 ? edge : i; -// int y = dir == 0 ? i : edge; -// int b_idx= 8 + 4 + x + 8*y; -// int bn_idx= b_idx - (dir ? 8:1); -// -// if( mb->non_zero_count_cache[b_idx] | -// mb->non_zero_count_cache[bn_idx] ) { -// bS[i] = 2; -// } -// else if(!mv_done) -// { -// bS[i] = 0; -// for( l = 0; l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) { -// if( ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] || -// FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || -// FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) { -// bS[i] = 1; -// break; -// } -// } -// } -// } -// -// if(bS[0]+bS[1]+bS[2]+bS[3] == 0) -// continue; -// } -// qp = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1; - - if(mbm_type){ - int16_t* bS=mb->bS[dir][0]; - /* Filter edge */ - // Do not use s->qscale as luma quantizer because it has not the same - // value in IPCM macroblocks. 
- if(bS[0]+bS[1]+bS[2]+bS[3]){ - int qp = ( qp_xy + qp_dir + 1 ) >> 1; - if( dir == 0 ) { - filter_mb_edgev(h, &img_y[0], linesize, bS, qp); - { - int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; - filter_mb_edgecv(h, &img_cb[0], uvlinesize, bS, qp); - filter_mb_edgecv(h, &img_cr[0], uvlinesize, bS, qp); - } - } else { - filter_mb_edgeh(h, &img_y[0], linesize, bS, qp); - { - int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; - filter_mb_edgech(h, &img_cb[0], uvlinesize, bS, qp); - filter_mb_edgech(h, &img_cr[0], uvlinesize, bS, qp); - } - } - } - } - - for( edge = 1; edge < edges; edge++ ) { - int16_t* bS=mb->bS[dir][edge]; - int qp = qp_xy; - - if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) - continue; - - /* Filter edge */ - // Do not use s->qscale as luma quantizer because it has not the same - // value in IPCM macroblocks. - - if(bS[0]+bS[1]+bS[2]+bS[3] == 0) - continue; - - if( dir == 0 ) { - filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp ); - if( (edge&1) == 0 ) { - filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) ); - filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) ); - } - } else { - filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp ); - if( (edge&1) == 0 ) { - filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) ); - filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) ); - } - } - } -} - -void filter_mb( H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) { - filter_mb_dir(h, img_y, img_cb, img_cr, linesize, uvlinesize, 0); - filter_mb_dir(h, img_y, img_cb, img_cr, linesize, uvlinesize, 1); -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.h 
Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ -#ifndef H264_FILTER_SPU_H -#define H264_FILTER_SPU_H - -#include "types_spu.h" -#include "h264_decode_mb_spu.h" - -#define FFABS(a) ((a) >= 0 ? (a) : (-(a))) - -void filter_mb(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize); - -/* Deblocking filter (p153) */ -static const uint8_t alpha_table[52*3] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 4, 4, 5, 6, - 7, 8, 9, 10, 12, 13, 15, 17, 20, 22, - 25, 28, 32, 36, 40, 45, 50, 56, 63, 71, - 80, 90,101,113,127,144,162,182,203,226, - 255,255, - 255,255,255,255,255,255,255,255,255,255,255,255,255, - 255,255,255,255,255,255,255,255,255,255,255,255,255, - 255,255,255,255,255,255,255,255,255,255,255,255,255, - 255,255,255,255,255,255,255,255,255,255,255,255,255, -}; - -static const uint8_t beta_table[52*3] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, - 3, 3, 3, 4, 4, 4, 6, 6, 7, 7, - 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, - 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, - 18, 18, - 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, - 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, - 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, - 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, -}; - -static const uint8_t tc0_table[52*3][4] = { - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 
0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 }, - {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 }, - {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, - {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 }, - {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 }, - {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 }, - {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 
{-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, -}; - -static inline int get_chroma_qp(H264slice *s, int t, int qscale){ - return s->chroma_qp_table[t][qscale]; -} - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,725 +0,0 @@ -/* - * Copyright (c) 2009 TUDelft - * - * Cell Parallel SPU - 2DWave Macroblock Decoding. - */ - -/** - * @file libavcodec/cell/spu/h264_main_spu.c - * Cell Parallel SPU - 2DWave Macroblock Decoding - * @author C C Chi - * - * SIMD kernels - * H.264/AVC motion compensation - * @author Mauricio Alvarez - * @author Albert Paradis - */ - -#include -#include -#include -//#include "dsputil_cell.h" -#include "types_spu.h" -#include "h264_tables.h" -#include "h264_dma.h" -#include "h264_mc_spu.h" -#include "h264_intra_spu.h" -#include "h264_decode_mb_spu.h" -#include "h264_deblock_spu.h" - -//border buffers -DECLARE_ALIGNED_16(TopBorder, top_ls[240]); -LeftBorder left_ls; - -//mb line buffer - statically allocated for up to 1920 width video -DECLARE_ALIGNED_16(uint8_t, dest_y_ls[2*16*20]); -DECLARE_ALIGNED_16(uint8_t, dest_cb_ls[2*8*10]); -DECLARE_ALIGNED_16(uint8_t, dest_cr_ls[2*8*10]); - -//dma transfer buffer -DECLARE_ALIGNED_16(uint8_t, dma_y_ls [64*(32+20)]); //EDGE_WIDTH = 32 -DECLARE_ALIGNED_16(uint8_t, dma_cb_ls[32*(16+10)]); -DECLARE_ALIGNED_16(uint8_t, dma_cr_ls[32*(16+10)]); - -DECLARE_ALIGNED_16(uint8_t, extra_edge_y [32*(32+20)]); //EDGE_WIDTH = 32 -DECLARE_ALIGNED_16(uint8_t, extra_edge_cr[16*(16+10)]); -DECLARE_ALIGNED_16(uint8_t, extra_edge_cb[16*(16+10)]); - - -// For intra mode -/// for now do the extra copy 
before dma, but it's better to skip this and do the dma right away -static void backup_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ - H264Mb* mb= h->mb; - - int i; - uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y; - uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb; - uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr; - - uint8_t* left_border_y = left_ls.unfiltered_y; - uint8_t* left_border_cb = left_ls.unfiltered_cb; - uint8_t* left_border_cr = left_ls.unfiltered_cr; - - src_y -= linesize; - src_cb -= uvlinesize; - src_cr -= uvlinesize; - - // There are two lines saved, the line above the top macroblock of a pair, - // and the line above the bottom macroblock - left_border_y[0] = top_border_y[15]; - for(i=1; i<17; i++){ - left_border_y[i] = src_y[15+i* linesize]; - } - - *(qword*)(top_border_y)= *(qword*)(src_y + 16*linesize); - - left_border_cb[0] = top_border_cb[7]; - left_border_cr[0] = top_border_cr[7]; - for(i=1; i<9; i++){ - left_border_cb[i] = src_cb[7+i*uvlinesize]; - left_border_cr[i] = src_cr[7+i*uvlinesize]; - } - *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize); - *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize); -} - -static void xchg_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ - H264Mb* mb= h->mb; - H264slice* s = h->s; - - int temp8, i; - uint64_t temp64; - int deblock_left; - int deblock_top; - - uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y; - uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb; - uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr; - uint8_t* top_border_y_next = top_ls[mb->mb_x +1].unfiltered_y; - - uint8_t* left_border_y = left_ls.unfiltered_y; - uint8_t* left_border_cb = left_ls.unfiltered_cb; - uint8_t* left_border_cr = left_ls.unfiltered_cr; - - deblock_left = (mb->mb_x > 0); - deblock_top = (mb->mb_y > 0); - - src_y -= ( 
linesize + 1); - src_cb -= (uvlinesize + 1); - src_cr -= (uvlinesize + 1); - - #define XCHG(a,b,t,xchg)\ - t= a;\ - if(xchg)\ - a= b;\ - b= t; - - if(deblock_left){ - for(i = !deblock_top; i<16; i++){ - XCHG(left_border_y[i], src_y [i* linesize], temp8, xchg); - } - XCHG(left_border_y[i], src_y [i* linesize], temp8, 1); - - for(i = !deblock_top; i<8; i++){ - XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg); - XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg); - } - XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1); - XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1); - } - - if(deblock_top){ - XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg); - XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1); - if(mb->mb_x+1 < s->mb_width){ - XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1); - } - XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1); - XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1); - } -} - -void copy_top_borders(int mb_x, uint8_t *dst_y, uint8_t *dst_cb, uint8_t *dst_cr, int stride_y, int stride_c){ - qword *qsrc_y = (qword *) (top_ls[mb_x].top_borders_y); - dst_y-= 4*stride_y; - - *((qword *) (dst_y + 0*stride_y)) = *qsrc_y++; - *((qword *) (dst_y + 1*stride_y)) = *qsrc_y++; - *((qword *) (dst_y + 2*stride_y)) = *qsrc_y++; - *((qword *) (dst_y + 3*stride_y)) = *qsrc_y++; - - dst_cb-=2*stride_c; - uint64_t *dsrc_cb = (uint64_t *) (top_ls[mb_x].top_borders_cb); - *((uint64_t *) (dst_cb + 0*stride_c)) = *dsrc_cb++; - *((uint64_t *) (dst_cb + 1*stride_c)) = *dsrc_cb++; - - dst_cr-=2*stride_c; - uint64_t *dsrc_cr = (uint64_t *) (top_ls[mb_x].top_borders_cr); - *((uint64_t *) (dst_cr + 0*stride_c)) = *dsrc_cr++; - *((uint64_t *) (dst_cr + 1*stride_c)) = *dsrc_cr++; -} - -static void send_top_borders(H264Context_spu *h, int mb_x, uint8_t* dest_y, uint8_t* dest_cb, uint8_t* dest_cr, int stride_y, int stride_c){ - H264spe *spe= 
&h->spe; - //fill borders (unfiltered borders already filled in backup_mb_border) - dest_y+= 12*stride_y; - qword *qtop_y = (qword *) top_ls[mb_x].top_borders_y; - for(int i=0; i<4; i++){ - qword *qdest_y = (qword *) dest_y; - *qtop_y++ = *qdest_y; - dest_y+=stride_y; - } - dest_cb+= 6*stride_c; - dest_cr+= 6*stride_c; - uint64_t *dtop_cb = (uint64_t *) top_ls[mb_x].top_borders_cb; - uint64_t *dtop_cr = (uint64_t *) top_ls[mb_x].top_borders_cr; - for(int i=0; i<2; i++){ - uint64_t *ddest_cb = (uint64_t *) dest_cb; - uint64_t *ddest_cr = (uint64_t *) dest_cr; - - *dtop_cb++ = *ddest_cb; - *dtop_cr++ = *ddest_cr; - - dest_cb+=stride_c; - dest_cr+=stride_c; - } - uint8_t* top_border_tgt = spe->tgt_spe + (unsigned) &top_ls[mb_x]; - spu_dma_put(&top_ls[mb_x], (unsigned) top_border_tgt, sizeof(TopBorder), MBD_put); -} - -static void extend_edges_left(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr , int lines, int lines_c){ - for (int i=0; is; - - uint8_t *dma_y; - uint8_t *dma_cb; - uint8_t *dma_cr; - - uint8_t *extra_y = extra_edge_y; - uint8_t *extra_cb = extra_edge_cb; - uint8_t *extra_cr = extra_edge_cr; - - int pos = (mb_x+2) %4; - if (mb_x == 0){ - if (mb_y ==0){ - extend_edges_left(&dma_y_ls[32*64], &dma_cb_ls[16*32], &dma_cr_ls[16*32], 12, 6); - }else if (mb_y == s->mb_height -1){ - extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 20, 10); - }else { - extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 16, 8); - } - }else if (mb_x == s->mb_width-1){ - dma_y = &dma_y_ls [(pos+1)*16]; - dma_cb = &dma_cb_ls[(pos+1)*8]; - dma_cr = &dma_cr_ls[(pos+1)*8]; - if (mb_y ==0){ - dma_y += 32*64; - dma_cb += 16*32; - dma_cr += 16*32; - extra_y = extra_edge_y + 32*32; - extra_cb= extra_edge_cb + 16*16; - extra_cr= extra_edge_cr + 16*16; - - if (pos==2){ - extend_edges_right(dma_y, dma_cb, dma_cr, 12, 6, 1); - extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 12, 6); - }else if (pos==3){ - extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, 
extra_cb, extra_cr, 12, 6); - }else{ - extend_edges_right(dma_y, dma_cb, dma_cr, 12, 6, 2); - } - }else if (mb_y == s->mb_height -1){ - if (pos==2){ - extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 1); - extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10); - }else if (pos==3){ - extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10); - }else{ - extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 2); - } - }else { - if (pos==2){ - extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 1); - extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8); - }else if (pos==3){ - extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8); - }else{ - extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 1); - } - } - } - - if (mb_y == 0){ - dma_y = &dma_y_ls [32*64]; - dma_cb = &dma_cb_ls[16*32]; - dma_cr = &dma_cr_ls[16*32]; - extra_y = extra_edge_y + 32*32; - extra_cb= extra_edge_cb + 16*16; - extra_cr= extra_edge_cr + 16*16; - - if (mb_x ==0){ - extend_edges_top (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8); - extend_edges_top (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8); - extend_edges_top (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8); - }else if (mb_x == s->mb_width -1){ - if (pos==2){ - extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); - extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); - extend_extra_edge_top(extra_y, extra_cb, extra_cr); - }else if (pos == 3){ - extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); - extend_extra_edge_top(extra_y, extra_cb, extra_cr); - }else{ - extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); - extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); - extend_edges_top (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8); - } - }else { - extend_edges_top (dma_y + pos*16, dma_cb + pos*8, dma_cr + pos*8); - } - }else if (mb_y == s->mb_height -1){ - 
dma_y = &dma_y_ls [19*64]; - dma_cb = &dma_cb_ls[9*32]; - dma_cr = &dma_cr_ls[9*32]; - extra_y = extra_edge_y + 19*32; - extra_cb= extra_edge_cb + 9*16; - extra_cr= extra_edge_cr + 9*16; - - if (mb_x ==0){ - extend_edges_bottom (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8); - extend_edges_bottom (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8); - extend_edges_bottom (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8); - }else if (mb_x == s->mb_width -1){ - if (pos==2){ - extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); - extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); - extend_extra_edge_bottom(extra_y, extra_cb, extra_cr); - }else if (pos == 3){ - extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); - extend_extra_edge_bottom(extra_y, extra_cb, extra_cr); - }else{ - extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); - extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); - extend_edges_bottom (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8); - } - }else { - extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); - } - } -} - -static void send_pic_data(H264Context_spu *h, int mb_x, int mb_y, int pos, int stride_y, int stride_c){ - H264slice *s = h->s; - int lines, lines_c; - int linesize = s->linesize; - int uvlinesize = s->uvlinesize; - - uint8_t* dst_y = s->dst_y + (mb_x-pos)*16 + (mb_y*16)*linesize; - uint8_t* dst_cb = s->dst_cb +(mb_x-pos)*8 + (mb_y*8)*uvlinesize; - uint8_t* dst_cr = s->dst_cr +(mb_x-pos)*8 + (mb_y*8)*uvlinesize; - - if (mb_y == 0){ - dst_y -= 32 *linesize; - dst_cb-= 16 *uvlinesize; - dst_cr-= 16 *uvlinesize; - }else { - dst_y -= 4 *linesize; - dst_cb-= 2 *uvlinesize; - dst_cr-= 2 *uvlinesize; - } - - if (mb_y == 0){ - lines = 12+32; lines_c=6+16; - }else if (mb_y == s->mb_height-1){ - lines = 20+32; lines_c=10+16; - }else{ - lines = 16; lines_c=8; - } - - put_list = put_list_buf; - put_dma_list(dma_y_ls, dst_y, stride_y, lines, 
linesize, MBD_pic); - put_dma_list(dma_cb_ls, dst_cb, stride_c, lines_c, uvlinesize, MBD_pic); - put_dma_list(dma_cr_ls, dst_cr, stride_c, lines_c, uvlinesize, MBD_pic); - - if (mb_x == s->mb_width-1 && pos>1){ - put_dma_list(extra_edge_y, dst_y+64, 32, lines, linesize, MBD_pic); - put_dma_list(extra_edge_cb, dst_cb+32, 16, lines_c, uvlinesize, MBD_pic); - put_dma_list(extra_edge_cr, dst_cr+32, 16, lines_c, uvlinesize, MBD_pic); - } -} - -void copy_data_and_send(H264Context_spu *h, int mb_x, int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ - H264slice *s = h->s; - int lines, lines_c; - int pos = (mb_x+2)%4; //4 slots in our 64 byte wide transfer buffer. Offset 2 for edge emulation - uint8_t *dma_y = &dma_y_ls[pos*16]; - uint8_t *dma_cb = &dma_cb_ls[pos*8]; - uint8_t *dma_cr = &dma_cr_ls[pos*8]; - - if (mb_y == 0){ - dma_y += 32*64; - dma_cb+= 16*32; - dma_cr+= 16*32; - }else{ - dest_y -= 4*stride_y; - dest_cb-= 2*stride_c; - dest_cr-= 2*stride_c; - } - - if (mb_y == 0){ - lines = 12; lines_c=6; - }else if (mb_y == s->mb_height-1){ - lines = 20; lines_c=10; - }else{ - lines = 16; lines_c=8; - } - - for(int i=0; imb_width-1){ - send_pic_data(h, mb_x, mb_y, pos, 64, 32); - } -} - -static void shift_left(int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ - int lines, lines_c; - if (mb_y > 0){ - lines =20; - lines_c=10; - dest_y -= 4*stride_y; - dest_cb -= 2*stride_c; - dest_cr -= 2*stride_c; - }else { - lines =16; - lines_c= 8; - } - - for (int i=0; is; - H264Mb *mb = h->mb; - const int mb_x= mb->mb_x; - const int mb_y= mb->mb_y; - const int mb_type= mb->mb_type; - - uint8_t *dest_y, *dest_cb, *dest_cr; //ls ptrs (abstracts the fact it is operating in a ls buffer) - - int i; - - void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); - void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); - - dest_y = dest_y_ls + 16 + 4*stride_y; - dest_cb = dest_cb_ls + 8 + 2*stride_c; - 
dest_cr = dest_cr_ls + 8 + 2*stride_c; - - if(IS_8x8DCT(mb_type)){ - idct_dc_add = ff_idct8_dc_add; - idct_add = h->dsp.h264_idct_add[0]; - } - else{ - idct_dc_add = ff_idct_dc_add; - idct_add = h->dsp.h264_idct_add[1]; - } - - if (mb_y>0){ - copy_top_borders(mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c); - } - - if(IS_INTRA(mb_type)){ - xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 1); - - h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cb, stride_c); - h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cr, stride_c); - - if(IS_INTRA4x4(mb_type)){ - if(IS_8x8DCT(mb_type)){ - - for(i=0; i<16; i+=4){ - uint8_t * const ptr= dest_y + block_offset[i]; - const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ]; - const int nnz = mb->non_zero_count_cache[ scan8[i] ]; - h->hpc.pred8x8l[ dir ](ptr, (mb->topleft_samples_available<topright_samples_available<mb[i*16]) - idct_dc_add(ptr, mb->mb + i*16, stride_y); - else{ - idct_add (ptr, mb->mb + i*16, stride_y); - } - } - } - }else{ - for(i=0; i<16; i++){ - uint8_t * const ptr= dest_y + block_offset[i]; - const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ]; - - uint8_t *topright; - int nnz, tr; - if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ - const int topright_avail= (mb->topright_samples_available<hpc.pred4x4[ dir ](ptr, topright, stride_y); - nnz = mb->non_zero_count_cache[ scan8[i] ]; - if(nnz){ - if(nnz == 1 && mb->mb[i*16]) - idct_dc_add(ptr, mb->mb + i*16, stride_y); - else - idct_add (ptr, mb->mb + i*16, stride_y); - } - } - } - - }else{ - h->hpc.pred16x16[ mb->intra16x16_pred_mode ](dest_y , stride_y); - h264_luma_dc_dequant_idct_c(mb->mb, mb->dequant4_coeff_y); - } - xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 0); - - }else { - hl_motion(h, dest_y, dest_cb, dest_cr, stride_y, stride_c); - } - - if(!IS_INTRA4x4(mb_type)){ - if(IS_INTRA16x16(mb_type)){ - for(i=0; i<16; i++){ - if(mb->non_zero_count_cache[ scan8[i] ]) - idct_add(dest_y + block_offset[i], mb->mb + i*16, 
stride_y); - else if(mb->mb[i*16]) - idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y); - } - }else if(mb->cbp&15){ - const int incr = IS_8x8DCT(mb_type) ? 4 : 1; - for(i=0; i<16; i+=incr){ - int nnz = mb->non_zero_count_cache[ scan8[i] ]; - if(nnz){ - if(nnz==1 && mb->mb[i*16]) - idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y); - else - idct_add(dest_y + block_offset[i], mb->mb + i*16, stride_y); - } - } - } - } - - if(mb->cbp&0x30){ - uint8_t *dest[2] = {dest_cb, dest_cr}; - chroma_dc_dequant_idct_c(mb->mb + 16*16, mb->dequant4_coeff_cb); - chroma_dc_dequant_idct_c(mb->mb + 16*16+4*16, mb->dequant4_coeff_cr); - - idct_add = h->dsp.h264_idct_add[1]; - idct_dc_add = ff_idct_dc_add; - for(i=16; i<16+8; i++){ - if(mb->non_zero_count_cache[ scan8[i] ]) - idct_add (dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c); - else if(mb->mb[i*16]) - idct_dc_add(dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c); - } - } - - // save unfiltered borders - backup_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c); - if (mb->deblock_mb){ - filter_mb( h, dest_y, dest_cb, dest_cr, stride_y, stride_c); - } - - if (mb_y < s->mb_height-1){ - if(mb_x>0){ - send_top_borders(h, mb_x-1, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c); - } - if (mb_x == s->mb_width-1){ - send_top_borders(h, mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c); - } - } - update_tgt_spe_dep(h, 0); - - if (h->blocking){ - if (mb_x>0){ - copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c); - wait_dma_id(MBD_pic); - } - if (mb_x == s->mb_width-1){ - copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c); - wait_dma_id(MBD_pic); - } - - }else{ - if (mb_x>0){ - wait_dma_id(MBD_pic); - copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c); - } - if (mb_x == s->mb_width-1){ - wait_dma_id(MBD_pic); - copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, 
stride_y, stride_c); - } - } - - if (mb_x < s->mb_width) - shift_left(mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c); - -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2009 TUDelft - * - * Cell Parallel SPU - 2DWave Macroblock Decoding. - */ - -/** - * @file libavcodec/cell/spu/h264_main_spu.c - * Cell Parallel SPU - 2DWave Macroblock Decoding - * @author C C Chi - * - * SIMD kernels - * H.264/AVC motion compensation - * @author Mauricio Alvarez - * @author Albert Paradis - */ - -#ifndef H264_DECODE_MB_SPU_H -#define H264_DECODE_MB_SPU_H - -#define CELL_SPE -#include "libavcodec/avcodec.h" -#include "types_spu.h" -#include "h264_types_spu.h" -#include "h264_mc_spu.h" -#include "h264_dma.h" -#include "dsputil_spu.h" -#include "h264_intra_spu.h" - -/** - * H264Context - */ -typedef struct H264Context_spu{ - DECLARE_ALIGNED_16(H264spe, spe); // contains simple type parameters that doesn't change - DECLARE_ALIGNED_16(H264Mb, mb_buf[3]); // contains simple type parameters that changes for macroblock - DECLARE_ALIGNED_16(H264slice, slice_buf[2]); // contains simple type parameters that changes for slice - - DSPContext_spu dsp; // struct that contains pointers to mc interpolations functions - H264PredContext_spu hpc; // struct that contains pointers to intra prediction functions - - H264slice *s; - int sl_idx; - int frames; - //mc arg buffer - H264mc mc_buf[2]; - H264mc *mc; //mc ptr to current decoded mb - int mc_idx; - int n_mc; //next mb_id to mc - int mb_proc; - int mb_total; - int curr_line; - - H264Mb* mb; //mb ptr to current decoded mb - int mb_id; //next mb_id to dma - int mb_dec; //mb_buf index - decoded mb - int mb_mc; //mb_buf index - prebuffer motion data - int mb_dma; //mb_buf index - target for dma mb data - int next_mb_idx; -/*// 
for deblocking filter - int edges[2]; - int start[2]; - int bS[2][4][4]; // dir, edge, bS; - int qp[2][4]; // dir, edge; - int chroma_qp[2][2][4]; // cb/cr, dir, edge; -*/ - int blocking; -}H264Context_spu; - -void print_output(H264Context_spu* h, const char* msg); -void hl_decode_mb_internal(H264Context_spu *h, int stride_y, int stride_c); -void update_tgt_spe_dep(H264Context_spu *h, int end); - -// IDCT functions -void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); -void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); - -void ff_idct_dc_add(uint8_t *dst, DCTELEM *block, int stride); -void ff_idct8_dc_add(uint8_t *dst, DCTELEM *block, int stride); - -void ff_cropTbl_init(); -void add_pixels8_c(uint8_t *pixels, DCTELEM *block, int line_size); -void add_pixels4_c(uint8_t *pixels, DCTELEM *block, int line_size); -void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul); -void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul); -// Filter functions -//void calculate_bS_qp(H264Context_spu *h); - -// Motion compensation function -void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc); -void calc_mc_params(H264Mb *mb, H264mc *mc); -void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c); - - -// Function to get traces -void trace_event_SPU(int event, int id); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,332 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 direct mb/block decoding. - * @author Michael Niedermayer - */ -#define CELL_SPE -#include "libavcodec/avcodec.h" -#include "dsputil_spu.h" -#include "h264_tables.h" -#include "h264_types_spu.h" -#include "libavutil/common.h" -#include "libavutil/intreadwrite.h" -#include "mathops_spu.h" -#include "rectangle_spu.h" - -//#undef NDEBUG -#include -static void pred_spatial_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){ - H264Mb *m = s->m; - int b4_stride = hc->b_stride; - const int mb_x = m->mb_x; - int mb_type_col[2]; - const int16_t (*l1mv0)[2], (*l1mv1)[2]; - const int8_t *l1ref0, *l1ref1; - const int is_b8x8 = IS_8X8(*mb_type); - unsigned int sub_mb_type= MB_TYPE_L0L1; - int i8, i4; - int ref[2]; - int mv[2]; - int list; - - //assert(h->ref_list[1][0].reference&3); - -#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM) - - /* ref = min(neighbors) */ - for(list=0; list<2; list++){ - int left_ref = m->ref_cache[list][scan8[0] - 1]; - int top_ref = m->ref_cache[list][scan8[0] - 8]; - int refc = m->ref_cache[list][scan8[0] - 8 + 4]; - const int16_t *C= m->mv_cache[list][ scan8[0] - 8 + 4]; - if(refc == PART_NOT_AVAILABLE){ - refc 
= m->ref_cache[list][scan8[0] - 8 - 1]; - C = m-> mv_cache[list][scan8[0] - 8 - 1]; - } - ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); - if(ref[list] >= 0){ - //this is just pred_motion() but with the cases removed that cannot happen for direct blocks - const int16_t * const A= m->mv_cache[list][ scan8[0] - 1 ]; - const int16_t * const B= m->mv_cache[list][ scan8[0] - 8 ]; - - int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]); - if(match_count > 1){ //most common - mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]), - mid_pred(A[1], B[1], C[1]) ); - }else { - assert(match_count==1); - if(left_ref==ref[list]){ - mv[list]= AV_RN32A(A); - }else if(top_ref==ref[list]){ - mv[list]= AV_RN32A(B); - }else{ - mv[list]= AV_RN32A(C); - } - } - }else{ - int mask= ~(MB_TYPE_L0 << (2*list)); - mv[list] = 0; - ref[list] = -1; - if(!is_b8x8) - *mb_type &= mask; - sub_mb_type &= mask; - } - } - - if(ref[0] < 0 && ref[1] < 0){ - ref[0] = ref[1] = 0; - if(!is_b8x8) - *mb_type |= MB_TYPE_L0L1; - sub_mb_type |= MB_TYPE_L0L1; - } - - if(!(is_b8x8|mv[0]|mv[1])){ - fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); - fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); - fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4); - fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4); - *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; - return; - } - - mb_type_col[0] = - mb_type_col[1] = hc->list1_mb_type[mb_x]; - - sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ - if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ - *mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */ - }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ - *mb_type |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); - }else{ - if(!s->direct_8x8_inference_flag){ - /* FIXME save sub mb types 
from previous frames (or derive from MVs) - * so we know exactly what block size to use */ - sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */ - } - *mb_type |= MB_TYPE_8x8; - } - -// l1mv0 = (void *) &hc->list1_motion_val[0][4*mb_x]; -// l1mv1 = (void *) &hc->list1_motion_val[1][4*mb_x]; - l1mv0 = (void *) hc->list1_motion_val[0]; - l1mv1 = (void *) hc->list1_motion_val[1]; - l1ref0 = &hc->list1_ref_index [0][4*mb_x]; - l1ref1 = &hc->list1_ref_index [1][4*mb_x]; -// if(!b8_stride){ -// if(m->mb_y&1){ -// l1ref0 += 2; -// l1ref1 += 2; -// l1mv0 += 2*b4_stride; -// l1mv1 += 2*b4_stride; -// } -// } - - if(IS_16X16(*mb_type)){ - int a,b; - - fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); - fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); - if(!IS_INTRA(mb_type_col[0]) && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1) - || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1 - ))){ - a=b=0; - if(ref[0] > 0) - a= mv[0]; - if(ref[1] > 0) - b= mv[1]; - }else{ - a= mv[0]; - b= mv[1]; - } - fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, a, 4); - fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, b, 4); - }else{ - int n=0; - for(i8=0; i8<4; i8++){ - const int x8 = i8&1; - const int y8 = i8>>1; - - if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) - continue; - m->sub_mb_type[i8] = sub_mb_type; - - fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4); - fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4); - fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1); - fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1); - - /* col_zero_flag */ - if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 )) - ){ - const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? 
l1mv0 : l1mv1; - if(IS_SUB_8X8(sub_mb_type)){ -// const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; - const int16_t *mv_col = l1mv[x8*3 + y8*3*4]; - if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ - if(ref[0] == 0) - fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); - if(ref[1] == 0) - fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); - n+=4; - } - }else{ - int k=0; - for(i4=0; i4<4; i4++){ - //const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; - const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4]; - if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ - if(ref[0] == 0) - AV_ZERO32(m->mv_cache[0][scan8[i8*4+i4]]); - if(ref[1] == 0) - AV_ZERO32(m->mv_cache[1][scan8[i8*4+i4]]); - k++; - } - } - if(!(k&3)) - m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8; - n+=k; - } - } - } - if(!is_b8x8 && !(n&15)){ - *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; - } - } -} - -static void pred_temp_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){ - H264Mb *m = s->m; - const int mb_x = m->mb_x; - int b4_stride = hc->b_stride; - int mb_type_col[2]; - const int16_t (*l1mv0)[2], (*l1mv1)[2]; - const int8_t *l1ref0, *l1ref1; - const int is_b8x8 = IS_8X8(*mb_type); - unsigned int sub_mb_type; - int i8, i4; - const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]}; - const int *dist_scale_factor = s->dist_scale_factor; - - mb_type_col[0] = - mb_type_col[1] = hc->list1_mb_type[mb_x]; - - sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ - if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ - *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */ - }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ - *mb_type |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); - }else{ - 
if(!s->direct_8x8_inference_flag){ - /* FIXME save sub mb types from previous frames (or derive from MVs) - * so we know exactly what block size to use */ - sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */ - } - *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1; - } - -// l1mv0 = (void *) &hc->list1_motion_val[0][4*mb_x]; -// l1mv1 = (void *) &hc->list1_motion_val[1][4*mb_x]; - l1mv0 = (void *) hc->list1_motion_val[0]; - l1mv1 = (void *) hc->list1_motion_val[1]; - l1ref0 = &hc->list1_ref_index [0][4*mb_x]; - l1ref1 = &hc->list1_ref_index [1][4*mb_x]; - - /* one-to-one mv scaling */ - if(IS_16X16(*mb_type)){ - int ref, mv0, mv1; - - fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1); - if(IS_INTRA(mb_type_col[0])){ - ref=mv0=mv1=0; - }else{ - const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]] - : map_col_to_list0[1][l1ref1[0]]; - const int scale = dist_scale_factor[ref0]; - const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0]; - int mv_l0[2]; - mv_l0[0] = (scale * mv_col[0] + 128) >> 8; - mv_l0[1] = (scale * mv_col[1] + 128) >> 8; - ref= ref0; - mv0= pack16to32(mv_l0[0],mv_l0[1]); - mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]); - } - fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1); - fill_rectangle(&m-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4); - fill_rectangle(&m-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4); - }else{ - for(i8=0; i8<4; i8++){ - const int x8 = i8&1; - const int y8 = i8>>1; - int ref0, scale; - const int16_t (*l1mv)[2]= l1mv0; - - if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) - continue; - m->sub_mb_type[i8] = sub_mb_type; - fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1); - if(IS_INTRA(mb_type_col[0])){ - fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1); - fill_rectangle(&m-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); - fill_rectangle(&m-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); - continue; - } - - ref0 = l1ref0[i8]; - if(ref0 >= 0) - ref0 = 
map_col_to_list0[0][ref0 ]; - else{ - ref0 = map_col_to_list0[1][l1ref1[i8]]; - l1mv= l1mv1; - } - scale = dist_scale_factor[ref0]; - - fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1); - if(IS_SUB_8X8(sub_mb_type)){ -// const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; - const int16_t *mv_col = l1mv[x8*3 + y8*3*4]; - int mx = (scale * mv_col[0] + 128) >> 8; - int my = (scale * mv_col[1] + 128) >> 8; - fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4); - fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4); - }else - for(i4=0; i4<4; i4++){ -// const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; - const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4]; - int16_t *mv_l0 = m->mv_cache[0][scan8[i8*4+i4]]; - mv_l0[0] = (scale * mv_col[0] + 128) >> 8; - mv_l0[1] = (scale * mv_col[1] + 128) >> 8; - AV_WN32A(m->mv_cache[1][scan8[i8*4+i4]], - pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1])); - } - } - } -} - -void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){ - if(s->direct_spatial_mv_pred){ - pred_spatial_direct_motion(hc, s, mb_type); - }else{ - pred_temp_direct_motion(hc, s, mb_type); - } -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ -#ifndef H264_DIRECT_H -#define H264_DIRECT_H - -#include "h264_types_spu.h" - -void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ -#include -#include "h264_dma.h" - -DECLARE_ALIGNED_16(dma_list_elem_t, put_list_buf[2*(52+26+26)]); 
-dma_list_elem_t* put_list; - -DECLARE_ALIGNED_16(dma_list_elem_t, get_list_buf[16*(4+5 + 2*3)]); -dma_list_elem_t* get_list; - -inline void spu_dma_get(void *ls, unsigned ea, int size, int tag){ - mfc_get(ls, ea, size, tag, 0, 0); -} - -inline void spu_dma_put(void *ls, unsigned ea, int size, int tag){ - mfc_put(ls, ea, size, tag, 0, 0); -} - -inline void spu_dma_barrier_put(void *ls, unsigned ea, int size, int tag){ - mfc_putb(ls, ea, size, tag, 0, 0); -} - -// Function that wait to finish a DMA transfer with especific id -inline void wait_dma_id(int id){ - spu_writech(MFC_WrTagMask, 1<< id); - (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); -} - -// Functions to get/put a block from/to main memory -void get_dma_list(void *dst, void* ea, unsigned int w, unsigned int h, unsigned int stride, unsigned int tag, int barrier) -{ - unsigned int i = 0; - unsigned int listsize; - unsigned int ea_low; - - dma_list_elem_t* list = get_list; - get_list+=h; - - ea_low=(uint32_t) mfc_ea2l(ea); - - /* Create the list, size of each list id the "width" parameter defined by the user */ - for ( i=0; i - * - * SIMD kernels - * H.264/AVC motion compensation - * @author Mauricio Alvarez - * @author Albert Paradis - */ - - -#include -#include -#include - -#include "h264_filter_spu.h" -#include "h264_decode_mb_spu.h" -// To use scan8 table -#include "h264_mc_spu.h" - - -int get_chroma_qp(H264Context_spu *h, int t, int qscale){ - return h->slice.chroma_qp_table[t][qscale]; -} - -static inline int clip(int a, int amin, int amax){ - if (a < amin) - return amin; - else if (a > amax) - return amax; - else - return a; -} - -static inline vsint16_t clip_altivec(vsint16_t a, vsint16_t amin, vsint16_t amax){ - vector unsigned short min_mask,max_mask; - min_mask = spu_cmpgt(amin, a); - max_mask = spu_cmpgt(a, amax); - - return spu_sel(spu_sel(a,amin,min_mask),amax,max_mask); -} - -static inline vsint16_t clip_uint8_altivec(vsint16_t a){ - const vsint16_t amax = {255,255,255,255,255,255,255,255}; - const 
vsint16_t amin = {0, 0, 0, 0, 0, 0, 0, 0}; - vector unsigned short min_mask,max_mask; - min_mask = spu_cmpgt(amin, a); - max_mask = spu_cmpgt(a, amax); - - return spu_sel(spu_sel(a,amin,min_mask),amax,max_mask); -} - -static inline void h264_loop_filter_chroma(vsint16_t *pix, int alpha, int beta, int8_t *tc0){ - - short a = (short) tc0[0]; - short b = (short) tc0[1]; - short c = (short) tc0[2]; - short d = (short) tc0[3]; - const vsint16_t vec_tc0 = {a,a,b,b,c,c,d,d}; - const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0}; - vector unsigned short mask_B0; - - mask_B0 = spu_cmpgt(vec_v0, vec_tc0); - - const vsint16_t p0 = pix[-1]; - const vsint16_t p1 = pix[-2]; - const vsint16_t q0 = pix[0]; - const vsint16_t q1 = pix[1]; - - const vsint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha}; - const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta}; - const vsint16_t v_2 = {2,2,2,2,2,2,2,2}; - const vuint16_t v_3 = {3,3,3,3,3,3,3,3}; - const vsint16_t v_4 = {4,4,4,4,4,4,4,4}; - - vsint16_t rp0; - vsint16_t rq0; - vsint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0; - vector unsigned short mask_B1, mask_tmp; - vsint16_t i_delta; - - abs_p0mq0 = (vector signed short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); - abs_p1mp0 = (vector signed short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); - abs_q1mq0 = (vector signed short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); - - mask_B1 = spu_cmpgt(v_alpha, abs_p0mq0); - mask_tmp = spu_cmpgt(v_beta, abs_p1mp0); - mask_B1 = spu_and(mask_B1, mask_tmp); - mask_tmp = spu_cmpgt( v_beta, abs_q1mq0); - mask_B1 = spu_and(mask_B1, mask_tmp); - - - i_delta = clip_altivec(spu_rlmaska(spu_add(spu_sl(spu_sub(q0,p0 ), (vuint16_t)v_2), 
spu_add(spu_sub(p1,q1),v_4)), (vsint16_t)-v_3), -vec_tc0, vec_tc0); - - rp0 = clip_uint8_altivec( spu_add(p0,i_delta)); - rq0 = clip_uint8_altivec( spu_sub(q0,i_delta)); - - pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0); - pix[0] = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0); -} - -static void h264_v_loop_filter_luma_c(vsint16_t *pix, int alpha, int beta, int8_t *tc0, int inc_low2high){ - - short a = (short) tc0[0 + inc_low2high]; - short b = (short) tc0[1 + inc_low2high]; - const vsint16_t vec_tc0 = {a,a,a,a,b,b,b,b}; - const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0}; - vector unsigned short mask_B0; - - mask_B0 = spu_cmpgt(vec_v0, vec_tc0); - const vsint16_t p0 = pix[-1]; - const vsint16_t p1 = pix[-2]; - const vsint16_t p2 = pix[-3]; - const vsint16_t q0 = pix[0]; - const vsint16_t q1 = pix[1]; - const vsint16_t q2 = pix[2]; - - const vuint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha}; - const vuint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta}; - - const vuint16_t v_1 = {1,1,1,1,1,1,1,1}; - const vuint16_t v_2 = {2,2,2,2,2,2,2,2}; - const vuint16_t v_3 = {3,3,3,3,3,3,3,3}; - const vsint16_t v_4 = {4,4,4,4,4,4,4,4}; - - vsint16_t rp0, rp1; - vsint16_t rq0, rq1; - vsint16_t tc0_B2P, tc0_B2Q, rtc0; - vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0; - vector unsigned short mask_B1, mask_B2P, mask_B2Q, mask_tmp; - vsint16_t i_delta, i_delta2; - - abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); - abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); - abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); - abs_p2mp0 = (vector unsigned 
short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0); - abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0); - - mask_B1 = spu_cmpgt(v_alpha, abs_p0mq0); - mask_tmp = spu_cmpgt(v_beta, abs_p1mp0); - mask_B1 = spu_and(mask_B1, mask_tmp); - mask_tmp = spu_cmpgt( v_beta, abs_q1mq0); - mask_B1 = spu_and(mask_B1, mask_tmp); - - mask_B2P = spu_cmpgt(v_beta, abs_p2mp0); - mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0); - - rp1 = spu_add(p1, clip_altivec(spu_sub(spu_rlmaska(spu_add(p2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), p1), -vec_tc0, vec_tc0 )); - rq1 = spu_add(q1, clip_altivec(spu_sub(spu_rlmaska(spu_add(q2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), q1), -vec_tc0, vec_tc0 )); - - tc0_B2P = spu_add(vec_tc0, (vsint16_t) v_1); - tc0_B2P = spu_sel(vec_tc0, tc0_B2P, mask_B2P); - - tc0_B2Q = spu_add(tc0_B2P, (vsint16_t) v_1); - rtc0 = spu_sel(tc0_B2P, tc0_B2Q, mask_B2Q); - i_delta2 = spu_add(spu_sub(p1,q1),v_4); - i_delta = spu_sl(spu_sub(q0,p0 ), v_2); - i_delta = spu_add(i_delta,i_delta2 ); - i_delta = spu_rlmaska(i_delta, (vsint16_t)-v_3); - i_delta = clip_altivec(i_delta, -rtc0, rtc0); - - rp0 = clip_uint8_altivec( spu_add(p0,i_delta)); /* p0' */ - rq0 = clip_uint8_altivec( spu_sub(q0,i_delta)); /* q0' */ - - pix[-2] = spu_sel(spu_sel(p1,spu_sel(p1,rp1,mask_B2P) ,mask_B1), p1,mask_B0); - pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0); - pix[0] = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0); - pix[1] = spu_sel(spu_sel(q1,spu_sel(q1,rq1,mask_B2Q) ,mask_B1), q1,mask_B0); -} - - - -static inline void h264_loop_filter_chroma_intra(vsint16_t *pix, int alpha, int beta){ - - const vuint16_t p0 = (vuint16_t) pix[-1]; - const vuint16_t p1 = (vuint16_t) pix[-2]; - const vuint16_t q0 = (vuint16_t) pix[0]; - const vuint16_t q1 = (vuint16_t) pix[1]; - - const vsint16_t v_alpha = {(signed short) 
alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha}; - const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta}; - const vuint16_t v_2 = {2,2,2,2,2,2,2,2}; - - vuint16_t rp0; - vuint16_t rq0; - vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0; - vector unsigned short mask_B0, mask_tmp; - - abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); - abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); - abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); - - mask_B0 = spu_cmpgt(v_alpha, (vsint16_t)abs_p0mq0); - mask_tmp = spu_cmpgt(v_beta, (vsint16_t)abs_p1mp0); - mask_B0 = spu_and(mask_B0, mask_tmp); - mask_tmp = spu_cmpgt( v_beta, (vsint16_t)abs_q1mq0); - mask_B0 = spu_and(mask_B0, mask_tmp); - - rp0 = spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2; - rp0 = spu_rlmaska(rp0, (vsint16_t)-v_2); - rq0 = spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2; - rq0 = spu_rlmaska(rq0, (vsint16_t)-v_2); - - pix[-1] = (vsint16_t) spu_sel(p0, rp0, mask_B0); - pix[0] = (vsint16_t) spu_sel(q0, rq0, mask_B0); -} -int slice_alpha_c0_offset; -int slice_beta_offset; -static void filter_mb_edgecv(vsint16_t *pix, int bS[4], int qp ) { - int i; - const int index_a = qp + slice_alpha_c0_offset; - const int alpha = (alpha_table+52)[index_a]; - const int beta = (beta_table+52)[qp + slice_beta_offset]; - - if( bS[0] < 4 ) { - int8_t tc[4]; - for(i=0; i<4; i++) - tc[i] = bS[i] ? 
tc0_table[index_a][bS[i] - 1] + 1 : 0; - h264_loop_filter_chroma(pix, alpha, beta, tc); - } else { - h264_loop_filter_chroma_intra(pix, alpha, beta); - } -} - -static void filter_mb_edgeh(vsint16_t *pix, int bS[4], int qp, int inc_low2high ) { - int i; - const int index_a = qp + slice_alpha_c0_offset; - const int alpha = (alpha_table+52)[index_a]; - const int beta = (beta_table+52)[qp + slice_beta_offset]; - - if( bS[0] < 4 ) { - int8_t tc[4]; - for(i=0; i<4; i++) - tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1; - h264_v_loop_filter_luma_c(pix, alpha, beta, tc, inc_low2high); - } else { - - const vuint16_t p0 = (vuint16_t) pix[-1]; - const vuint16_t p1 = (vuint16_t) pix[-2]; - const vuint16_t p2 = (vuint16_t) pix[-3]; - const vuint16_t p3 = (vuint16_t) pix[-4]; - const vuint16_t q0 = (vuint16_t) pix[0]; - const vuint16_t q1 = (vuint16_t) pix[1]; - const vuint16_t q2 = (vuint16_t) pix[2]; - const vuint16_t q3 = (vuint16_t) pix[3]; - - const vuint16_t v_alpha = {(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha}; - const vuint16_t v_beta = {(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta}; - const vuint16_t v_2 = {2,2,2,2,2,2,2,2}; - const vuint16_t v_3 = {3,3,3,3,3,3,3,3}; - const vsint16_t v_4 = {4,4,4,4,4,4,4,4}; - - vuint16_t rp0_B1f, rp0_B2t, rp0_B2f, rp1_B2t, rp2_B2t; - vuint16_t rq0_B1f, rq0_B2t, rq0_B2f, rq1_B2t, rq2_B2t; - vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0; - vuint16_t v_alpha_2 = spu_rlmaska(v_alpha, (vsint16_t)-v_2); - vector unsigned short mask_B0, mask_B1, mask_B2P, mask_B2Q, mask_tmp; - - v_alpha_2 = spu_add(v_alpha_2, v_2); - - abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); - abs_p1mp0 = (vector unsigned 
short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); - abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); - abs_p2mp0 = (vector unsigned short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0); - abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0); - - mask_B0 = spu_cmpgt(v_alpha, abs_p0mq0); - mask_tmp = spu_cmpgt(v_beta, abs_p1mp0); - mask_B0 = spu_and(mask_B0, mask_tmp); - mask_tmp = spu_cmpgt( v_beta, abs_q1mq0); - mask_B0 = spu_and(mask_B0, mask_tmp); - - mask_B1 = spu_cmpgt(v_alpha_2, abs_p0mq0); - mask_B2P = spu_cmpgt(v_beta,abs_p2mp0); - mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0); - - rp0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p2,p1),spu_add(p1,p0)),spu_add(spu_add(p0,q0),spu_add(q0,q1))),(vuint16_t)v_4),(vsint16_t) -v_3); - //( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; - rp1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p2,p1),spu_add(q0,p0)),v_2),(vsint16_t)-v_2);//( p2 + p1 + p0 + q0 + 2 ) >> 2; - rp2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p3,p3),spu_add(p2,p2)),spu_add(spu_add(p2,p1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3); - //( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; - rq0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p1,p0),spu_add(p0,q0)),spu_add(spu_add(q0,q1),spu_add(q1,q2))),(vuint16_t)v_4),(vsint16_t)-v_3); - - //( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; - rq1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p0,q0),spu_add(q1,q2)),v_2),(vsint16_t)-v_2);//( p0 + q0 + q1 + q2 + 2 ) >> 2; - rq2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(q3,q3),spu_add(q2,q2)),spu_add(spu_add(q2,q1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3); - //( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; - rp0_B1f = - rp0_B2f = spu_rlmaska(spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2),(vsint16_t)-v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2; - rq0_B1f = - rq0_B2f = 
spu_rlmaska(spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2),(vsint16_t)-v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2; - - pix[-1] = (vsint16_t) spu_sel(p0, spu_sel(rp0_B1f, spu_sel(rp0_B2f, rp0_B2t, mask_B2P), mask_B1), mask_B0); - pix[-2] = (vsint16_t) spu_sel(p1, spu_sel(p1, spu_sel(p1, rp1_B2t, mask_B2P), mask_B1), mask_B0); - pix[-3] = (vsint16_t) spu_sel(p2, spu_sel(p2, spu_sel(p2, rp2_B2t, mask_B2P), mask_B1), mask_B0); - pix[0] = (vsint16_t) spu_sel(q0, spu_sel(rq0_B1f, spu_sel(rq0_B2f, rq0_B2t, mask_B2Q), mask_B1), mask_B0); - pix[1] = (vsint16_t) spu_sel(q1, spu_sel(q1, spu_sel(q1, rq1_B2t,mask_B2Q), mask_B1), mask_B0); - pix[2] = (vsint16_t) spu_sel(q2, spu_sel(q2, spu_sel(q2, rq2_B2t,mask_B2Q), mask_B1), mask_B0); - } -} - -// This function gets bS and qp for luma and chroma before the filter -void calculate_bS_qp(H264Context_spu *h){ - H264mb* mb = &h->mb; - H264slice* slice = h->slice; - int dir; - const int mvy_limit = 4; - /* FIXME: A given frame may occupy more than one position in - * the reference list. So ref2frm should be populated with - * frame numbers, not indices. */ - - int (*ref2frm)[64] = slice->ref2frm; - int mb_x = mb->mb_x; - int mb_y = mb->mb_y; - int mb_type =mb->mb_type; - /* dir : 0 -> vertical edge, 1 -> horizontal edge */ - for( dir = 0; dir < 2; dir++ ){ - int edge; - const int mbm_type = dir == 0 ? mb->mb_type_xy_n1 : mb->mb_type_top; - const int8_t qscale_mbm = dir == 0 ? mb->qscale_mbxy_n1 : mb->qscale_mbxy_top; - - // how often to recheck mv-based bS when iterating between edges - const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :(mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0; - // how often to recheck mv-based bS when iterating along each edge - const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); - - h->edges[dir] = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP)) == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 
1 : 4; - - if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0)) - h->start[dir] =1; - else - h->start[dir] =0; - - /* Calculate bS */ - for( edge = h->start[dir]; edge < h->edges[dir]; edge++ ) { - /* mbn_xy: neighbor macroblock */ - const int mbn_type = edge > 0 ? mb_type : mbm_type; - const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm; - int* bS = h->bS[dir][edge]; - - if( (edge&1) && IS_8x8DCT(mb_type) ){ - bS[0] = bS[1] = bS[2] = bS[3] = 0; //extra code due to decoupling - continue; - } - if( IS_INTRA(mb_type) || - IS_INTRA(mbn_type) ) { - int value; - if (edge == 0) { - value = 4; - } else { - value = 3; - } - bS[0] = bS[1] = bS[2] = bS[3] = value; - } else { - int i, l; - int mv_done; - - if( edge & mask_edge ) { - bS[0] = bS[1] = bS[2] = bS[3] = 0; - mv_done = 1; - } - else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { - int b_idx= 8 + 4 + edge * (dir ? 8:1); - int bn_idx= b_idx - (dir ? 8:1); - int v = 0; - - for( l = 0; !v && l < 1 + (slice->slice_type_nos == FF_B_TYPE); l++ ) { - v |= ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] || - FFABS(mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || - FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit; - } - bS[0] = bS[1] = bS[2] = bS[3] = v; - - mv_done = 1; - } - else - mv_done = 0; - - for( i = 0; i < 4; i++ ) { - int x = dir == 0 ? edge : i; - int y = dir == 0 ? i : edge; - int b_idx= 8 + 4 + x + 8*y; - int bn_idx= b_idx - (dir ? 
8:1); - - if( mb->non_zero_count_cache[b_idx] != 0 || - mb->non_zero_count_cache[bn_idx] != 0 ) { - bS[i] = 2; - } - else if(!mv_done) - { - bS[i] = 0; - for( l = 0; l < 1 + (slice->slice_type == B_TYPE); l++ ) { - if( ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] || - FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || - FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) { - bS[i] = 1; - break; - } - } - } - } - - if(bS[0]+bS[1]+bS[2]+bS[3] == 0) - continue; - } - - /* Filter edge */ - // Do not use s->qscale as luma quantizer because it has not the same - // value in IPCM macroblocks. - h->qp[dir][edge] = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1; - h->chroma_qp[0][dir][edge] = ( mb->chroma_qp[0] + get_chroma_qp(h, 0, qscale_mbn_xy ) + 1 ) >> 1; - - h->chroma_qp[1][dir][edge] = ( mb->chroma_qp[1] + get_chroma_qp(h, 1, qscale_mbn_xy ) + 1 ) >> 1; - } - slice_alpha_c0_offset=slice->slice_alpha_c0_offset; - slice_beta_offset= slice->slice_beta_offset; - } -} - - -#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7,merge_h,merge_l) \ - b0 = spu_shuffle( a0, a4, merge_h); \ - b1 = spu_shuffle( a0, a4, merge_l ); \ - b2 = spu_shuffle( a1, a5, merge_h ); \ - b3 = spu_shuffle( a1, a5, merge_l ); \ - b4 = spu_shuffle( a2, a6, merge_h ); \ - b5 = spu_shuffle( a2, a6, merge_l ); \ - b6 = spu_shuffle( a3, a7, merge_h ); \ - b7 = spu_shuffle( a3, a7, merge_l ); \ - a0 = spu_shuffle( b0, b4, merge_h ); \ - a1 = spu_shuffle( b0, b4, merge_l ); \ - a2 = spu_shuffle( b1, b5, merge_h ); \ - a3 = spu_shuffle( b1, b5, merge_l ); \ - a4 = spu_shuffle( b2, b6, merge_h ); \ - a5 = spu_shuffle( b2, b6, merge_l); \ - a6 = spu_shuffle( b3, b7, merge_h ); \ - a7 = spu_shuffle( b3, b7, merge_l ); \ - b0 = spu_shuffle( a0, a4, merge_h ); \ - b1 = spu_shuffle( a0, a4, merge_l ); \ - b2 = spu_shuffle( a1, a5, merge_h ); \ - b3 = spu_shuffle( a1, a5, merge_l); \ - b4 = spu_shuffle( a2, a6, merge_h 
); \ - b5 = spu_shuffle( a2, a6, merge_l ); \ - b6 = spu_shuffle( a3, a7, merge_h ); \ - b7 = spu_shuffle( a3, a7, merge_l ) - -void filter_mb_spu(vsint16_t *img_y, vsint16_t *img_cb, vsint16_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int edges[2], int bS[2][4][4], int qp[2][4], int chroma_qp[2][2][4], int start[2]){ - - int dir,x; - vsint16_t o_vec_img_y[(16+8)*2]; - vsint16_t t_vec_img_y[(16+8)*2]; - vsint16_t *vec_img_y_o = o_vec_img_y; - vsint16_t *vec_img_y_t = t_vec_img_y; - - vsint16_t o_vec_img_cb[8+8+4]; - vsint16_t t_vec_img_cb[8+8]; - vsint16_t *vec_img_cb_o = &o_vec_img_cb[2]; - vsint16_t *vec_img_cb_t = t_vec_img_cb; - - vsint16_t o_vec_img_cr[8+8+4]; - vsint16_t t_vec_img_cr[8+8]; - vsint16_t *vec_img_cr_o = &o_vec_img_cr[2]; - vsint16_t *vec_img_cr_t = t_vec_img_cr; - - vuint8_t *pvec_tmp; - - const vuint8_t patt_high = {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}; - const vuint8_t patt_low = {16, 8, 17, 9, 18, 10, 19, 11, 20, 12, 21, 13, 22, 14, 23, 15}; - const vuint8_t patt_unpack={ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; - const vuint8_t patt_pack_hw={0, 1, 2, 3, 4, 5, 6, 7, 17, 19, 21, 23, 25, 27, 29, 31}; - const vuint8_t patt_pack_chroma_aligned={0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F, - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}; - const vuint8_t patt_pack_chroma_unaligned={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F}; - const vuint8_t v_0 = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - const vuint8_t mergehu16 = {0x00,0x01,0x10,0x11,0x02,0x03,0x12,0x13,0x04,0x05,0x14,0x15,0x06,0x07,0x16,0x17}; - const vuint8_t mergelu16 = {0x08,0x09,0x18,0x19,0x0A,0x0B,0x1A,0x1B,0x0C,0x0D,0x1C,0x1D,0x0E,0x0F,0x1E,0x1F}; - vuint8_t store_chroma, store_chroma_n1, load_chroma, load_chroma_n1; - int mb_xy_n1; - const int unalign_chroma = (unsigned int) img_cb & 15; - - if(unalign_chroma==0){ - load_chroma = patt_high; - load_chroma_n1 = patt_low; // for load 
chroma mb_x-1 - store_chroma = patt_pack_chroma_aligned; - store_chroma_n1 = patt_pack_chroma_unaligned; // for store chroma mb_x-1 - mb_xy_n1 = 1; // si no hay desalineamineto se necesita el bloque anterior para filtrar horizontalmente - } - else{ - load_chroma = patt_low; - load_chroma_n1 = patt_high; // for load mb_x-1 - store_chroma = patt_pack_chroma_unaligned; - store_chroma_n1 = patt_pack_chroma_aligned; // for store chroma mb_x-1 - mb_xy_n1 = 0; // si hay desalineamineto 8 no se necesita el bloque anterior - } - - /* dir : 0 -> vertical edge, 1 -> horizontal edge */ - - // LOAD MB_X -1 - - for (x = 0; x < 16; x++){ //Unpack Memory to 8 positions vector - vec_img_y_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize - 1], v_0 , patt_low); - } - - for (x = 0; x < 8; x++){ //Unpack Memory to 8 positions vector - vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cb[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1); - vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cr[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1); - } - - VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16); - - 
VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16); - - vec_img_y_t = &vec_img_y_t[8]; - vec_img_y_o = &vec_img_y_o[8]; - vec_img_cb_t = &vec_img_cb_t[8]; - vec_img_cb_o = &vec_img_cb_o[10]; - vec_img_cr_t = &vec_img_cr_t[8]; - vec_img_cr_o = &vec_img_cr_o[10]; - - //LOAD CURRENT MB - for (x = 0; x < 16; x++){ //Unpack Memory to 8 positions vector - pvec_tmp = (vuint8_t *) &img_y[x*linesize]; - vec_img_y_o[x] = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_high); - vec_img_y_o[x+24] = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_low); - } - - for (x = 0; x < 8; x++){ //Unpack Memory to 8 positions vector - vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , load_chroma); - vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma); - } - - //TRANSPOSE MATRIX - - VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31], vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], 
vec_img_y_t[15],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39], vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16); - - //PROCESS - dir = 0; - { - int edge; - for( edge = start[dir]; edge < edges[dir]; edge++ ) { - if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0) - { - filter_mb_edgeh( &vec_img_y_t[4*edge ], bS[dir][edge], qp[dir][edge],0);//low - filter_mb_edgeh( &vec_img_y_t[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high - - if( (edge&1) == 0 ) { - filter_mb_edgecv( &vec_img_cb_t[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] ); - filter_mb_edgecv( &vec_img_cr_t[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] ); - } - } - } - } - - //SAVE MB_X -1 RESULTS - - VEC_TRANSPOSE_8(vec_img_y_t[-8], vec_img_y_t[-7], vec_img_y_t[-6], vec_img_y_t[-5], vec_img_y_t[-4], vec_img_y_t[-3], vec_img_y_t[-2], vec_img_y_t[-1], vec_img_y_o[-8], vec_img_y_o[-7], vec_img_y_o[-6], vec_img_y_o[-5], vec_img_y_o[-4], vec_img_y_o[-3], vec_img_y_o[-2], vec_img_y_o[-1],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_y_t[16], vec_img_y_t[17], vec_img_y_t[18], vec_img_y_t[19], vec_img_y_t[20], vec_img_y_t[21], 
vec_img_y_t[22], vec_img_y_t[23], vec_img_y_o[16], vec_img_y_o[17], vec_img_y_o[18], vec_img_y_o[19], vec_img_y_o[20], vec_img_y_o[21], vec_img_y_o[22], vec_img_y_o[23],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_cb_t[ -8], vec_img_cb_t[-7], vec_img_cb_t[-6], vec_img_cb_t[-5], vec_img_cb_t[-4], vec_img_cb_t[-3], vec_img_cb_t[-2], vec_img_cb_t[-1], vec_img_cb_o[-10], vec_img_cb_o[-9], vec_img_cb_o[-8], vec_img_cb_o[-7], vec_img_cb_o[-6], vec_img_cb_o[-5], vec_img_cb_o[-4], vec_img_cb_o[-3],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_cr_t[ -8], vec_img_cr_t[-7], vec_img_cr_t[-6], vec_img_cr_t[-5], vec_img_cr_t[-4], vec_img_cr_t[-3], vec_img_cr_t[-2], vec_img_cr_t[-1], vec_img_cr_o[-10], vec_img_cr_o[-9], vec_img_cr_o[-8], vec_img_cr_o[-7], vec_img_cr_o[-6], vec_img_cr_o[-5], vec_img_cr_o[-4], vec_img_cr_o[-3],mergehu16, mergelu16); - - for (x = 0; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory - img_y[x*linesize - 1] = spu_shuffle(img_y[x*linesize - 1], vec_img_y_o[-8+x], patt_pack_hw); - } - - for (x = 0; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory - img_y[(x+8)*linesize - 1] = spu_shuffle(img_y[(x+8)*linesize - 1], vec_img_y_o[16+x], patt_pack_hw); - } - - for (x = 0; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory - img_cb[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cb[x*uvlinesize - mb_xy_n1], vec_img_cb_o[-10+x], store_chroma_n1); - img_cr[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cr[x*uvlinesize - mb_xy_n1], vec_img_cr_o[-10+x], store_chroma_n1); - } - - //TRANSPOSE MATRIX - - VEC_TRANSPOSE_8(vec_img_y_t[ 0], vec_img_y_t[ 1], vec_img_y_t[ 2], vec_img_y_t[ 3], vec_img_y_t[ 4], vec_img_y_t[ 5], vec_img_y_t[ 6], vec_img_y_t[ 7], vec_img_y_o[ 0], vec_img_y_o[ 1], vec_img_y_o[ 2], vec_img_y_o[ 3], vec_img_y_o[ 4], vec_img_y_o[ 5], vec_img_y_o[ 6], vec_img_y_o[ 7],mergehu16, mergelu16); - - 
VEC_TRANSPOSE_8(vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], vec_img_y_t[15], vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31], vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39], vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7], vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7],mergehu16, mergelu16); - - VEC_TRANSPOSE_8(vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7], vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7],mergehu16, mergelu16); - - - //LOAD MB_Y - 1 - for (x = -4; x < 0; x++){ //Unpack Memory to 8 positions vector - vec_img_y_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_high); - vec_img_y_o[x+24] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_low); - } - - for (x = -2; x < 0; x++){ //Unpack Memory to 8 positions vector - vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , 
load_chroma); - vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma); - } - - //PROCESS - dir = 1; - { - int edge; - for( edge = start[dir]; edge < edges[dir]; edge++ ) { - if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0) - { - filter_mb_edgeh( &vec_img_y_o[4*edge ], bS[dir][edge], qp[dir][edge],0);//low - filter_mb_edgeh( &vec_img_y_o[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high - if( (edge&1) == 0 ) { - filter_mb_edgecv( &vec_img_cb_o[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] ); - filter_mb_edgecv( &vec_img_cr_o[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] ); - } - } - } - - for (x = -3; x < 16; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory - img_y[x*linesize] = spu_shuffle(vec_img_y_o[x], vec_img_y_o[x+24], patt_unpack); - } - - for (x = -1; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory - img_cb[x*uvlinesize] = spu_shuffle(img_cb[x*uvlinesize], vec_img_cb_o[x], store_chroma); - img_cr[x*uvlinesize] = spu_shuffle(img_cr[x*uvlinesize], vec_img_cr_o[x], store_chroma); - } - } -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,408 +0,0 @@ -/* - * Copyright (c) 2009 TUDelft - * - * Cell Parallel SPU - Macroblock Decoding. 
- */ - -/** - * @file libavcodec/cell/spu/h264_main_spu.c - * Cell Parallel SPU - Macroblock Decoding - * @author C C Chi - * - * SIMD kernels - * H.264/AVC motion compensation - * @author Mauricio Alvarez - * @author Albert Paradis - */ - -#include -#include "types_spu.h" -#include "h264_tables.h" -#include "h264_idct_spu.h" -#include "h264_intra_spu.h" - -/*********************************************************************** - * ff_h264_idct_add_spu - *********************************************************************** - * h264 idct 4x4 transform with SPU SIMD intrinsics - * using the factorized algorithm - * Mauricio Alvarez: alvarez@ac.upc.edu - * - DCTELEM* block: transformed coefficients are stored consecutvely in memory, - * - for the 4x4 transform the structure is like that: - * || coef_00 | coef_01 || coef_02 | coef_03 ||..||coef_0F|| - * - Usually the DCTELEM block is declared with an alignment modificator in such a way - * that the array is 128 bit (16 byte, 8 short) aligned. - * - The dst pointer can be unaligned with unaligment as a multiple of 4. 
- ***********************************************************************/ - -// idct_dc -void ff_idct_dc_add(uint8_t *dst, short *block, int stride){ - int i, j; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - int dc = (block[0] + 32) >> 6; - for( j = 0; j < 4; j++ ){ - for( i = 0; i < 4; i++ ) - dst[i] = cm[ dst[i] + dc ]; - dst += stride; - } -} - -void ff_idct8_dc_add(uint8_t *dst, short *block, int stride){ - int i, j; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - int dc = (block[0] + 32) >> 6; - for( j = 0; j < 8; j++ ){ - for( i = 0; i < 8; i++ ) - dst[i] = cm[ dst[i] + dc ]; - dst += stride; - } -} - -// add without idct - -void add_pixels8_c(uint8_t *pixels, short *block, int line_size) -{ - int i; - for(i=0;i<8;i++) { - pixels[0] += block[0]; - pixels[1] += block[1]; - pixels[2] += block[2]; - pixels[3] += block[3]; - pixels[4] += block[4]; - pixels[5] += block[5]; - pixels[6] += block[6]; - pixels[7] += block[7]; - pixels += line_size; - block += 8; - } -} - -void add_pixels4_c(uint8_t *pixels, short *block, int line_size) -{ - int i; - for(i=0;i<4;i++) { - pixels[0] += block[0]; - pixels[1] += block[1]; - pixels[2] += block[2]; - pixels[3] += block[3]; - pixels += line_size; - block += 4; - } -} - -void h264_luma_dc_dequant_idct_c(short *block, int qmul){ - #define stride 16 - int i; - int temp[16]; //FIXME check if this is a good idea - static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; - static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; - - for(i=0; i<4; i++){ - const int offset= y_offset[i]; - const int z0= block[offset+stride*0] + block[offset+stride*4]; - const int z1= block[offset+stride*0] - block[offset+stride*4]; - const int z2= block[offset+stride*1] - block[offset+stride*5]; - const int z3= block[offset+stride*1] + block[offset+stride*5]; - - temp[4*i+0]= z0+z3; - temp[4*i+1]= z1+z2; - temp[4*i+2]= z1-z2; - temp[4*i+3]= z0-z3; - } - - for(i=0; i<4; i++){ - const int offset= x_offset[i]; - const int z0= temp[4*0+i] + 
temp[4*2+i]; - const int z1= temp[4*0+i] - temp[4*2+i]; - const int z2= temp[4*1+i] - temp[4*3+i]; - const int z3= temp[4*1+i] + temp[4*3+i]; - - block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual - block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); - block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); - block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); - } -} -#undef stride - -void chroma_dc_dequant_idct_c(short *block, int qmul){ - const int stride= 16*2; - const int xStride= 16; - int a,b,c,d,e; - - a= block[stride*0 + xStride*0]; - b= block[stride*0 + xStride*1]; - c= block[stride*1 + xStride*0]; - d= block[stride*1 + xStride*1]; - - e= a-b; - a= a+b; - b= c-d; - c= c+d; - - block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7; - block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7; - block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7; - block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7; -} - -void h264_idct4_add_spu(uint8_t *dst, short *block, int stride) -{ - vsint16_t __vz0, __vz1, __vz2, __vz3; // used as temporal storage in for VEC_1D_DCT - vsint16_t va0, va1, va2, va3; - vsint16_t vtmp0, vtmp1, vtmp2, vtmp3; - vuint16_t sat; - vuint8_t va_u8; - vsint16_t vdst_ss; - vuint8_t dstperm; - vuint8_t vdst, vdst_orig, vfdst; - const int16_t imax = 255; - const vsint32_t vzero = spu_splats(0); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - const int shift_dst = (unsigned int) dst & 15; - const vuint8_t packu16 = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F); - const vuint8_t mergehu8 = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17); - //for optimized matrix transpose: - const vuint8_t tr0 =AVV(0x00,0x01,0x08,0x09,0x10,0x11,0x18,0x19,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); - const vuint8_t tr1 =AVV(0x02,0x03,0x0A,0x0B,0x12,0x13,0x1A,0x1B,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); - const vuint8_t tr2 
=AVV(0x04,0x05,0x0C,0x0D,0x14,0x15,0x1C,0x1D,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); - const vuint8_t tr3 =AVV(0x06,0x07,0x0E,0x0F,0x16,0x17,0x1E,0x1F,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); - const vuint8_t conc =AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17); - - block[0] += 32; // add 32 as a DC-level for rounding - - //load matrix - vtmp0 = *(vsint16_t *)(block); - vtmp1 = spu_rlqwbyte(vtmp0,8); - vtmp2 = *(vsint16_t *)(block+8); - vtmp3 = spu_rlqwbyte(vtmp2,8); - - VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); - - //concatenate first two rows of matrix - va0=spu_shuffle(va0,va1,conc); - //concatenate last two rows of matrix - va2=spu_shuffle(va2,va3,conc); - - //do transpose starting from two vectors, storing as four vectors of which the second part is unused - vtmp0 = spu_shuffle( va0, va2, tr0); - vtmp1 = spu_shuffle( va0, va2, tr1); - vtmp2 = spu_shuffle( va0, va2, tr2); - vtmp3 = spu_shuffle( va0, va2, tr3); - - VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); - - // division by 64 - va0 = spu_rlmaska(va0,-6); - va1 = spu_rlmaska(va1,-6); - va2 = spu_rlmaska(va2,-6); - va3 = spu_rlmaska(va3,-6); - - switch (shift_dst){ - case 0: { - dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - } break; - case 4: { - dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - } break; - case 8: { - dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x10, 0x11, 0x12, 0x13, 0x0C, 0x0D, 0x0E, 0x0F); - } break; - case 12: { - dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13); - } break; - default: { - dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - } break; - } - - 
VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm); - dst += stride; - VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm); - dst += stride; - VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm); - dst += stride; - VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm); -} - -void h264_idct8_add_spu(uint8_t *dst, short *block, int stride) -{ - vsint16_t va0, va1, va2, va3, va4, va5, va6, va7; - vsint16_t vza0, vza1, vza2, vza3, vza4, vza5, vza6, vza7, vzal,vzah; - vsint16_t vzb0, vzb1, vzb2, vzb3, vzb4, vzb5, vzb6, vzb7; - vsint16_t vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, vtmp6, vtmp7; - vuint16_t sat; - vuint8_t va_u8; - const int block_stride=8; - vsint16_t vdst_ss; - const int16_t imax = 255; - const vsint32_t vzero = spu_splats(0); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint8_t vdst, vdst_orig, vfdst; - vuint8_t dstperm; - const int shift_dst = (unsigned int) dst & 15; - const vuint8_t packu16 = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F); - const vuint8_t mergehu8 = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17); - const vuint8_t m1 = AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17); - const vuint8_t m2 = AVV(0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F); - const vuint8_t m3 = AVV(0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x18,0x19,0x1A,0x1B); - const vuint8_t m4 = AVV(0x14,0x15,0x16,0x17,0x04,0x05,0x06,0x07,0x1C,0x1D,0x1E,0x1F,0x0C,0x0D,0x0E,0x0F); - const vuint8_t m5 = AVV(0x00,0x01,0x10,0x11,0x04,0x05,0x14,0x15,0x08,0x09,0x18,0x19,0x0C,0x0D,0x1C,0x1D); - const vuint8_t m6 = AVV(0x12,0x13,0x02,0x03,0x16,0x17,0x06,0x07,0x1A,0x1B,0x0A,0x0B,0x1E,0x1F,0x0E,0x0F); - - block[0] += 32; // add 32 as a DC-level for rounding - - vtmp0 = *(vsint16_t *)(block); - vtmp1 = *(vsint16_t *)(block + block_stride); - vtmp2 = *(vsint16_t 
*)(block + 2*block_stride); - vtmp3 = *(vsint16_t *)(block + 3*block_stride); - vtmp4 = *(vsint16_t *)(block + 4*block_stride); - vtmp5 = *(vsint16_t *)(block + 5*block_stride); - vtmp6 = *(vsint16_t *)(block + 6*block_stride); - vtmp7 = *(vsint16_t *)(block + 7*block_stride); - - VEC_1D_DCT8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7); - VEC_TRANSPOSE_8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7,va0,va1,va2,va3,va4,va5,va6,va7); - VEC_1D_DCT8(va0, va1, va2, va3, va4, va5, va6, va7); - - va0 = spu_rlmaska(va0,-6); - va1 = spu_rlmaska(va1,-6); - va2 = spu_rlmaska(va2,-6); - va3 = spu_rlmaska(va3,-6); - va4 = spu_rlmaska(va4,-6); - va5 = spu_rlmaska(va5,-6); - va6 = spu_rlmaska(va6,-6); - va7 = spu_rlmaska(va7,-6); - - if (shift_dst==8) - dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17); - else dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - - VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm); - dst += stride; - VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm); - dst += stride; - VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm); - dst += stride; - VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm); - dst += stride; - VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va4,dstperm); - dst += stride; - VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va5,dstperm); - dst += stride; - VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va6,dstperm); - dst += stride; - VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va7,dstperm); - -} - -/* - -void h264_idct4_add_spu(uint8_t *dst, short *block, int stride){ - int i; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - - block[0] += 32; - - for(i=0; i<4; i++){ - const int z0= block[0 + 4*i] + block[2 + 4*i]; - const int z1= block[0 + 4*i] - block[2 + 4*i]; - const int z2= (block[1 + 4*i]>>1) - 
block[3 + 4*i]; - const int z3= block[1 + 4*i] + (block[3 + 4*i]>>1); - - block[0 + 4*i]= z0 + z3; - block[1 + 4*i]= z1 + z2; - block[2 + 4*i]= z1 - z2; - block[3 + 4*i]= z0 - z3; - } - - for(i=0; i<4; i++){ - const int z0= block[i + 4*0] + block[i + 4*2]; - const int z1= block[i + 4*0] - block[i + 4*2]; - const int z2= (block[i + 4*1]>>1) - block[i + 4*3]; - const int z3= block[i + 4*1] + (block[i + 4*3]>>1); - - dst[i + 0*stride]= cm[ dst[i + 0*stride] + ((z0 + z3) >> 6) ]; - dst[i + 1*stride]= cm[ dst[i + 1*stride] + ((z1 + z2) >> 6) ]; - dst[i + 2*stride]= cm[ dst[i + 2*stride] + ((z1 - z2) >> 6) ]; - dst[i + 3*stride]= cm[ dst[i + 3*stride] + ((z0 - z3) >> 6) ]; - } -} - -void h264_idct8_add_spu(uint8_t *dst, short *block, int stride){ - int i; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - - block[0] += 32; - - for( i = 0; i < 8; i++ ) - { - const int a0 = block[0+i*8] + block[4+i*8]; - const int a2 = block[0+i*8] - block[4+i*8]; - const int a4 = (block[2+i*8]>>1) - block[6+i*8]; - const int a6 = (block[6+i*8]>>1) + block[2+i*8]; - - const int b0 = a0 + a6; - const int b2 = a2 + a4; - const int b4 = a2 - a4; - const int b6 = a0 - a6; - - const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1); - const int a3 = block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1); - const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1); - const int a7 = block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1); - - const int b1 = (a7>>2) + a1; - const int b3 = a3 + (a5>>2); - const int b5 = (a3>>2) - a5; - const int b7 = a7 - (a1>>2); - - block[0+i*8] = b0 + b7; - block[7+i*8] = b0 - b7; - block[1+i*8] = b2 + b5; - block[6+i*8] = b2 - b5; - block[2+i*8] = b4 + b3; - block[5+i*8] = b4 - b3; - block[3+i*8] = b6 + b1; - block[4+i*8] = b6 - b1; - } - for( i = 0; i < 8; i++ ) - { - const int a0 = block[i+0*8] + block[i+4*8]; - const int a2 = block[i+0*8] - block[i+4*8]; - const int a4 = (block[i+2*8]>>1) - block[i+6*8]; - 
const int a6 = (block[i+6*8]>>1) + block[i+2*8]; - - const int b0 = a0 + a6; - const int b2 = a2 + a4; - const int b4 = a2 - a4; - const int b6 = a0 - a6; - - const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1); - const int a3 = block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1); - const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1); - const int a7 = block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1); - - const int b1 = (a7>>2) + a1; - const int b3 = a3 + (a5>>2); - const int b5 = (a3>>2) - a5; - const int b7 = a7 - (a1>>2); - - dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ]; - dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ]; - dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ]; - dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ]; - dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b6 - b1) >> 6) ]; - dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ]; - dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ]; - dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ]; - } -}*/ - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,141 +0,0 @@ -#ifndef H264_IDCT_SPU_H -#define H264_IDCT_SPU_H - -void h264_idct4_add_spu(uint8_t *dst, short *block, int stride); -void h264_idct8_add_spu(uint8_t *dst, short *block, int stride); - -/*********************************************************************** - * VEC_1D_IDCT - *********************************************************************** - * 1-dimensional 4x4 H264 integer DCT inverse transform. - * Actually source and destination are 8x4. The low elements of the - * source are discarded and the low elements of the destination mustn't - * be used. 
- * __vz0-__vz3 registers need to be declared in the caller function - ***********************************************************************/ -#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \ - /* 1st stage */ \ - __vz0 = spu_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \ - __vz1 = spu_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \ - __vz2 = spu_rlmaska(vb1,-1); \ - __vz2 = spu_sub(__vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \ - __vz3 = spu_rlmaska(vb3,-1); \ - __vz3 = spu_add(vb1,__vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \ - \ - /* 2nd stage: output */ \ - va0 = spu_add(__vz0,__vz3); /* x[0] = temp[0] + temp[3] */ \ - va1 = spu_add(__vz1,__vz2); /* x[1] = temp[1] + temp[2] */ \ - va2 = spu_sub(__vz1,__vz2); /* x[2] = temp[1] - temp[2] */ \ - va3 = spu_sub(__vz0,__vz3) /* x[3] = temp[0] - temp[3] */ - -/*********************************************************************** - * VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8 - *********************************************************************** - * load a vuint8_t vector from a unaligned memory position p - * Converts the vector to vsint16_t - * Adds the loaded and converted vector to a defined vector va - * converts back the result to vuint8_t and store it to memory - **********************************************************************/ - -#define VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(p,shift,va,align_dst) \ - vdst_orig = *(vuint8_t *) (p); \ - vdst = spu_or(spu_slqwbyte(vdst_orig, shift),(vuint8_t) vzero); \ - vdst_ss = (vsint16_t) spu_shuffle((vuint8_t)vzero,vdst,mergehu8); \ - va = spu_add(va,vdst_ss); \ - sat = spu_cmpgt(va,(vsint16_t)vzero); \ - va = spu_and(va,(vsint16_t)sat); \ - sat = spu_cmpgt(va,vmax); \ - va = spu_sel(va,vmax,sat); \ - va_u8 = (vuint8_t) spu_shuffle(va,(vsint16_t) vzero,packu16); \ - vfdst = spu_shuffle(vdst_orig, va_u8, align_dst); \ - *(vuint8_t *) (dst) = vfdst - -/*********************************************************************** - * VEC_TRANSPOSE_8 - 
*********************************************************************** - * Transposes a 8x8 matrix of s16 vectors - **********************************************************************/ -#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \ - b0 = spu_shuffle( a0, a4, m1 ); \ - b1 = spu_shuffle( a1, a5, m1 ); \ - b2 = spu_shuffle( a2, a6, m1 ); \ - b3 = spu_shuffle( a3, a7, m1 ); \ - b4 = spu_shuffle( a4, a0, m2 ); \ - b5 = spu_shuffle( a5, a1, m2 ); \ - b6 = spu_shuffle( a6, a2, m2 ); \ - b7 = spu_shuffle( a7, a3, m2 ); \ - a0 = spu_shuffle( b0, b2, m3 ); \ - a1 = spu_shuffle( b1, b3, m3 ); \ - a2 = spu_shuffle( b2, b0, m4 ); \ - a3 = spu_shuffle( b3, b1, m4 ); \ - a4 = spu_shuffle( b4, b6, m3 ); \ - a5 = spu_shuffle( b5, b7, m3 ); \ - a6 = spu_shuffle( b6, b4, m4 ); \ - a7 = spu_shuffle( b7, b5, m4 ); \ - b0 = spu_shuffle( a0, a1, m5 ); \ - b1 = spu_shuffle( a1, a0, m6 ); \ - b2 = spu_shuffle( a2, a3, m5 ); \ - b3 = spu_shuffle( a3, a2, m6 ); \ - b4 = spu_shuffle( a4, a5, m5 ); \ - b5 = spu_shuffle( a5, a4, m6 ); \ - b6 = spu_shuffle( a6, a7, m5 ); \ - b7 = spu_shuffle( a7, a6, m6 ) - -/*********************************************************************** - * VEC_1D_IDCT8 - *********************************************************************** - * 1-dimensional 8x8 H264 integer DCT inverse transform. 
- ***********************************************************************/ -#define VEC_1D_DCT8(vb0,vb1,vb2,vb3,vb4,vb5,vb6,vb7) \ - vza0 = spu_add(vb0,vb4); /* a[0] = Y[0] + Y[4] */ \ - vza2 = spu_sub(vb0,vb4); /* a[2] = Y[0] - Y[4] */ \ - vza4 = spu_rlmaska(vb2,-1); \ - vza4 = spu_sub(vza4,vb6); /* a[4] = Y[2]>>1 - Y[6] */ \ - vza6 = spu_rlmaska(vb6,-1 ); \ - vza6 = spu_add(vb2,vza6); /* a[6] = Y[2] + Y[6]>>1 */ \ - \ - vzb0 = spu_add(vza0,vza6); /* b[0] = a[0] + a[6] */ \ - vzb2 = spu_add(vza2,vza4); /* b[2] = a[2] + a[4] */ \ - vzb4 = spu_sub(vza2,vza4); /* b[4] = a[2] - a[4] */ \ - vzb6 = spu_sub(vza0,vza6); /* b[6] = a[0] - a[6] */ \ - \ - vza1 = spu_rlmaska(vb7,-1); \ - vzal = spu_add(vza1,vb7); \ - vzah = spu_sub(vb5,vb3); \ - vza1 = spu_sub(vzah,vzal); /* a1 = (-Y[3] + Y[5]) - (Y[7] + (Y[7]>>1)) */ \ - \ - vza3 = spu_rlmaska(vb3,-1); \ - vzal = spu_add(vza3,vb3); \ - vzah = spu_add(vb1,vb7); \ - vza3 = spu_sub(vzah,vzal); /* a3 = (Y[1] + Y[7]) - (Y[3] + (Y[3]>>1)) */ \ - \ - vza5 = spu_rlmaska(vb5,-1); \ - vzal = spu_add(vza5,vb5); \ - vzah = spu_sub(vb7,vb1); \ - vza5 = spu_add(vzah,vzal); /* a5 = (-Y[1] + Y[7]) + (Y[5] + Y[5]>>1)) */ \ - \ - vza7 = spu_rlmaska(vb1,-1); \ - vzal = spu_add(vza7,vb1); \ - vzah = spu_add(vb3,vb5); \ - vza7 = spu_add(vzah,vzal); /* a7 = (Y[3] + Y[5]) + (Y[1] + (Y[1]>>1)) */ \ - \ - vzb1 = spu_rlmaska(vza7,-2); \ - vzb1 = spu_add(vzb1,vza1); /* b1 = (a7>>2) + a1 */ \ - vzb3 = spu_rlmaska(vza5,-2); \ - vzb3 = spu_add(vzb3,vza3); /* b3 = a3 + (a5>>2) */ \ - vzb5 = spu_rlmaska(vza3,-2); \ - vzb5 = spu_sub(vzb5,vza5); /* b5 = (a3>>2) - a5 */ \ - vzb7 = spu_rlmaska(vza1,-2); \ - vzb7 = spu_sub(vza7,vzb7); /* b7 = a7 - (a1>>2) */ \ - \ - vb0 = spu_add(vzb0,vzb7); /* src[i][0] = b0 + b7 */ \ - vb7 = spu_sub(vzb0,vzb7); /* src[i][7] = b0 - b7 */ \ - vb1 = spu_add(vzb2,vzb5); /* src[i][1] = b2 + b5 */ \ - vb6 = spu_sub(vzb2,vzb5); /* src[i][6] = b2 - b5 */ \ - vb2 = spu_add(vzb4,vzb3); /* src[i][2] = b4 + b3 */ \ - vb5 = 
spu_sub(vzb4,vzb3); /* src[i][5] = b4 - b3 */ \ - vb3 = spu_add(vzb6,vzb1); /* src[i][3] = b6 + b1 */ \ - vb4 = spu_sub(vzb6,vzb1); /* src[i][4] = b6 - b1 */ - - -#endif /*H264_IDCT_SPU_H*/ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,802 +0,0 @@ -#include "types_spu.h" -#include "h264_tables.h" -#include "h264_intra_spu.h" -#include - -void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const uint32_t a= ((uint32_t*)(src-stride))[0]; - ((uint32_t*)(src+0*stride))[0]= a; - ((uint32_t*)(src+1*stride))[0]= a; - ((uint32_t*)(src+2*stride))[0]= a; - ((uint32_t*)(src+3*stride))[0]= a; -} - -void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101; - ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101; - ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101; - ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101; -} - -void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] - + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; -} - -void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; - - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; -} - -void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){ - (void) 
topright; - const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; - - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; -} - -void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U; -} - - -#define LOAD_TOP_RIGHT_EDGE\ - const int t4= topright[0];\ - const int t5= topright[1];\ - const int t6= topright[2];\ - const int t7= topright[3];\ - -#define LOAD_LEFT_EDGE\ - const int l0= src[-1+0*stride];\ - const int l1= src[-1+1*stride];\ - const int l2= src[-1+2*stride];\ - const int l3= src[-1+3*stride];\ - -#define LOAD_TOP_EDGE\ - const int t0= src[ 0-1*stride];\ - const int t1= src[ 1-1*stride];\ - const int t2= src[ 2-1*stride];\ - const int t3= src[ 3-1*stride];\ - -void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const int lt= src[-1-1*stride]; - LOAD_TOP_EDGE - LOAD_LEFT_EDGE - - src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2; - src[0+2*stride]= - src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2; - src[0+1*stride]= - src[1+2*stride]= - src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2; - src[0+0*stride]= - src[1+1*stride]= - src[2+2*stride]= - src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2; - src[1+0*stride]= - src[2+1*stride]= - src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2; - src[2+0*stride]= - src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2; -} - -void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){ - LOAD_TOP_EDGE - LOAD_TOP_RIGHT_EDGE -// LOAD_LEFT_EDGE - - src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2; - src[1+0*stride]= - src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2; - src[2+0*stride]= - src[1+1*stride]= - src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2; - src[3+0*stride]= - src[2+1*stride]= - 
src[1+2*stride]= - src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2; - src[3+1*stride]= - src[2+2*stride]= - src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2; - src[3+2*stride]= - src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2; - src[3+3*stride]=(t6 + 3*t7 + 2)>>2; -} - -void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const int lt= src[-1-1*stride]; - LOAD_TOP_EDGE - LOAD_LEFT_EDGE - (void) l3; - - src[0+0*stride]= - src[1+2*stride]=(lt + t0 + 1)>>1; - src[1+0*stride]= - src[2+2*stride]=(t0 + t1 + 1)>>1; - src[2+0*stride]= - src[3+2*stride]=(t1 + t2 + 1)>>1; - src[3+0*stride]=(t2 + t3 + 1)>>1; - src[0+1*stride]= - src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2; - src[1+1*stride]= - src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2; - src[2+1*stride]= - src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2; - src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2; - src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; -} - -void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){ - LOAD_TOP_EDGE - LOAD_TOP_RIGHT_EDGE - (void) t7; - - src[0+0*stride]=(t0 + t1 + 1)>>1; - src[1+0*stride]= - src[0+2*stride]=(t1 + t2 + 1)>>1; - src[2+0*stride]= - src[1+2*stride]=(t2 + t3 + 1)>>1; - src[3+0*stride]= - src[2+2*stride]=(t3 + t4+ 1)>>1; - src[3+2*stride]=(t4 + t5+ 1)>>1; - src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[1+1*stride]= - src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; - src[2+1*stride]= - src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; - src[3+1*stride]= - src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; - src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; -} - -void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - LOAD_LEFT_EDGE - - src[0+0*stride]=(l0 + l1 + 1)>>1; - src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2; - src[2+0*stride]= - src[0+1*stride]=(l1 + l2 + 1)>>1; - src[3+0*stride]= - src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2; - src[2+1*stride]= - src[0+2*stride]=(l2 + l3 + 1)>>1; - src[3+1*stride]= - src[1+2*stride]=(l2 + 2*l3 
+ l3 + 2)>>2; - src[3+2*stride]= - src[1+3*stride]= - src[0+3*stride]= - src[2+2*stride]= - src[2+3*stride]= - src[3+3*stride]=l3; -} - -void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const int lt= src[-1-1*stride]; - LOAD_TOP_EDGE - LOAD_LEFT_EDGE - (void) t3; - - src[0+0*stride]= - src[2+1*stride]=(lt + l0 + 1)>>1; - src[1+0*stride]= - src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2; - src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2; - src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[0+1*stride]= - src[2+2*stride]=(l0 + l1 + 1)>>1; - src[1+1*stride]= - src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2; - src[0+2*stride]= - src[2+3*stride]=(l1 + l2+ 1)>>1; - src[1+2*stride]= - src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; - src[0+3*stride]=(l2 + l3 + 1)>>1; - src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2; -} - -void ff_pred16x16_vertical_c(uint8_t *src, int stride){ - int i; - const vuint32_t v= *((vuint32_t*)(src-stride)); - for(i=0; i<4; i++){ - *((vuint32_t*) src ) =v; - *((vuint32_t*)(src + stride)) =v; - *((vuint32_t*)(src + 2*stride)) =v; - *((vuint32_t*)(src + 3*stride)) =v; - src+= 4*stride; - } - - /*const uint32_t a= ((uint32_t*)(src-stride))[0]; - const uint32_t b= ((uint32_t*)(src-stride))[1]; - const uint32_t c= ((uint32_t*)(src-stride))[2]; - const uint32_t d= ((uint32_t*)(src-stride))[3]; - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= a; - ((uint32_t*)(src+i*stride))[1]= b; - ((uint32_t*)(src+i*stride))[2]= c; - ((uint32_t*)(src+i*stride))[3]= d; - }*/ -} - -void ff_pred16x16_horizontal_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101; - } -} - -void ff_pred16x16_dc_c(uint8_t *src, int stride){ - int i; - int dc=0; - for(i=0;i<16; i++){ - dc+= src[-1+i*stride]; - } - - for(i=0;i<16; i++){ - dc+= src[i-stride]; - } - dc= 0x01010101*((dc + 16)>>5); - - 
for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc; - } -} - -void ff_pred16x16_left_dc_c(uint8_t *src, int stride){ - int i; - - int dc=0; - for(i=0;i<16; i++){ - dc+= src[-1+i*stride]; - } - dc= 0x01010101*((dc + 8)>>4); - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc; - } -} - -void ff_pred16x16_top_dc_c(uint8_t *src, int stride){ - int i; - int dc0=0; - for(i=0;i<16; i++){ - dc0+= src[i-stride]; - } - - dc0= 0x01010101*((dc0 + 8)>>4); - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc0; - } -} - -void ff_pred16x16_128_dc_c(uint8_t *src, int stride){ - int i; - - /*const vuint32_t v= AVV(0x01010101U*128U, 0x01010101U*128U,0x01010101U*128U,0x01010101U*128U); - for(i=0; i<4; i++){ - *((vuint32_t*) src ) =v; - *((vuint32_t*)(src + stride)) =v; - *((vuint32_t*)(src + 2*stride)) =v; - *((vuint32_t*)(src + 3*stride)) =v; - src+= 4*stride; - }*/ - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U; - } -} - -void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){ - int i, j, k; - int a; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - const uint8_t * const src0 = src+7-stride; - const uint8_t *src1 = src+8*stride-1; - const uint8_t *src2 = src1-2*stride; // == src+6*stride-1; - int H = src0[1] - src0[-1]; - int V = src1[0] - src2[ 0]; - for(k=2; k<=8; ++k) { - src1 += stride; src2 -= stride; - H += k*(src0[k] - src0[-k]); - V += k*(src1[0] - src2[ 0]); - } - if(svq3){ - H = ( 5*(H/4) ) / 16; - V = ( 5*(V/4) ) / 16; - - /* required for 100% accuracy */ - i = H; H = V; V = i; - }else{ - H = ( 5*H+32 ) >> 
6; - V = ( 5*V+32 ) >> 6; - } - - a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); - for(j=16; j>0; --j) { - int b = a; - a += V; - for(i=-16; i<0; i+=4) { - src[16+i] = cm[ (b ) >> 5 ]; - src[17+i] = cm[ (b+ H) >> 5 ]; - src[18+i] = cm[ (b+2*H) >> 5 ]; - src[19+i] = cm[ (b+3*H) >> 5 ]; - b += 4*H; - } - src += stride; - } -} - -void ff_pred16x16_plane_c(uint8_t *src, int stride){ - pred16x16_plane_compat_c(src, stride, 0); -} - -void ff_pred8x8_vertical_c(uint8_t *src, int stride){ - int i; - const uint32_t a= ((uint32_t*)(src-stride))[0]; - const uint32_t b= ((uint32_t*)(src-stride))[1]; - - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= a; - ((uint32_t*)(src+i*stride))[1]= b; - } -} - -void ff_pred8x8_horizontal_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101; - } -} - -void ff_pred8x8_128_dc_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U; - } -} - -void ff_pred8x8_left_dc_c(uint8_t *src, int stride){ - int i; - int dc0, dc2; - - dc0=dc2=0; - for(i=0;i<4; i++){ - dc0+= src[-1+i*stride]; - dc2+= src[-1+(i+4)*stride]; - } - dc0= 0x01010101*((dc0 + 2)>>2); - dc2= 0x01010101*((dc2 + 2)>>2); - - for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= dc0; - } - for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= dc2; - } -} - -void ff_pred8x8_top_dc_c(uint8_t *src, int stride){ - int i; - int dc0, dc1; - - dc0=dc1=0; - for(i=0;i<4; i++){ - dc0+= src[i-stride]; - dc1+= src[4+i-stride]; - } - dc0= 0x01010101*((dc0 + 2)>>2); - dc1= 0x01010101*((dc1 + 2)>>2); - - for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; - } - for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; - } -} - - -void 
ff_pred8x8_dc_c(uint8_t *src, int stride){ - int i; - int dc0, dc1, dc2, dc3; - - dc0=dc1=dc2=0; - for(i=0;i<4; i++){ - dc0+= src[-1+i*stride] + src[i-stride]; - dc1+= src[4+i-stride]; - dc2+= src[-1+(i+4)*stride]; - } - dc3= 0x01010101*((dc1 + dc2 + 4)>>3); - dc0= 0x01010101*((dc0 + 4)>>3); - dc1= 0x01010101*((dc1 + 2)>>2); - dc2= 0x01010101*((dc2 + 2)>>2); - - for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; - } - for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= dc2; - ((uint32_t*)(src+i*stride))[1]= dc3; - } -} - -void ff_pred8x8_plane_c(uint8_t *src, int stride){ - int j, k; - int a; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - const uint8_t * const src0 = src+3-stride; - const uint8_t *src1 = src+4*stride-1; - const uint8_t *src2 = src1-2*stride; // == src+2*stride-1; - int H = src0[1] - src0[-1]; - int V = src1[0] - src2[ 0]; - for(k=2; k<=4; ++k) { - src1 += stride; src2 -= stride; - H += k*(src0[k] - src0[-k]); - V += k*(src1[0] - src2[ 0]); - } - H = ( 17*H+16 ) >> 5; - V = ( 17*V+16 ) >> 5; - - a = 16*(src1[0] + src2[8]+1) - 3*(V+H); - for(j=8; j>0; --j) { - int b = a; - a += V; - src[0] = cm[ (b ) >> 5 ]; - src[1] = cm[ (b+ H) >> 5 ]; - src[2] = cm[ (b+2*H) >> 5 ]; - src[3] = cm[ (b+3*H) >> 5 ]; - src[4] = cm[ (b+4*H) >> 5 ]; - src[5] = cm[ (b+5*H) >> 5 ]; - src[6] = cm[ (b+6*H) >> 5 ]; - src[7] = cm[ (b+7*H) >> 5 ]; - src += stride; - } -} - - -#define SRC(x,y) src[(x)+(y)*stride] -#define PL(y) \ - const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; -#define PREDICT_8x8_LOAD_LEFT \ - const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \ - + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \ - PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \ - const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 - -#define PT(x) \ - const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; -#define PREDICT_8x8_LOAD_TOP \ - const int t0 = ((has_topleft ? 
SRC(-1,-1) : SRC(0,-1)) \ - + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \ - PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \ - const int t7 = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \ - + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2 - -#define PTR(x) \ - t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; -#define PREDICT_8x8_LOAD_TOPRIGHT \ - int t8, t9, t10, t11, t12, t13, t14, t15; \ - if(has_topright) { \ - PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \ - t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \ - } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); - -#define PREDICT_8x8_LOAD_TOPLEFT \ - const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2 - -#define PREDICT_8x8_DC(v) \ - int y; \ - for( y = 0; y < 8; y++ ) { \ - ((uint32_t*)src)[0] = \ - ((uint32_t*)src)[1] = v; \ - src += stride; \ - } - -static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - (void) has_topright; - (void) has_topleft; - PREDICT_8x8_DC(0x80808080); -} -static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - (void) has_topright; - PREDICT_8x8_LOAD_LEFT; - const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; - PREDICT_8x8_DC(dc); -} -static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; - PREDICT_8x8_DC(dc); -} -static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOP; - const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7 - +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101; - PREDICT_8x8_DC(dc); -} -static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - (void) has_topright; - PREDICT_8x8_LOAD_LEFT; -#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\ - ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y - ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); 
ROW(6); ROW(7); -#undef ROW -} -static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - int y; - PREDICT_8x8_LOAD_TOP; - src[0] = t0; - src[1] = t1; - src[2] = t2; - src[3] = t3; - src[4] = t4; - src[5] = t5; - src[6] = t6; - src[7] = t7; - for( y = 1; y < 8; y++ ) - *(uint64_t*)(src+y*stride) = *(uint64_t*)src; -} -static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_TOPRIGHT; - SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2; - SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2; - SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2; - SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2; - SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2; - SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2; - SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2; - SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2; - SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2; - SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; - SRC(7,7)= (t14 + 3*t15 + 2) >> 2; -} -static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOPLEFT; - SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; - SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2; - SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2; - SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2; - SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + 
l1 + 2) >> 2; - SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2; - SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2; - SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; - SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2; - SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; - SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; - -} -static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOPLEFT; - (void) l7; - SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; - SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; - SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; - SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2; - SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2; - SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2; - SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2; - SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; - SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2; - SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1; - SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1; - SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1; - SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1; - SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1; - SRC(6,1)=SRC(7,3)= (t4 + 
2*t5 + t6 + 2) >> 2; - SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1; - SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; - SRC(7,0)= (t6 + t7 + 1) >> 1; -} -static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOPLEFT; - (void) t7; - SRC(0,7)= (l6 + l7 + 1) >> 1; - SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; - SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; - SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2; - SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1; - SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2; - SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1; - SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2; - SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1; - SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2; - SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1; - SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2; - SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1; - SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2; - SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1; - SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2; - SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; - SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2; - SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2; - SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2; - SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; - SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; -} -static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_TOPRIGHT; - SRC(0,0)= (t0 + t1 + 1) >> 1; - SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; - SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1; - SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1; - 
SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1; - SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2; - SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1; - SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2; - SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1; - SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2; - SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1; - SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2; - SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1; - SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2; - SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1; - SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2; - SRC(7,6)= (t10 + t11 + 1) >> 1; - SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; -} -static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride) -{ - (void) has_topright; - PREDICT_8x8_LOAD_LEFT; - SRC(0,0)= (l0 + l1 + 1) >> 1; - SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; - SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; - SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2; - SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1; - SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2; - SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1; - SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2; - SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1; - SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2; - SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1; - SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2; - SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1; - SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; - SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= - SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)= - SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)= - SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; -} -#undef PREDICT_8x8_LOAD_LEFT -#undef 
PREDICT_8x8_LOAD_TOP -#undef PREDICT_8x8_LOAD_TOPLEFT -#undef PREDICT_8x8_LOAD_TOPRIGHT -#undef PREDICT_8x8_DC -#undef PTR -#undef PT -#undef PL -#undef SRC - -void init_pred_ptrs(H264PredContext_spu *i){ - - i->pred4x4[VERT_PRED ]= pred4x4_vertical_c; - i->pred4x4[HOR_PRED ]= pred4x4_horizontal_c; - i->pred4x4[DC_PRED ]= pred4x4_dc_c; - i->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c; - i->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c; - i->pred4x4[VERT_RIGHT_PRED ]= pred4x4_vertical_right_c; - i->pred4x4[HOR_DOWN_PRED ]= pred4x4_horizontal_down_c; - i->pred4x4[VERT_LEFT_PRED ]= pred4x4_vertical_left_c; - i->pred4x4[HOR_UP_PRED ]= pred4x4_horizontal_up_c; - i->pred4x4[LEFT_DC_PRED ]= pred4x4_left_dc_c; - i->pred4x4[TOP_DC_PRED ]= pred4x4_top_dc_c; - i->pred4x4[DC_128_PRED ]= pred4x4_128_dc_c; - - i->pred8x8l[VERT_PRED ]= pred8x8l_vertical_c; - i->pred8x8l[HOR_PRED ]= pred8x8l_horizontal_c; - i->pred8x8l[DC_PRED ]= pred8x8l_dc_c; - i->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c; - i->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c; - i->pred8x8l[VERT_RIGHT_PRED ]= pred8x8l_vertical_right_c; - i->pred8x8l[HOR_DOWN_PRED ]= pred8x8l_horizontal_down_c; - i->pred8x8l[VERT_LEFT_PRED ]= pred8x8l_vertical_left_c; - i->pred8x8l[HOR_UP_PRED ]= pred8x8l_horizontal_up_c; - i->pred8x8l[LEFT_DC_PRED ]= pred8x8l_left_dc_c; - i->pred8x8l[TOP_DC_PRED ]= pred8x8l_top_dc_c; - i->pred8x8l[DC_128_PRED ]= pred8x8l_128_dc_c; - - - i->pred8x8[VERT_PRED8x8 ]= ff_pred8x8_vertical_c; - i->pred8x8[HOR_PRED8x8 ]= ff_pred8x8_horizontal_c; - i->pred8x8[PLANE_PRED8x8 ]= ff_pred8x8_plane_c; - i->pred8x8[DC_PRED8x8 ]= ff_pred8x8_dc_c; - i->pred8x8[LEFT_DC_PRED8x8]= ff_pred8x8_left_dc_c; - i->pred8x8[TOP_DC_PRED8x8 ]= ff_pred8x8_top_dc_c; - i->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c; - - i->pred16x16[DC_PRED8x8 ]= ff_pred16x16_dc_c; - i->pred16x16[VERT_PRED8x8 ]= ff_pred16x16_vertical_c; - i->pred16x16[HOR_PRED8x8 ]= ff_pred16x16_horizontal_c; - 
i->pred16x16[PLANE_PRED8x8 ]= ff_pred16x16_plane_c; - i->pred16x16[LEFT_DC_PRED8x8]= ff_pred16x16_left_dc_c; - i->pred16x16[TOP_DC_PRED8x8 ]= ff_pred16x16_top_dc_c; - i->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c; - -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -#ifndef H264_INTRA_SPU_H -#define H264_INTRA_SPU_H - -#define MAX_NEG_CROP 1024 - -// For Intra mode -#define MB_TYPE_INTRA4x4 0x0001 -#define IS_INTRA(a) ((a)&7) -#define IS_INTRA4x4(a) ((a)&MB_TYPE_INTRA4x4) - -#define CODEC_FLAG_GRAY 0x2000 - -#define VERT_PRED 0 -#define HOR_PRED 1 -#define DC_PRED 2 -#define DIAG_DOWN_LEFT_PRED 3 -#define DIAG_DOWN_RIGHT_PRED 4 -#define VERT_RIGHT_PRED 5 -#define HOR_DOWN_PRED 6 -#define VERT_LEFT_PRED 7 -#define HOR_UP_PRED 8 - -#define LEFT_DC_PRED 9 -#define TOP_DC_PRED 10 -#define DC_128_PRED 11 - - -#define DC_PRED8x8 0 -#define HOR_PRED8x8 1 -#define VERT_PRED8x8 2 -#define PLANE_PRED8x8 3 - -#define LEFT_DC_PRED8x8 4 -#define TOP_DC_PRED8x8 5 -#define DC_128_PRED8x8 6 - -typedef struct H264PredContext_spu{ - - intra_pred4x4 pred4x4[9+3]; - intra_pred16x16 pred16x16[4+3]; - intra_pred8x8 pred8x8[4+3]; - intra_pred8x8l pred8x8l[9+3]; - -}H264PredContext_spu; - -void init_pred_ptrs(H264PredContext_spu *i); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_luma_template_spu.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_luma_template_spu.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1560 +0,0 @@ -static void PREFIX_h264_qpel16_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { - - register int i; - - const int16_t i20ss= 20; - const int16_t i5ss= 5; - const int16_t i16ss= 16; - const int16_t imax = 255; - - const vsint32_t vzero = spu_splats(0); - const vsint16_t v20ss = 
spu_splats(i20ss); - const vsint16_t v5ss = spu_splats(i5ss); - const vsint16_t v16ss = spu_splats(i16ss); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const int shift_src =(unsigned int) src & 15; - const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; - const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - - uint8_t *srcbis = src - (STRIDE_Y * 2); - - const vuint8_t srcM2a = *(vuint8_t *)(srcbis); - const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16)); - - srcbis += STRIDE_Y; - const vuint8_t srcM1a = *(vuint8_t *)(srcbis); - const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16)); - - srcbis += STRIDE_Y; - const vuint8_t srcP0a = *(vuint8_t *)(srcbis); - const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16)); - - srcbis += STRIDE_Y; - const vuint8_t srcP1a = *(vuint8_t *)(srcbis); - const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16)); - - srcbis += STRIDE_Y; - const vuint8_t srcP2a = *(vuint8_t *)(srcbis); - const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16)); - - srcbis += STRIDE_Y; - - vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); - vsint16_t srcM2ssB = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel); - 
vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); - vsint16_t srcM1ssB = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel); - vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); - vsint16_t srcP0ssB = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel); - vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); - vsint16_t srcP1ssB = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel); - vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); - vsint16_t srcP2ssB = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel); - - for (i = 0 ; i < h ; i++) { - const vuint8_t srcP3a = *(vuint8_t *)(srcbis); - const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16)); - - const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); - const vsint16_t srcP3ssB = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel); - srcbis += STRIDE_Y; - - const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA); - const vsint16_t sum1B = spu_add(srcP0ssB, srcP1ssB); - const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA); - const vsint16_t sum2B = spu_add(srcM1ssB, srcP2ssB); - const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA); - const vsint16_t sum3B = spu_add(srcM2ssB, srcP3ssB); - - srcM2ssA = srcM1ssA; - srcM2ssB = srcM1ssB; - srcM1ssA = srcP0ssA; - srcM1ssB = srcP0ssB; - srcP0ssA = srcP1ssA; - srcP0ssB = srcP1ssB; - srcP1ssA = srcP2ssA; - srcP1ssB = srcP2ssB; - srcP2ssA = srcP3ssA; - srcP2ssB = srcP3ssB; - - const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); - const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); - const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); - const vsint16_t pp1A = spu_add(pp1A3, v16ss); - - const vsint32_t pp1B1 = spu_mule(sum1B, v20ss); - const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss); - const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez); - const vsint16_t pp1B = spu_add(pp1B3, 
v16ss); - - const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); - const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); - const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); - - const vsint32_t pp2B1 = spu_mule(sum2B, v5ss); - const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss); - const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez); - - const vsint16_t pp3A = spu_add(sum3A, pp1A); - const vsint16_t pp3B = spu_add(sum3B, pp1B); - - const vsint16_t psumA = spu_sub(pp3A, pp2A); - const vsint16_t psumB = spu_sub(pp3B, pp2B); - - vsint16_t sumA = spu_rlmask(psumA, -5); - vsint16_t sumB = spu_rlmask(psumB, -5); - - //Saturation to 0 and 255 - sat = spu_cmpgt(sumA,(vsint16_t)vzero); - sumA = spu_and(sumA,(vsint16_t)sat); - sat = spu_cmpgt(sumA,vmax); - sumA = spu_sel(sumA,vmax,sat); - sat = spu_cmpgt(sumB,(vsint16_t)vzero); - sumB = spu_and(sumB,(vsint16_t)sat); - sat = spu_cmpgt(sumB,vmax); - sumB = spu_sel(sumB,vmax,sat); - - const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu); - - /* 16x16 dest luma blocks are alway aligned */ - const vuint8_t vdst = *(vuint8_t *)dst; - - vuint8_t fsum; - OP_U8_SPU(fsum, sum, vdst); - - *(vuint8_t *)dst=fsum; - - dst += dstStride; /* stride is multiple of 16 ,so dstperm and dstmask can remain out of the loop */ - } -} - -static void PREFIX_h264_qpel16_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { - - register int i; - - const int16_t i20ss = 20; - const int16_t i5ss = 5; - const int16_t i16ss = 16; - const int16_t imax = 255; - - const vsint32_t vzero = spu_splats(0); - const vsint16_t v20ss = spu_splats(i20ss); - const vsint16_t v5ss = spu_splats(i5ss); - const vsint16_t v16ss = spu_splats(i16ss); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; - const vuint8_t mergel = 
{0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - - const int permM2 = (unsigned int) (src-2) & 15; - const int permM1 = (unsigned int) (src-1) & 15; - const int permP0 = (unsigned int) (src) & 15; - const int permP1 = (unsigned int) (src+1) & 15; - const int permP2 = (unsigned int) (src+2) & 15; - const int permP3 = (unsigned int) (src+3) & 15; - - register int align = ((((unsigned long)src) - 2) % 16); - - for (i = 0 ; i < h ; i ++) { - vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vuint8_t srcR1 = *(vuint8_t *)(src-2); - vuint8_t srcR2 = *(vuint8_t *)(src+14); - - switch (align) { - default: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); - } break; - case 11: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = srcR2; - } break; - case 12: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = 
spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = srcR2; - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 13: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = srcR2; - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 14: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = srcR2; - srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 15: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = srcR2; - srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - } - - const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); - const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel); - 
const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); - const vsint16_t srcP1B = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel); - - const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); - const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel); - const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); - const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel); - - const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); - const vsint16_t srcM2B = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel); - const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); - const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel); - - const vsint16_t sum1A = spu_add(srcP0A, srcP1A); - const vsint16_t sum1B = spu_add(srcP0B, srcP1B); - const vsint16_t sum2A = spu_add(srcM1A, srcP2A); - const vsint16_t sum2B = spu_add(srcM1B, srcP2B); - const vsint16_t sum3A = spu_add(srcM2A, srcP3A); - const vsint16_t sum3B = spu_add(srcM2B, srcP3B); - - const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); - const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); - const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); - const vsint16_t pp1A = spu_add(pp1A3, v16ss); - - const vsint32_t pp1B1 = spu_mule(sum1B, v20ss); - const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss); - const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez); - const vsint16_t pp1B = spu_add(pp1B3, v16ss); - - const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); - const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); - const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); - - const vsint32_t pp2B1 = spu_mule(sum2B, v5ss); - const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss); - const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez); - - const vsint16_t pp3A = spu_add(sum3A, pp1A); - const vsint16_t pp3B = spu_add(sum3B, pp1B); - 
- const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A); - const vsint16_t psumB = spu_sub(pp3B, (vsint16_t)pp2B); - - vsint16_t sumA = spu_rlmask(psumA, -5); - vsint16_t sumB = spu_rlmask(psumB, -5); - - //Saturation to 0 and 255 - sat = spu_cmpgt(sumA,(vsint16_t)vzero); - sumA = spu_and(sumA,(vsint16_t)sat); - sat = spu_cmpgt(sumA,vmax); - sumA = spu_sel(sumA,vmax,sat); - sat = spu_cmpgt(sumB,(vsint16_t)vzero); - sumB = spu_and(sumB,(vsint16_t)sat); - sat = spu_cmpgt(sumB,vmax); - sumB = spu_sel(sumB,vmax,sat); - - const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu); - - /* 16x16 dest luma blocks are alway aligned */ - const vuint8_t vdst = *(vuint8_t *)dst; - - vuint8_t fsum; - OP_U8_SPU(fsum, sum, vdst); - - *(vuint8_t *)dst=fsum; - - src += STRIDE_Y; - dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ - } -} - -/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ -static void PREFIX_h264_qpel16_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) { - register int i; - - const int16_t i20ss = 20; - const int16_t i5ss = 5; - const int16_t imax = 255; - - const vsint32_t vzero = spu_splats(0); - const vsint16_t v20ss = spu_splats(i20ss); - const vsint16_t v5ss = spu_splats(i5ss); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; - const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - - const int permM2 = (unsigned int) (src-2) & 15; - const int permM1 = (unsigned int) (src-1) & 15; - const int permP0 = (unsigned int) (src) & 15; - const int permP1 = 
(unsigned int) (src+1) & 15; - const int permP2 = (unsigned int) (src+2) & 15; - const int permP3 = (unsigned int) (src+3) & 15; - - register int align = ((((unsigned long)src) - 2) % 16); - - src -= (2 * STRIDE_Y); - - for (i = 0 ; i < (h+5) ; i ++) { - vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vuint8_t srcR1 = *(vuint8_t *)(src-2); - vuint8_t srcR2 = *(vuint8_t *)(src+14); - - switch (align) { - default: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); - } break; - case 11: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = srcR2; - } break; - case 12: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = srcR2; - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 13: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), 
spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = srcR2; - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 14: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = srcR2; - srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 15: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = srcR2; - srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - } - - const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); - const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel); - const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); - const vsint16_t srcP1B = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel); - - const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); - const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel); - const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); - const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel); - - const 
vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); - const vsint16_t srcM2B = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel); - const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); - const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel); - - const vsint16_t sum1A = spu_add(srcP0A, srcP1A); - const vsint16_t sum1B = spu_add(srcP0B, srcP1B); - const vsint16_t sum2A = spu_add(srcM1A, srcP2A); - const vsint16_t sum2B = spu_add(srcM1B, srcP2B); - const vsint16_t sum3A = spu_add(srcM2A, srcP3A); - const vsint16_t sum3B = spu_add(srcM2B, srcP3B); - - const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); - const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); - const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); - const vsint16_t pp1A = spu_add(pp1A3, sum3A); - - const vsint32_t pp1B1 = spu_mule(sum1B, v20ss); - const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss); - const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez); - const vsint16_t pp1B = spu_add(pp1B3, sum3B); - - const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); - const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); - const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); - - const vsint32_t pp2B1 = spu_mule(sum2B, v5ss); - const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss); - const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez); - - const vsint16_t psumA = spu_sub(pp1A, pp2A); - const vsint16_t psumB = spu_sub(pp1B, pp2B); - - *(vsint16_t *)tmp = psumA; - *(vsint16_t *)(tmp+8) = psumB; - - src += STRIDE_Y; - tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ - } - - const int32_t ni10si = -10; - const int16_t i1ss = 1; - const int32_t i512si = 512; - const int32_t ni16si = -16; - - const vsint32_t nv10si = spu_splats(ni10si); - const vsint16_t v1ss = spu_splats(i1ss); - const vsint32_t v512si = spu_splats(i512si); - const vsint32_t nv16si = 
spu_splats(ni16si); - - const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F}; - const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F}; - - int16_t *tmpbis = tmp - (tmpStride * (h+5)); - - vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis); - vsint16_t tmpM2ssB = *(vsint16_t *)(tmpbis+8); - tmpbis += tmpStride; - vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis); - vsint16_t tmpM1ssB = *(vsint16_t *)(tmpbis+8); - tmpbis += tmpStride; - vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis); - vsint16_t tmpP0ssB = *(vsint16_t *)(tmpbis+8); - tmpbis += tmpStride; - vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis); - vsint16_t tmpP1ssB = *(vsint16_t *)(tmpbis+8); - tmpbis += tmpStride; - vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis); - vsint16_t tmpP2ssB = *(vsint16_t *)(tmpbis+8); - tmpbis += tmpStride; - - for (i = 0 ; i < h ; i++) { - const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis); - const vsint16_t tmpP3ssB = *(vsint16_t *)(tmpbis+8); - tmpbis += tmpStride; - - const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA); - const vsint16_t sum1B = spu_add(tmpP0ssB, tmpP1ssB); - const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA); - const vsint16_t sum2B = spu_add(tmpM1ssB, tmpP2ssB); - const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA); - const vsint16_t sum3B = spu_add(tmpM2ssB, tmpP3ssB); - - tmpM2ssA = tmpM1ssA; - tmpM2ssB = tmpM1ssB; - tmpM1ssA = tmpP0ssA; - tmpM1ssB = tmpP0ssB; - tmpP0ssA = tmpP1ssA; - tmpP0ssB = tmpP1ssB; - tmpP1ssA = tmpP2ssA; - tmpP1ssB = tmpP2ssB; - tmpP2ssA = tmpP3ssA; - tmpP2ssB = tmpP3ssB; - - const vsint32_t pp1Ae = spu_mule(sum1A, v20ss); - const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss); - const vsint32_t pp1Be = spu_mule(sum1B, v20ss); - const vsint32_t pp1Bo = spu_mulo(sum1B, v20ss); - - const vsint32_t pp2Ae = spu_mule(sum2A, v5ss); - const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss); - const vsint32_t pp2Be = spu_mule(sum2B, v5ss); - const vsint32_t pp2Bo = 
spu_mulo(sum2B, v5ss); - - const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si); - const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss); - const vsint32_t pp3Be = spu_rlmask((vsint32_t)sum3B, nv16si); - const vsint32_t pp3Bo = spu_mulo(sum3B, v1ss); - - const vsint32_t pp1cAe = spu_add(pp1Ae, v512si); - const vsint32_t pp1cAo = spu_add(pp1Ao, v512si); - const vsint32_t pp1cBe = spu_add(pp1Be, v512si); - const vsint32_t pp1cBo = spu_add(pp1Bo, v512si); - - const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae); - const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao); - const vsint32_t pp32Be = spu_sub(pp3Be, pp2Be); - const vsint32_t pp32Bo = spu_sub(pp3Bo, pp2Bo); - - const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae); - const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao); - const vsint32_t sumBe = spu_add(pp1cBe, pp32Be); - const vsint32_t sumBo = spu_add(pp1cBo, pp32Bo); - - const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si); - const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si); - const vsint32_t ssumBe = spu_rlmask(sumBe, nv10si); - const vsint32_t ssumBo = spu_rlmask(sumBo, nv10si); - - vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, ssumBe, packs); - vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, ssumBo, packs); - - //Saturation to 0 and 255 - sat = spu_cmpgt(ssume,(vsint16_t)vzero); - ssume = spu_and(ssume,(vsint16_t)sat); - sat = spu_cmpgt(ssume,vmax); - ssume = spu_sel(ssume,vmax,sat); - sat = spu_cmpgt(ssumo,(vsint16_t)vzero); - ssumo = spu_and(ssumo,(vsint16_t)sat); - sat = spu_cmpgt(ssumo,vmax); - ssumo = spu_sel(ssumo,vmax,sat); - - const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu); - - const vuint8_t sum = spu_shuffle(sumv, sumv, mperm); - - /* 16x16 dest luma blocks are alway aligned */ - const vuint8_t vdst = *(vuint8_t *)dst; - - vuint8_t fsum; - OP_U8_SPU(fsum, sum, vdst); - - *(vuint8_t *)dst=fsum; - - dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ - - } -} - -static void 
PREFIX_h264_qpel8_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { - - register int i; - - const int16_t i20ss= 20; - const int16_t i5ss= 5; - const int16_t i16ss= 16; - const int16_t imax = 255; - - const vsint32_t vzero = spu_splats(0); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const vsint16_t v20ss = spu_splats(i20ss); - const vsint16_t v5ss = spu_splats(i5ss); - const vsint16_t v16ss = spu_splats(i16ss); - const int shift_src = (unsigned int) src & 15; - - const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - - /* 8x8 dest luma blocks are aligned or desaligned by 8*/ - const int shift_dst = (unsigned int) dst & 15; - vuint8_t dstmask; - const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; - - if(shift_dst==0){ - dstmask = dst8mask1; - } - else{ - dstmask = dst8mask2; - } - - uint8_t *srcbis = src - (STRIDE_Y * 2); - - const vuint8_t srcM2a = *(vuint8_t *)(srcbis); - const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16)); - - srcbis += STRIDE_Y; - const vuint8_t srcM1a = *(vuint8_t *)(srcbis); - const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16)); - - srcbis += STRIDE_Y; - const vuint8_t srcP0a = *(vuint8_t *)(srcbis); - const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16)); - - srcbis 
+= STRIDE_Y; - const vuint8_t srcP1a = *(vuint8_t *)(srcbis); - const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16)); - - srcbis += STRIDE_Y; - const vuint8_t srcP2a = *(vuint8_t *)(srcbis); - const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16)); - - srcbis += STRIDE_Y; - - vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); - vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); - vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); - vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); - vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); - - for (i = 0 ; i < h ; i++) { - const vuint8_t srcP3a = *(vuint8_t *)(srcbis); - const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16)); - - const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); - srcbis += STRIDE_Y; - - const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA); - const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA); - const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA); - - srcM2ssA = srcM1ssA; - srcM1ssA = srcP0ssA; - srcP0ssA = srcP1ssA; - srcP1ssA = srcP2ssA; - srcP2ssA = srcP3ssA; - - const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); - const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); - const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); - const vsint16_t pp1A = spu_add(pp1A3, v16ss); - - const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); - const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); - const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); - - const vsint16_t pp3A = spu_add(sum3A, pp1A); - const vsint16_t psumA = spu_sub(pp3A, pp2A); - vsint16_t sumA = 
spu_rlmask(psumA, -5); - - //Saturation to 0 and 255 - sat = spu_cmpgt(sumA,(vsint16_t)vzero); - sumA = spu_and(sumA,(vsint16_t)sat); - sat = spu_cmpgt(sumA,vmax); - sumA = spu_sel(sumA,vmax,sat); - - const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu); - - const vuint8_t dst1 = *(vuint8_t *)dst; - - const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); - vuint8_t fsum; - OP_U8_SPU(fsum, dsum, dst1); - - *(vuint8_t *)dst=fsum; - - dst += dstStride; - } -} - -static void PREFIX_h264_qpel8_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { - - register int i; - - const int16_t i20ss = 20; - const int16_t i5ss = 5; - const int16_t i16ss = 16; - const int16_t imax = 255; - - const vsint32_t vzero = spu_splats(0); - const vsint16_t v20ss = spu_splats(i20ss); - const vsint16_t v5ss = spu_splats(i5ss); - const vsint16_t v16ss = spu_splats(i16ss); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - - /* 8x8 dest luma blocks are aligned or desaligned by 8*/ - const int shift_dst = (unsigned int) dst & 15; - vuint8_t dstmask; - const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; - - if(shift_dst==0){ - dstmask = dst8mask1; - } - else{ - dstmask = dst8mask2; - } - - const int permM2 = (unsigned int) (src-2) & 15; - const int permM1 = (unsigned int) (src-1) & 15; - const int permP0 = (unsigned int) (src) & 15; - const int permP1 = (unsigned int) (src+1) & 15; - const int permP2 = (unsigned int) (src+2) & 15; - const int permP3 = 
(unsigned int) (src+3) & 15; - - register int align = ((((unsigned long)src) - 2) % 16); - - for (i = 0 ; i < h ; i ++) { - vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vuint8_t srcR1 = *(vuint8_t *)(src-2); - vuint8_t srcR2 = *(vuint8_t *)(src+14); - - switch (align) { - default: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); - } break; - case 11: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = srcR2; - } break; - case 12: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = srcR2; - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 13: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = 
spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = srcR2; - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 14: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = srcR2; - srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 15: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = srcR2; - srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - } - - const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); - const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); - - const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); - const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); - - const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); - const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); - - const vsint16_t sum1A = spu_add(srcP0A, srcP1A); - const vsint16_t sum2A = spu_add(srcM1A, srcP2A); - const vsint16_t sum3A = spu_add(srcM2A, srcP3A); - - const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); - const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); - const vsint16_t pp1A3 = 
(vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); - const vsint16_t pp1A = spu_add(pp1A3, v16ss); - - const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); - const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); - const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); - - const vsint16_t pp3A = spu_add(sum3A, pp1A); - - const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A); - - vsint16_t sumA = spu_rlmask(psumA, -5); - - //Saturation to 0 and 255 - sat = spu_cmpgt(sumA,(vsint16_t)vzero); - sumA = spu_and(sumA,(vsint16_t)sat); - sat = spu_cmpgt(sumA,vmax); - sumA = spu_sel(sumA,vmax,sat); - - const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu); - - const vuint8_t dst1 = *(vuint8_t *)dst; - - const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); - vuint8_t fsum; - OP_U8_SPU(fsum, dsum, dst1); - - *(vuint8_t *)dst=fsum; - - src += STRIDE_Y; - dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ - } -} - -/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ -static void PREFIX_h264_qpel8_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) { - register int i; - - const int16_t i20ss = 20; - const int16_t i5ss = 5; - const int16_t imax = 255; - - const vsint32_t vzero = spu_splats(0); - const vsint16_t v20ss = spu_splats(i20ss); - const vsint16_t v5ss = spu_splats(i5ss); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - - const int permM2 = (unsigned int) (src-2) & 15; - const int permM1 = (unsigned int) (src-1) & 15; - const int permP0 = (unsigned int) (src) & 15; 
- const int permP1 = (unsigned int) (src+1) & 15; - const int permP2 = (unsigned int) (src+2) & 15; - const int permP3 = (unsigned int) (src+3) & 15; - - register int align = ((((unsigned long)src) - 2) % 16); - - src -= (2 * STRIDE_Y); - - for (i = 0 ; i < (h+5) ; i ++) { - vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vuint8_t srcR1 = *(vuint8_t *)(src-2); - vuint8_t srcR2 = *(vuint8_t *)(src+14); - - switch (align) { - default: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); - } break; - case 11: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = srcR2; - } break; - case 12: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = srcR2; - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 13: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, 
permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = srcR2; - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 14: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = srcR2; - srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 15: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = srcR2; - srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - } - - const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh); - const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh); - const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh); - const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh); - const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh); - const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh); - - const vsint16_t sum1A = spu_add(srcP0A, srcP1A); - const vsint16_t sum2A = 
spu_add(srcM1A, srcP2A); - const vsint16_t sum3A = spu_add(srcM2A, srcP3A); - - const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); - const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); - const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); - const vsint16_t pp1A = spu_add(pp1A3, sum3A); - - const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); - const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); - const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); - - const vsint16_t psumA = spu_sub(pp1A, pp2A); - - *(vsint16_t *)tmp = psumA; - - src += STRIDE_Y; - tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ - } - - const int32_t ni10si = -10; - const int16_t i1ss = 1; - const int32_t i512si = 512; - const int32_t ni16si = -16; - - const vsint32_t nv10si = spu_splats(ni10si); - const vsint16_t v1ss = spu_splats(i1ss); - const vsint32_t v512si = spu_splats(i512si); - const vsint32_t nv16si = spu_splats(ni16si); - - const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F}; - const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F}; - - const int shift_dst = (unsigned int) (dst) & 15; - /* 8x8 dest luma blocks are aligned or desaligned by 8*/ - vuint8_t dstmask; - const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; - - if(shift_dst==0){ - dstmask = dst8mask1; - } - else{ - dstmask = dst8mask2; - } - - int16_t *tmpbis = tmp - (tmpStride * (h+5)); - - vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - vsint16_t tmpP2ssA = 
*(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - - for (i = 0 ; i < h ; i++) { - const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - - const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA); - const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA); - const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA); - - tmpM2ssA = tmpM1ssA; - tmpM1ssA = tmpP0ssA; - tmpP0ssA = tmpP1ssA; - tmpP1ssA = tmpP2ssA; - tmpP2ssA = tmpP3ssA; - - const vsint32_t pp1Ae = spu_mule(sum1A, v20ss); - const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss); - const vsint32_t pp2Ae = spu_mule(sum2A, v5ss); - const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss); - - const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si); - const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss); - - const vsint32_t pp1cAe = spu_add(pp1Ae, v512si); - const vsint32_t pp1cAo = spu_add(pp1Ao, v512si); - - const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae); - const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao); - - const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae); - const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao); - - const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si); - const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si); - - vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs); - vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs); - - //Saturation to 0 and 255 - sat = spu_cmpgt(ssume,(vsint16_t)vzero); - ssume = spu_and(ssume,(vsint16_t)sat); - sat = spu_cmpgt(ssume,vmax); - ssume = spu_sel(ssume,vmax,sat); - sat = spu_cmpgt(ssumo,(vsint16_t)vzero); - ssumo = spu_and(ssumo,(vsint16_t)sat); - sat = spu_cmpgt(ssumo,vmax); - ssumo = spu_sel(ssumo,vmax,sat); - - const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu); - - const vuint8_t sum = spu_shuffle(sumv, sumv, mperm); - - const vuint8_t dst1 = *(vuint8_t *)dst; - - const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); - vuint8_t fsum; - OP_U8_SPU(fsum, dsum, dst1); - - *(vuint8_t *)dst=fsum; - - dst += dstStride; /* stride is multiple of 16 so 
dstperm and dstmask can remain out of the loop */ - - } -} - -static void PREFIX_h264_qpel4_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { - - register int i; - - const int16_t i20ss= 20; - const int16_t i5ss= 5; - const int16_t i16ss= 16; - const int16_t imax = 255; - - const vsint32_t vzero = spu_splats(0); - const vsint16_t v20ss = spu_splats(i20ss); - const vsint16_t v5ss = spu_splats(i5ss); - const vsint16_t v16ss = spu_splats(i16ss); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const int shift_src = (unsigned int) src & 15; - - const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - - /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/ - const int shift_dst = (unsigned int) dst & 15; - vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; - - switch(shift_dst){ - case 0: dstmask = dst4mask0; - break; - case 4: dstmask = dst4mask4; - break; - case 8: dstmask = dst4mask8; - break; - case 12: dstmask = dst4mask12; - break; - } - - uint8_t *srcbis = src - (STRIDE_Y * 2); - - const vuint8_t srcM2a = *(vuint8_t *)(srcbis); - const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16)); - - srcbis 
+= STRIDE_Y; - const vuint8_t srcM1a = *(vuint8_t *)(srcbis); - const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16)); - - srcbis += STRIDE_Y; - const vuint8_t srcP0a = *(vuint8_t *)(srcbis); - const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16)); - - srcbis += STRIDE_Y; - const vuint8_t srcP1a = *(vuint8_t *)(srcbis); - const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16)); - - srcbis += STRIDE_Y; - const vuint8_t srcP2a = *(vuint8_t *)(srcbis); - const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16)); - - srcbis += STRIDE_Y; - - vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); - vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); - vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); - vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); - vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); - - for (i = 0 ; i < h ; i++) { - const vuint8_t srcP3a = *(vuint8_t *)(srcbis); - const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16); - const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16)); - - const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); - srcbis += STRIDE_Y; - - const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA); - const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA); - const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA); - - srcM2ssA = srcM1ssA; - srcM1ssA = srcP0ssA; - srcP0ssA = srcP1ssA; - srcP1ssA = srcP2ssA; - srcP2ssA = srcP3ssA; - - const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); - const vsint32_t pp1A2 = spu_mulo(sum1A, 
v20ss); - const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); - const vsint16_t pp1A = spu_add(pp1A3, v16ss); - - const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); - const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); - const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); - - const vsint16_t pp3A = spu_add(sum3A, pp1A); - const vsint16_t psumA = spu_sub(pp3A, pp2A); - vsint16_t sumA = spu_rlmask(psumA, -5); - - //Saturation to 0 and 255 - sat = spu_cmpgt(sumA,(vsint16_t)vzero); - sumA = spu_and(sumA,(vsint16_t)sat); - sat = spu_cmpgt(sumA,vmax); - sumA = spu_sel(sumA,vmax,sat); - - const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu); - - const vuint8_t dst1 = *(vuint8_t *)dst; - - const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); - vuint8_t fsum; - OP_U8_SPU(fsum, dsum, dst1); - - *(vuint8_t *)dst=fsum; - - dst += dstStride; - } -} - -static void PREFIX_h264_qpel4_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { - - register int i; - - const int16_t i20ss = 20; - const int16_t i5ss = 5; - const int16_t i16ss = 16; - const int16_t imax = 255; - - const vsint32_t vzero = spu_splats(0); - const vsint16_t v20ss = spu_splats(i20ss); - const vsint16_t v5ss = spu_splats(i5ss); - const vsint16_t v16ss = spu_splats(i16ss); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - - /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/ - const int shift_dst = (unsigned int) dst & 15; - vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - const vuint8_t dst4mask0= 
{0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; - - switch(shift_dst){ - case 0: dstmask = dst4mask0; - break; - case 4: dstmask = dst4mask4; - break; - case 8: dstmask = dst4mask8; - break; - case 12: dstmask = dst4mask12; - break; - } - - const int permM2 = (unsigned int) (src-2) & 15; - const int permM1 = (unsigned int) (src-1) & 15; - const int permP0 = (unsigned int) (src) & 15; - const int permP1 = (unsigned int) (src+1) & 15; - const int permP2 = (unsigned int) (src+2) & 15; - const int permP3 = (unsigned int) (src+3) & 15; - - register int align = ((((unsigned long)src) - 2) % 16); - - for (i = 0 ; i < h ; i ++) { - vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vuint8_t srcR1 = *(vuint8_t *)(src-2); - vuint8_t srcR2 = *(vuint8_t *)(src+14); - - switch (align) { - default: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); - } break; - case 11: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), 
spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = srcR2; - } break; - case 12: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = srcR2; - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 13: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = srcR2; - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 14: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = srcR2; - srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 15: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = srcR2; - srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, 
permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - } - - const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); - const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); - - const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); - const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); - - const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); - const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); - - const vsint16_t sum1A = spu_add(srcP0A, srcP1A); - const vsint16_t sum2A = spu_add(srcM1A, srcP2A); - const vsint16_t sum3A = spu_add(srcM2A, srcP3A); - - const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); - const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); - const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); - const vsint16_t pp1A = spu_add(pp1A3, v16ss); - - const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); - const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); - const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); - - const vsint16_t pp3A = spu_add(sum3A, pp1A); - - const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A); - - vsint16_t sumA = spu_rlmask(psumA, -5); - - //Saturation to 0 and 255 - sat = spu_cmpgt(sumA,(vsint16_t)vzero); - sumA = spu_and(sumA,(vsint16_t)sat); - sat = spu_cmpgt(sumA,vmax); - sumA = spu_sel(sumA,vmax,sat); - - const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu); - - const vuint8_t dst1 = *(vuint8_t *)dst; - - const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); - vuint8_t fsum; - OP_U8_SPU(fsum, dsum, dst1); - - *(vuint8_t *)dst=fsum; - - src += STRIDE_Y; - dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ - } -} - -static void PREFIX_h264_qpel4_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int 
tmpStride, int h) { - register int i; - - const int16_t i20ss = 20; - const int16_t i5ss = 5; - const int16_t imax = 255; - - const vsint32_t vzero = spu_splats(0); - const vsint16_t v20ss = spu_splats(i20ss); - const vsint16_t v5ss = spu_splats(i5ss); - const vsint16_t vmax = (vsint16_t)spu_splats(imax); - vuint16_t sat; - - const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07}; - const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; - const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; - - const int permM2 = (unsigned int) (src-2) & 15; - const int permM1 = (unsigned int) (src-1) & 15; - const int permP0 = (unsigned int) (src) & 15; - const int permP1 = (unsigned int) (src+1) & 15; - const int permP2 = (unsigned int) (src+2) & 15; - const int permP3 = (unsigned int) (src+3) & 15; - - register int align = ((((unsigned long)src) - 2) % 16); - - src -= (2 * STRIDE_Y); - - for (i = 0 ; i < (h+5) ; i ++) { - vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vuint8_t srcR1 = *(vuint8_t *)(src-2); - vuint8_t srcR2 = *(vuint8_t *)(src+14); - - switch (align) { - default: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); - } break; - case 11: { - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), 
spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); - srcP3 = srcR2; - } break; - case 12: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); - srcP2 = srcR2; - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 13: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); - srcP1 = srcR2; - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 14: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); - srcP0 = srcR2; - srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - case 15: { - vuint8_t srcR3 = *(vuint8_t *)(src+30); - srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); - srcM1 = srcR2; - srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); - srcP1 = spu_or(spu_slqwbyte(srcR2, 
permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); - srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); - srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); - } break; - } - - const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh); - const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh); - const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh); - const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh); - const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh); - const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh); - - const vsint16_t sum1A = spu_add(srcP0A, srcP1A); - const vsint16_t sum2A = spu_add(srcM1A, srcP2A); - const vsint16_t sum3A = spu_add(srcM2A, srcP3A); - - const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); - const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); - const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); - const vsint16_t pp1A = spu_add(pp1A3, sum3A); - - const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); - const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); - const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); - - const vsint16_t psumA = spu_sub(pp1A, pp2A); - - *(vsint16_t *)tmp = psumA; - - src += STRIDE_Y; - tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ - } - - const int32_t ni10si = -10; - const int16_t i1ss = 1; - const int32_t i512si = 512; - const int32_t ni16si = -16; - - const vsint32_t nv10si = spu_splats(ni10si); - const vsint16_t v1ss = spu_splats(i1ss); - const vsint32_t v512si = spu_splats(i512si); - const vsint32_t nv16si = spu_splats(ni16si); - - const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F}; - const vuint8_t packs = 
{0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F}; - - const int shift_dst = (unsigned int) (dst) & 15; - /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/ - vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; - const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; - - switch(shift_dst){ - case 0: dstmask = dst4mask0; - break; - case 4: dstmask = dst4mask4; - break; - case 8: dstmask = dst4mask8; - break; - case 12: dstmask = dst4mask12; - break; - } - - int16_t *tmpbis = tmp - (tmpStride * (h+5)); - - vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - - for (i = 0 ; i < h ; i++) { - const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis); - tmpbis += tmpStride; - - const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA); - const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA); - const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA); - - tmpM2ssA = tmpM1ssA; - tmpM1ssA = tmpP0ssA; - tmpP0ssA = tmpP1ssA; - tmpP1ssA = tmpP2ssA; - tmpP2ssA = tmpP3ssA; - - const vsint32_t pp1Ae = spu_mule(sum1A, v20ss); - const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss); - const vsint32_t pp2Ae = spu_mule(sum2A, v5ss); - const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss); - - const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si); - const vsint32_t pp3Ao = 
spu_mulo(sum3A, v1ss); - - const vsint32_t pp1cAe = spu_add(pp1Ae, v512si); - const vsint32_t pp1cAo = spu_add(pp1Ao, v512si); - - const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae); - const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao); - - const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae); - const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao); - - const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si); - const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si); - - vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs); - vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs); - - //Saturation to 0 and 255 - sat = spu_cmpgt(ssume,(vsint16_t)vzero); - ssume = spu_and(ssume,(vsint16_t)sat); - sat = spu_cmpgt(ssume,vmax); - ssume = spu_sel(ssume,vmax,sat); - sat = spu_cmpgt(ssumo,(vsint16_t)vzero); - ssumo = spu_and(ssumo,(vsint16_t)sat); - sat = spu_cmpgt(ssumo,vmax); - ssumo = spu_sel(ssumo,vmax,sat); - - const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu); - - const vuint8_t sum = spu_shuffle(sumv, sumv, mperm); - - const vuint8_t dst1 = *(vuint8_t *)dst; - - const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); - vuint8_t fsum; - OP_U8_SPU(fsum, dsum, dst1); - - *(vuint8_t *)dst=fsum; - - dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ - - } -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,362 +0,0 @@ -/* - * Copyright (c) 2009 TUDelft - * - * Cell Parallel SPU - 2DWave Macroblock Decoding. 
- */ - -/** - * @file libavcodec/cell/spu/h264_main_spu.c - * Cell Parallel SPU - 2DWave Macroblock Decoding - * @author C C Chi - * - * SIMD kernels - * H.264/AVC motion compensation - * @author Mauricio Alvarez - * @author Albert Paradis - */ - - -#include -#include -#include -#include - -#include "h264_mc_spu.h" -#include "h264_dma.h" -#include "h264_tables.h" -#include "h264_decode_mb_spu.h" - - -//biweight buffer -DECLARE_ALIGNED_16(uint8_t, tmp_y_ls[48*16]); -DECLARE_ALIGNED_16(uint8_t, tmp_cb_ls[32*8]); -DECLARE_ALIGNED_16(uint8_t, tmp_cr_ls[32*8]); - -//ref buffer (double buffered) -DECLARE_ALIGNED_16(uint8_t, mc_ref[2][16*(4+5)*48 + 2*16*(2+1)*32]); -uint8_t* ref_ptr; - -/** Motion Compensation functions*/ - -static void fill_mc_part(H264mc *mc, int n, int chroma_height, int x_offset, int y_offset, int itp, int weight, int list0, int list1){ - H264mc_part *mc_part = mc->mc_part + mc->npart; - mc_part->n =n; - mc_part->chroma_height =chroma_height; - mc_part->x_offset = x_offset; - mc_part->y_offset = y_offset; - mc_part->itp = itp; - mc_part->weight = weight; - mc_part->list0 = list0; - mc_part->list1 = list1; - - mc->npart++; -} - -void calc_mc_params(H264Mb* mb, H264mc *mc){ - int mb_type = mb->mb_type; - mc->npart=0; - - assert(!IS_INTRA(mb_type)); - if(IS_16X16(mb_type)){ - fill_mc_part(mc, 0, 8, 0, 0, 0, 0, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); - }else if(IS_16X8(mb_type)){ - fill_mc_part(mc, 0, 4, 0, 0, 0, 0, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); - fill_mc_part(mc, 8, 4, 0, 4, 0, 1, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); - }else if(IS_8X16(mb_type)){ - fill_mc_part(mc, 0, 8, 0, 0, 1, 2, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); - fill_mc_part(mc, 4, 8, 4, 0, 1, 2, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); - }else{ - int i; - assert(IS_8X8(mb_type)); - - for(i=0; i<4; i++){ - const int sub_mb_type= mb->sub_mb_type[i]; - const int n= 4*i; - int x_offset= (i&1)<<2; - int y_offset= (i&2)<<1; - - 
if(IS_SUB_8X8(sub_mb_type)){ - fill_mc_part(mc, n, 4, x_offset, y_offset, 1, 3, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - }else if(IS_SUB_8X4(sub_mb_type)){ - fill_mc_part(mc, n, 2, x_offset, y_offset, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - fill_mc_part(mc, n+2, 2, x_offset, y_offset+2, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - }else if(IS_SUB_4X8(sub_mb_type)){ - fill_mc_part(mc, n, 4, x_offset, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - fill_mc_part(mc, n+1, 4, x_offset+2, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - }else{ - int j; - assert(IS_SUB_4X4(sub_mb_type)); - for(j=0; j<4; j++){ - int sub_x_offset= x_offset + 2*(j&1); - int sub_y_offset= y_offset + (j&2); - fill_mc_part(mc, n+j, 2, sub_x_offset, sub_y_offset, 2, 6, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - } - } - } - } -} - -/** -* Returns a pointer to mc_buf -*/ -static void* alloc_mc_buf(int size){ - void* ptr = ref_ptr; - ref_ptr += size; - return ptr; -} - -#define TAG_OFFSET_MC MBD_mc_buf1 -static uint8_t* get_mc_data(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){ - assert(src_ea); - int unalign; - unsigned address_align; - - uint8_t* ea; - uint8_t* ref_ptr = alloc_mc_buf(blk_h*stride); - - ea = src_ea + pic_xoffset + pic_yoffset*linesize; - address_align = ((unsigned) ea) & 0xFFFFFFF0; - unalign = ((unsigned) ea) & 0xF; - get_dma_list(ref_ptr, (void *)address_align, stride, blk_h, linesize, idx + TAG_OFFSET_MC, 0); - return (ref_ptr + unalign); -} - -static uint8_t* get_mc_data_blocking(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){ - assert(src_ea); - int unalign; - unsigned address_align; - - uint8_t* ea; - uint8_t* ref_ptr = alloc_mc_buf(blk_h*stride); - - ea = src_ea + pic_xoffset + pic_yoffset*linesize; - address_align = ((unsigned) ea) & 0xFFFFFFF0; - unalign 
= ((unsigned) ea) & 0xF; - get_dma_list(ref_ptr, (void *)address_align, stride, blk_h, linesize, MBD_mc_buf1, 0); - wait_dma_id(MBD_mc_buf1); - return (ref_ptr + unalign); -} - -//#undef TAG_OFFSET_MC - -static void get_mc_components(H264Context_spu *h, H264Mb *mb, H264mc_part* mc_part, Picture_spu *pic, int n, int chroma_height, int list, int src_x_offset, int src_y_offset, int idx){ - assert(pic); - H264slice *s = h->s; - ref_data *ref = &mc_part->ref[list]; - const int mx= mb->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; - const int my= mb->mv_cache[list][ scan8[n] ][1] + src_y_offset*8; - - const int pic_width = 16*s->mb_width; - const int pic_height = 16*s->mb_height; - - int blk_h= chroma_height*2+5; - //int blk_w= 8*2+5; - - int blk_h_c= chroma_height+1; - //int blk_w_c= 9; - - int ymx= mx>>2; - int ymy= my>>2; - int cmy= my>>3; - int cmx= mx>>3; - - //truncate the motion vectors references - if(ymy>= pic_height+2){ - ymy=pic_height+1; - }else if(ymy <=-19){ - ymy=-18; - } - if(ymx>= pic_width+2){ - ymx= pic_width+1; - }else if(ymx<=-19){ - ymx=-19; - } - - if(cmy >= pic_height>>1){ - cmy = (pic_height>>1) -1; - }else if(cmy<=-9){ - cmy=-8; - } - if(cmx >= pic_width>>1){ - cmx = (pic_width>>1) -1; - }else if(cmx<=-9){ - cmx=-8; - } - if (!h->blocking){ - ref->data[0]=get_mc_data(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx); - ref->data[1]=get_mc_data(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); - ref->data[2]=get_mc_data(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); - } else { - ref->data[0]=get_mc_data_blocking(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx); - ref->data[1]=get_mc_data_blocking(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); - ref->data[2]=get_mc_data_blocking(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); - - } - -} - -static void get_ref_data(H264Context_spu *h, H264Mb *mb, H264mc_part *mc_part, int idx){ - H264slice *s = h->s; - int 
x_offset = mc_part->x_offset; - int y_offset = mc_part->y_offset; - int list0 = mc_part->list0; - int list1 = mc_part->list1; - int n = mc_part->n; - int chroma_height = mc_part->chroma_height; - Picture_spu *refpic; - - x_offset += 8*mb->mb_x; - y_offset += 8*mb->mb_y; - - if(list0){ - refpic= &s->ref_list[0][ mb->ref_cache[0][ scan8[n] ] ]; - get_mc_components(h, mb, mc_part, refpic, n, chroma_height, 0, x_offset, y_offset, idx); - } - if(list1){ - refpic= &s->ref_list[1][ mb->ref_cache[1][ scan8[n] ] ]; - get_mc_components(h, mb, mc_part, refpic, n, chroma_height, 1, x_offset, y_offset, idx); - } -} - -void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc){ - int idx = h->mc_idx; - int i; - - get_list = get_list_buf; - ref_ptr = mc_ref[idx]; - for(i=0; inpart; i++){ - get_ref_data(h, mb, &mc->mc_part[i], idx); - } -} - -static void mc_dir_part(H264Context_spu *h, H264mc_part* mc_part, int n, int chroma_height, int list, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, int stride_y, int stride_c){ - - H264Mb *mb = h->mb; - ref_data* ref = &mc_part->ref[list]; - const int mx= mb->mv_cache[list][ scan8[n] ][0]; //to determine the interpolation mode - const int my= mb->mv_cache[list][ scan8[n] ][1]; - const int luma_xy= (mx&3) + ((my&3)<<2); - uint8_t *src_y, *src_cb, *src_cr; - - src_y = ref->data[0] +2+2*STRIDE_Y; - src_cb = ref->data[1]; - src_cr = ref->data[2]; - - qpix_op[luma_xy](dest_y, src_y, stride_y, chroma_height*2); - chroma_op(dest_cb, src_cb, stride_c, chroma_height, mx&7, my&7); - chroma_op(dest_cr, src_cr, stride_c, chroma_height, mx&7, my&7); -} - - -static void mc_part_biweighted(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg){ - - H264Mb *mb = h->mb; - H264slice *s = h->s; - int n = mc_part->n; - int chroma_height = 
mc_part->chroma_height; - int itp = mc_part->itp; - int refn0 = mb->ref_cache[0][ scan8[n] ]; - int refn1 = mb->ref_cache[1][ scan8[n] ]; - qpel_mc_func *qpix_put= h->dsp.put_h264_qpel_pixels_tab[itp]; - h264_chroma_mc_func chroma_put= h->dsp.put_h264_chroma_pixels_tab[itp]; - - // don't optimize for luma-only case, since B-frames usually - // use implicit weights => chroma too. - mc_dir_part(h, mc_part, n, chroma_height, 0, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, stride_y, stride_c); - - mc_dir_part(h, mc_part, n, chroma_height, 1, tmp_y_ls, tmp_cb_ls, tmp_cr_ls, qpix_put, chroma_put, STRIDE_Y, STRIDE_C); - - if(s->use_weight == 2){ - int weight0 = s->implicit_weight[refn0][refn1][mb->mb_y&1]; - int weight1 = 64 - weight0; - luma_weight_avg( dest_y, tmp_y_ls, stride_y, STRIDE_Y, 5, weight0, weight1, 0); - chroma_weight_avg(dest_cb, tmp_cb_ls, stride_c, STRIDE_C, 5, weight0, weight1, 0); - chroma_weight_avg(dest_cr, tmp_cr_ls, stride_c, STRIDE_C, 5, weight0, weight1, 0); - }else{ - luma_weight_avg(dest_y, tmp_y_ls, stride_y, STRIDE_Y, s->luma_log2_weight_denom, s->luma_weight[refn0][0][0] , s->luma_weight[refn1][1][0], s->luma_weight[refn0][0][1] + s->luma_weight[refn1][1][1]); - - chroma_weight_avg(dest_cb, tmp_cb_ls, stride_c, STRIDE_C, s->chroma_log2_weight_denom, s->chroma_weight[refn0][0][0][0] , s->chroma_weight[refn1][1][0][0], s->chroma_weight[refn0][0][0][1] + s->chroma_weight[refn1][1][0][1]); - - chroma_weight_avg(dest_cr, tmp_cr_ls, stride_c, STRIDE_C, s->chroma_log2_weight_denom, s->chroma_weight[refn0][0][1][0] , s->chroma_weight[refn1][1][1][0], s->chroma_weight[refn0][0][1][1] + s->chroma_weight[refn1][1][1][1]); - } -} - -static void mc_part_weighted(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, int list1){ - - H264Mb *mb = h->mb; - H264slice *s = h->s; - - int n = mc_part->n; - int chroma_height = 
mc_part->chroma_height; - int itp = mc_part->itp; - qpel_mc_func *qpix_put= h->dsp.put_h264_qpel_pixels_tab[itp]; - h264_chroma_mc_func chroma_put= h->dsp.put_h264_chroma_pixels_tab[itp]; - - int list = list1 ? 1 : 0; - int refn = mb->ref_cache[list][ scan8[n] ]; - - mc_dir_part(h, mc_part, n, chroma_height, list, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, stride_y, stride_c); - - luma_weight_op(dest_y, stride_y, s->luma_log2_weight_denom, s->luma_weight[refn][list][0], s->luma_weight[refn][list][1]); - if(s->use_weight_chroma){ - chroma_weight_op(dest_cb, stride_c, s->chroma_log2_weight_denom, s->chroma_weight[refn][list][0][0], s->chroma_weight[refn][list][0][1]); - - chroma_weight_op(dest_cr, stride_c, s->chroma_log2_weight_denom, s->chroma_weight[refn][list][1][0], s->chroma_weight[refn][list][1][1]); - } -} - - -static void mc_part_std(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, int list0, int list1){ - int n = mc_part->n; - int chroma_height = mc_part->chroma_height; - int itp = mc_part->itp; - - qpel_mc_func *qpix_op= h->dsp.put_h264_qpel_pixels_tab[itp]; - h264_chroma_mc_func chroma_op= h->dsp.put_h264_chroma_pixels_tab[itp]; - - if(list0){ - mc_dir_part(h, mc_part, n, chroma_height, 0, dest_y, dest_cb, dest_cr, qpix_op, chroma_op, stride_y, stride_c); - - qpix_op= h->dsp.avg_h264_qpel_pixels_tab[itp]; - chroma_op= h->dsp.avg_h264_chroma_pixels_tab[itp]; - } - - if(list1){ - mc_dir_part(h, mc_part, n, chroma_height, 1, dest_y, dest_cb, dest_cr, qpix_op, chroma_op, stride_y, stride_c); - } -} - -static void mc_part(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ - H264slice *s = h->s; - - int weight = mc_part->weight; - - int x_offset = mc_part->x_offset; - int y_offset = mc_part->y_offset; - int list0 = mc_part->list0; - int list1 = mc_part->list1; - - dest_y += 2*x_offset + 2*y_offset*stride_y; - 
dest_cb += x_offset + y_offset*stride_c; - dest_cr += x_offset + y_offset*stride_c; - - if(list0 && list1 && s->use_weight !=0){ - h264_biweight_func *weight_avg = &h->dsp.biweight_h264_pixels_tab[weight]; - mc_part_biweighted(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, weight_avg[0], weight_avg[3]); - } - else if ((list0 || list1) && s->use_weight ==1){ - h264_weight_func *weight_op = &h->dsp.weight_h264_pixels_tab[weight]; - mc_part_weighted(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, weight_op[0], weight_op[3], list1); - } - else{ - mc_part_std(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, list0, list1); - } -} - -void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ - int i; - H264mc *mc =h->mc; - for(i=0; inpart; i++){ - mc_part(h, &mc->mc_part[i], dest_y, dest_cb, dest_cr, stride_y, stride_c); - } -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -#ifndef H264_MC_SPU_H -#define H264_MC_SPU_H - -//#include "types_spu.h" - -// motion compensation constants: -#define MB_TYPE_16x16 0x0008 -#define MB_TYPE_16x8 0x0010 -#define MB_TYPE_8x16 0x0020 -#define MB_TYPE_8x8 0x0040 -#define MB_TYPE_P0L0 0x1000 -#define IS_16X16(a) ((a)&MB_TYPE_16x16) -#define IS_16X8(a) ((a)&MB_TYPE_16x8) -#define IS_8X16(a) ((a)&MB_TYPE_8x16) -#define IS_8X8(a) ((a)&MB_TYPE_8x8) -#define IS_SUB_8X8(a) ((a)&MB_TYPE_16x16) //note reused -#define IS_SUB_8X4(a) ((a)&MB_TYPE_16x8) //note reused -#define IS_SUB_4X8(a) ((a)&MB_TYPE_8x16) //note reused -#define IS_SUB_4X4(a) ((a)&MB_TYPE_8x8) //note reused -#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list)))) - -#define FFMAX(a,b) ((a) > (b) ? (a) : (b)) -#define FFMIN(a,b) ((a) > (b) ? 
(b) : (a)) - -//Motion compensation buffer strides -#define STRIDE_Y 48 -#define STRIDE_C 32 - -typedef struct ref_data{ - uint8_t *data[3]; -}ref_data; - -typedef struct H264mc_part{ - int n; - int chroma_height; - int x_offset; - int y_offset; - int itp; - int weight; - int list0; - int list1; - int use_weight; - ref_data ref[2]; - -}H264mc_part; - -typedef struct H264mc{ - H264mc_part mc_part[16]; - int npart; -}H264mc; - - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_pred_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_pred_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 prediction functions. 
- * @author Michael Niedermayer - */ - -#ifndef AVCODEC_H264PRED_H -#define AVCODEC_H264PRED_H - -//#include "libavutil/common.h" -//#include "dsputil.h" - -/** - * Prediction types - */ -//@{ -#define VERT_PRED 0 -#define HOR_PRED 1 -#define DC_PRED 2 -#define DIAG_DOWN_LEFT_PRED 3 -#define DIAG_DOWN_RIGHT_PRED 4 -#define VERT_RIGHT_PRED 5 -#define HOR_DOWN_PRED 6 -#define VERT_LEFT_PRED 7 -#define HOR_UP_PRED 8 - -#define LEFT_DC_PRED 9 -#define TOP_DC_PRED 10 -#define DC_128_PRED 11 - -#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN 12 -#define HOR_UP_PRED_RV40_NODOWN 13 -#define VERT_LEFT_PRED_RV40_NODOWN 14 - -#define DC_PRED8x8 0 -#define HOR_PRED8x8 1 -#define VERT_PRED8x8 2 -#define PLANE_PRED8x8 3 - -#define LEFT_DC_PRED8x8 4 -#define TOP_DC_PRED8x8 5 -#define DC_128_PRED8x8 6 - -#define ALZHEIMER_DC_L0T_PRED8x8 7 -#define ALZHEIMER_DC_0LT_PRED8x8 8 -#define ALZHEIMER_DC_L00_PRED8x8 9 -#define ALZHEIMER_DC_0L0_PRED8x8 10 -//@} - -/** - * Context for storing H.264 prediction functions - */ -typedef struct H264PredContext{ - void (*pred4x4 [9+3+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp? 
- void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride); - void (*pred8x8 [4+3+4])(uint8_t *src, int stride); - void (*pred16x16[4+3])(uint8_t *src, int stride); - - void (*pred4x4_add [2])(uint8_t *pix/*align 4*/, const DCTELEM *block/*align 16*/, int stride); - void (*pred8x8l_add [2])(uint8_t *pix/*align 8*/, const DCTELEM *block/*align 16*/, int stride); - void (*pred8x8_add [3])(uint8_t *pix/*align 8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); - void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); -}H264PredContext; - -void ff_h264_pred_init(H264PredContext *h); -void ff_h264_pred_init_arm(H264PredContext *h); - - -#endif /* AVCODEC_H264PRED_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -#include -#include "h264_tables.h" - -uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP] = {0, }; - -int block_offset[16+4+4]; - -void ff_cropTbl_init(){ - int i; - for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; - for(i=0;i>3); - } - for(i=0; i<4; i++){ - block_offset[16+i]= - block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*uvlinesize*((scan8[i] - scan8[0])>>3); - } -} \ No newline at end of file diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -#ifndef H264_TABLES_H -#define H264_TABLES_H - -#define MAX_NEG_CROP 1024 - -extern uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP]; -extern int block_offset[16+4+4]; - -static const uint8_t scan8[16 + 2*4]={ - 4+1*8, 5+1*8, 4+2*8, 5+2*8, - 6+1*8, 7+1*8, 6+2*8, 7+2*8, - 4+3*8, 5+3*8, 4+4*8, 5+4*8, - 6+3*8, 7+3*8, 6+4*8, 7+4*8, - 1+1*8, 2+1*8, - 1+2*8, 
2+2*8, - 1+4*8, 2+4*8, - 1+5*8, 2+5*8, -}; - -static const uint8_t ff_zigzag_direct[64] = { - 0, 1, 8, 16, 9, 2, 3, 10, - 17, 24, 32, 25, 18, 11, 4, 5, - 12, 19, 26, 33, 40, 48, 41, 34, - 27, 20, 13, 6, 7, 14, 21, 28, - 35, 42, 49, 56, 57, 50, 43, 36, - 29, 22, 15, 23, 30, 37, 44, 51, - 58, 59, 52, 45, 38, 31, 39, 46, - 53, 60, 61, 54, 47, 55, 62, 63 -}; - -static const uint8_t zigzag_scan[16]={ - 0+0*4, 1+0*4, 0+1*4, 0+2*4, - 1+1*4, 2+0*4, 3+0*4, 2+1*4, - 1+2*4, 0+3*4, 1+3*4, 2+2*4, - 3+1*4, 3+2*4, 2+3*4, 3+3*4, -}; - -static const uint8_t luma_dc_zigzag_scan[16]={ - 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64, - 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64, - 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64, - 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64, -}; - -static const uint8_t chroma_dc_scan[4]={ - (0+0*2)*16, (1+0*2)*16, - (0+1*2)*16, (1+1*2)*16, //FIXME -}; - -static const uint8_t rem6[52]={ -0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, -}; - -static const uint8_t div6[52]={ -0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, -}; - -static const uint8_t dequant4_coeff_init[6][3]={ - {10,13,16}, - {11,14,18}, - {13,16,20}, - {14,18,23}, - {16,20,25}, - {18,23,29}, -}; - -static const uint8_t dequant8_coeff_init_scan[16] = { - 0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1 -}; -static const uint8_t dequant8_coeff_init[6][6]={ - {20,18,32,19,25,24}, - {22,19,35,21,28,26}, - {26,23,42,24,33,31}, - {28,25,45,26,35,33}, - {32,28,51,30,40,38}, - {36,32,58,34,46,43}, -}; - - -void init_block_offset(int linesize, int uvlinesize); -void ff_cropTbl_init(); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/h264_types_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/h264_types_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ 
/dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,203 +0,0 @@ -#ifndef H264_CELL_TYPES_H -#define H264_CELL_TYPES_H - -#include -#include - -typedef struct spe_pos{ - volatile int count; //number of mb processed - uint32_t pad[3]; -}spe_pos; - -//only the picture pointers are needed from the picture struct; -typedef struct Picture_spu { - uint8_t* data[3]; -} Picture_spu; - -///For Cell, might be idea to use this instead for everything -// struct that contains the pararms that change on slice -typedef struct H264slice{ - int deblocking_filter; - int linesize; - int uvlinesize; - int mb_width; - int mb_height; - - int use_weight; - int use_weight_chroma; - int luma_log2_weight_denom; - int chroma_log2_weight_denom; - - int16_t luma_weight[16][2][2]; - int16_t chroma_weight[16][2][2][2]; - int16_t implicit_weight[16][16][2]; - - // ref picture ptr - Picture_spu ref_list[2][16]; - int state; - int emu_edge_width; - int emu_edge_height; - - int slice_type; - int slice_type_nos; - int slice_alpha_c0_offset; - int slice_beta_offset; - - uint8_t chroma_qp_table[2][64]; - - H264Mb *blocks; - uint8_t *dst_y, *dst_cb, *dst_cr; - - //uint32_t pad[2]; // padding the structure for multiple of 16 bytes -}H264slice; - -typedef struct H264spe{ -#define EDIP 0 -#define EDB 1 -#define MBD 2 - int type; - int idx; - int spe_id; - int spe_total; - int mb_width; - int mb_stride; - int mb_height; - int linesize; - int uvlinesize; - //H264slice* slice_params; - void* src_spe; - void* tgt_spe; - - mutex_ea_t lock; - cond_ea_t cond; - atomic_ea_t cnt; - - mutex_ea_t rl_lock; - cond_ea_t rl_cond; - atomic_ea_t rl_cnt; -}H264spe; - -typedef struct H264Cabac_spu{ - int blocking; - - int top_cbp; - int left_cbp; - int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct - - uint32_t dequant4_buffer[6][52][16]; - uint32_t dequant8_buffer[2][52][64]; - uint32_t (*dequant4_coeff[6])[16]; - uint32_t (*dequant8_coeff[2])[64]; - - uint8_t (*non_zero_count_top)[32]; - 
uint8_t (*non_zero_count)[32]; - - uint8_t (*mvd_top[2])[2]; - uint8_t (*mvd[2])[2]; - - uint8_t *direct_top; - uint8_t *direct; - - uint8_t *chroma_pred_mode_top; - uint8_t *chroma_pred_mode; - - int8_t *intra4x4_pred_mode_top; - int8_t *intra4x4_pred_mode; - - uint16_t *cbp_top; - uint16_t *cbp; - - int8_t *qscale_top; - int8_t *qscale; - - int8_t *ref_index_top[2]; - int8_t *ref_index[2]; - - int16_t (*motion_val_top[2])[2]; - int16_t (*motion_val[2])[2]; - uint32_t *mb_type_top; - uint32_t *mb_type; - - int8_t *list1_ref_index[2]; - uint32_t *list1_mb_type; - DECLARE_ALIGNED_16(int16_t, list1_motion_val[2][4*4][2]); // fill for a macroblock when required - - int b_stride; - int mb_stride; - int mb_width; - int mb_height; - - uint8_t zigzag_scan[16]; - uint8_t zigzag_scan8x8[64]; - - uint8_t direct_cache[5*8]; - // Used to calculate loopfilter bS. - DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2]; - DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8]; - DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8]; - DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2]; - -} H264Cabac_spu; - -typedef struct EDSlice_spu{ - PPS pps; ///< current pps - - H264Mb *mbs; - - int state; - int qp_thresh; ///< QP threshold to skip loopfilter - - PictureInfo pic; - PictureInfo list1; -// Picture *ref_list[2][16]; ///Reordered version of default_ref_list according to picture reordering in slice header - int ref_count[2]; ///< counts frames or fields, depending on current mb mode - int slice_type; - int slice_type_nos; - int direct_8x8_inference_flag; - - uint8_t list_count; - uint32_t coded_pic_num; -///stuff only needed for nal/entropy decoding - H264Mb *m; - //GetBitContext gb; - const uint8_t *bytestream_start; - int byte_bufsize; - int transform_bypass; - int direct_spatial_mv_pred; - int map_col_to_list0[2][16]; - int dist_scale_factor[16]; - - int cabac_init_idc; - int ref2frm[2][64]; ///< reference to frame number lists, the first 2 are for -2,-1 - int qscale; - int chroma_qp[2]; 
//QPc - int last_qscale_diff; - -// Picture* release_ref[MAX_MMCO_COUNT]; -// int release_cnt; - - -// int use_weight; -// int use_weight_chroma; -// int luma_log2_weight_denom; -// int chroma_log2_weight_denom; - -// int8_t luma_weight[16][2][2]; -// int8_t chroma_weight[16][2][2][2]; -// int8_t implicit_weight[16][16][2]; - - - -// int slice_alpha_c0_offset; -// int slice_beta_offset; - -// int nal_ref_idc; -// int nal_unit_type; -// uint8_t *rbsp_buffer; -// unsigned int rbsp_buffer_size; - - - -} EDSlice_spu; - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/mathops_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/mathops_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,137 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2001, 2002 Fabrice Bellard - * Copyright (c) 2006 Michael Niedermayer et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef AVCODEC_MATHOPS_H -#define AVCODEC_MATHOPS_H - -// #include "libavutil/common.h" -// #include "libavutil/internal.h" -// -// /* generic implementation */ -// -// #ifndef MULL -// # define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s)) -// #endif -// -// #ifndef MULH -// //gcc 3.4 creates an incredibly bloated mess out of this -// //# define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32) -// -// static av_always_inline int MULH(int a, int b){ -// return ((int64_t)(a) * (int64_t)(b))>>32; -// } -// #endif -// -// #ifndef UMULH -// static av_always_inline unsigned UMULH(unsigned a, unsigned b){ -// return ((uint64_t)(a) * (uint64_t)(b))>>32; -// } -// #endif -// -// #ifndef MUL64 -// # define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) -// #endif -// -// #ifndef MAC64 -// # define MAC64(d, a, b) ((d) += MUL64(a, b)) -// #endif -// -// #ifndef MLS64 -// # define MLS64(d, a, b) ((d) -= MUL64(a, b)) -// #endif -// -// /* signed 16x16 -> 32 multiply add accumulate */ -// #ifndef MAC16 -// # define MAC16(rt, ra, rb) rt += (ra) * (rb) -// #endif -// -// /* signed 16x16 -> 32 multiply */ -// #ifndef MUL16 -// # define MUL16(ra, rb) ((ra) * (rb)) -// #endif -// -// #ifndef MLS16 -// # define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb)) -// #endif - -/* median of 3 */ -#ifndef mid_pred -#define mid_pred mid_pred -static inline av_const int mid_pred(int a, int b, int c) -{ -#if 0 - int t= (a-b)&((a-b)>>31); - a-=t; - b+=t; - b-= (b-c)&((b-c)>>31); - b+= (a-b)&((a-b)>>31); - - return b; -#else - if(a>b){ - if(c>b){ - if(c>a) b=a; - else b=c; - } - }else{ - if(b>c){ - if(c>a) b=c; - else b=a; - } - } - return b; -#endif -} -#endif - -// #ifndef sign_extend -// static inline av_const int sign_extend(int val, unsigned bits) -// { -// return (val << (INT_BIT 
- bits)) >> (INT_BIT - bits); -// } -// #endif -// -// #ifndef zero_extend -// static inline av_const unsigned zero_extend(unsigned val, unsigned bits) -// { -// return (val << (INT_BIT - bits)) >> (INT_BIT - bits); -// } -// #endif -// -// #ifndef COPY3_IF_LT -// #define COPY3_IF_LT(x, y, a, b, c, d)\ -// if ((y) < (x)) {\ -// (x) = (y);\ -// (a) = (b);\ -// (c) = (d);\ -// } -// #endif -// -// #ifndef NEG_SSR32 -// # define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s))) -// #endif -// -// #ifndef NEG_USR32 -// # define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s))) -// #endif - -#endif /* AVCODEC_MATHOPS_H */ - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/rectangle_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/rectangle_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ -/* - * rectangle filling function - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * useful rectangle filling function - * @author Michael Niedermayer - */ - -#ifndef AVCODEC_RECTANGLE_H -#define AVCODEC_RECTANGLE_H - -#include - -#define STRIDE_ALIGN 16 - - -/** - * fill a rectangle. 
- * @param h height of the rectangle, should be a constant - * @param w width of the rectangle, should be a constant - * @param size the size of val (1, 2 or 4), should be a constant - */ -static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ - uint8_t *p= (uint8_t*)vp; - assert(size==1 || size==2 || size==4); - assert(w<=4); - - w *= size; - stride *= size; - - assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0); - assert((stride&(w-1))==0); - if(w==2){ - const uint16_t v= size==4 ? val : val*0x0101; - *(uint16_t*)(p + 0*stride)= v; - if(h==1) return; - *(uint16_t*)(p + 1*stride)= v; - if(h==2) return; - *(uint16_t*)(p + 2*stride)= v; - *(uint16_t*)(p + 3*stride)= v; - }else if(w==4){ - const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101; - *(uint32_t*)(p + 0*stride)= v; - if(h==1) return; - *(uint32_t*)(p + 1*stride)= v; - if(h==2) return; - *(uint32_t*)(p + 2*stride)= v; - *(uint32_t*)(p + 3*stride)= v; - }else if(w==8){ - const uint64_t v= size==2 ? 
val*0x0001000100010001ULL : val*0x0100000001ULL; - *(uint64_t*)(p + 0*stride)= v; - if(h==1) return; - *(uint64_t*)(p + 1*stride)= v; - if(h==2) return; - *(uint64_t*)(p + 2*stride)= v; - *(uint64_t*)(p + 3*stride)= v; - }else if(w==16){ - const uint64_t v= val*0x0100000001ULL; - *(uint64_t*)(p + 0+0*stride)= v; - *(uint64_t*)(p + 8+0*stride)= v; - *(uint64_t*)(p + 0+1*stride)= v; - *(uint64_t*)(p + 8+1*stride)= v; - if(h==2) return; - *(uint64_t*)(p + 0+2*stride)= v; - *(uint64_t*)(p + 8+2*stride)= v; - *(uint64_t*)(p + 0+3*stride)= v; - *(uint64_t*)(p + 8+3*stride)= v; - }else - assert(0); - assert(h==4); -} - -#endif /* AVCODEC_RECTANGLE_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/spe_ed.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/spe_ed.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,508 +0,0 @@ -#define CELL_SPE - -#include -#include -#include -#include -#include "libavcodec/avcodec.h" -#include "h264_cabac_spu.h" -#include "cabac_spu.h" -#include "h264_types_spu.h" -#include "h264_tables.h" -#include "h264_dma.h" -#include "h264_tables.h" - -#define MB_WIDTH 240 -#define MB_STRIDE (MB_WIDTH+16) - -H264Cabac_spu hcabac; -CABACContext cabac; -DECLARE_ALIGNED_16(EDSlice_spu, slice[2]); -DECLARE_ALIGNED_16(H264Mb, mb[2]); -DECLARE_ALIGNED_16(H264spe, spe); - -DECLARE_ALIGNED_16(uint8_t, non_zero_count_table[2][MB_STRIDE][32]); -DECLARE_ALIGNED_16(uint8_t, mvd_table[2][2][8*MB_STRIDE][2]); -DECLARE_ALIGNED_16(uint8_t, direct_table[2][4*MB_STRIDE]); -DECLARE_ALIGNED_16(uint8_t, chroma_pred_mode_table[2][MB_STRIDE]); -DECLARE_ALIGNED_16(uint8_t, intra4x4_pred_mode_table[2][8*MB_STRIDE]); -DECLARE_ALIGNED_16(uint16_t,cbp_table[2][MB_STRIDE]); -DECLARE_ALIGNED_16(uint8_t, qscale_table[2][MB_STRIDE]); - -DECLARE_ALIGNED_16(uint32_t, mb_type_table[2][MB_STRIDE]); -DECLARE_ALIGNED_16(int8_t, ref_index_table[2][2][4*MB_STRIDE]); -DECLARE_ALIGNED_16(int16_t, motion_val_table[2][2][4*4*MB_WIDTH][2]); - 
-DECLARE_ALIGNED(128, uint8_t, bytestream_ls[4096]); -DECLARE_ALIGNED_16(uint32_t, list1_mb_type_table[2][MB_STRIDE]); -DECLARE_ALIGNED_16(int8_t, list1_ref_index_table[2][2][4*MB_STRIDE]); - -DECLARE_ALIGNED_16(spe_pos, dma_temp); //dma temp for sending -//mb position of neighbouring spes -DECLARE_ALIGNED_16(volatile spe_pos, src_spe); //written by SPE_ID -1 -static int total_lines; - -static inline int dep_resolved(H264spe *p){ - int spe_id = p->spe_id; - volatile int lines_proc = src_spe.count; - if (spe_id==0) - return (total_lines < lines_proc-1 +p->mb_height)? 1:0; - else - return (total_lines < lines_proc-1)? 1:0; -} - -static void update_tgt_spe_dep(H264spe *p, int end){ - // if (end ){ - total_lines++; - spe_pos* dma_spe = &dma_temp; - spe_pos* tgt_spe = p->tgt_spe + (unsigned) &src_spe; //located in target spe local store - dma_spe->count = end? total_lines+1: total_lines; - spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), ED_put); - // } - -} - -static int init_cabac(H264spe *p, H264Cabac_spu *hc){ - hc->mb_height = p->mb_height; - hc->mb_width = p->mb_width; - hc->b_stride = 4*p->mb_width; - hc->mb_stride = p->mb_stride; - - for(int i=0; i<16; i++){ - #define T(x) (x>>2) | ((x<<2) & 0xF) - hc->zigzag_scan[i] = T(zigzag_scan[i]); - #undef T - } - for(int i=0; i<64; i++){ - #define T(x) (x>>3) | ((x&7)<<3) - hc->zigzag_scan8x8[i] = T(ff_zigzag_direct[i]); - #undef T - } -} - -static void reset_cabac_buffers(){ - memset(intra4x4_pred_mode_table, 0, sizeof(intra4x4_pred_mode_table)); - memset(mvd_table, 0, sizeof(mvd_table)); - memset(direct_table, 0, sizeof(direct_table)); - memset(chroma_pred_mode_table, 0, sizeof(chroma_pred_mode_table)); - memset(cbp_table, 0, sizeof(cbp_table)); - memset(qscale_table, 0, sizeof(qscale_table)); - memset(mb_type_table, 0, sizeof(mb_type_table)); - memset(ref_index_table, 0, sizeof(ref_index_table)); - memset(motion_val_table, 0, sizeof(motion_val_table)); -} - -static void 
ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int bufsize){ - int align = (unsigned) buf & 0xF; - int dma_size; - - c->bytestream_ea_start= - c->bytestream_ea= buf; - c->bytestream_ea_end= buf + bufsize; - c->bufsize = bufsize; - - if (bufsize + align >= sizeof(bytestream_ls)){ - dma_size = sizeof(bytestream_ls); - c->bufsize = c->bufsize +align - sizeof(bytestream_ls); - }else{ - int align_end = (bufsize+align) &0xF; - if (align_end) - dma_size = bufsize+align + 16-align_end; - else - dma_size = bufsize+align; - c->bufsize = 0; - } -// printf("%d\n", dma_size); - c->bytestream_end = &bytestream_ls[dma_size]; - c->bytestream_start= c->bytestream = &bytestream_ls[align]; - spu_dma_get(bytestream_ls, (unsigned) buf - align, dma_size, ED_get ); - c->bytestream_ea_start= - c->bytestream_ea= buf + dma_size -align; - - wait_dma_id(ED_get); - - if (align %2){ - c->low = (*c->bytestream++)<<18; - c->low+= (*c->bytestream++)<<10; - c->low+= ((*c->bytestream++)<<2) + 2; - }else { - c->low = (*c->bytestream++)<<18; - c->low+= (*c->bytestream++)<<10; - c->low+= (2<<8); - } - - c->range= 0x1FE; - bytecount=0; -} - -static void init_dequant8_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){ - int i,q,x; - const int transpose = HAVE_ALTIVEC; - hc->dequant8_coeff[0] = hc->dequant8_buffer[0]; - hc->dequant8_coeff[1] = hc->dequant8_buffer[1]; - - for(i=0; i<2; i++){ - if(i && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){ - hc->dequant8_coeff[1] = hc->dequant8_buffer[0]; - break; - } - - for(q=0; q<52; q++){ - int shift = div6[q]; - int idx = rem6[q]; - for(x=0; x<64; x++) - hc->dequant8_coeff[i][q][transpose ? 
(x>>3)|((x&7)<<3) : x] = - ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] * - s->pps.scaling_matrix8[i][x]) << shift; - } - } -} - -static void init_dequant4_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){ - int i,j,q,x; - const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; - for(i=0; i<6; i++ ){ - hc->dequant4_coeff[i] = hc->dequant4_buffer[i]; - for(j=0; jpps.scaling_matrix4[j], s->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){ - hc->dequant4_coeff[i] = hc->dequant4_buffer[j]; - break; - } - } - if(jdequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] = - ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] * - s->pps.scaling_matrix4[i][x]) << shift; - } - } -} - -static void init_dequant_tables(EDSlice_spu *s, H264Cabac_spu *hc){ - int i,x; - - init_dequant4_coeff_table(s, hc); - if(s->pps.transform_8x8_mode) - init_dequant8_coeff_table(s, hc); - if(s->transform_bypass){ - for(i=0; i<6; i++) - for(x=0; x<16; x++) - hc->dequant4_coeff[i][0][x] = 1<<6; - if(s->pps.transform_8x8_mode) - for(i=0; i<2; i++) - for(x=0; x<64; x++) - hc->dequant8_coeff[i][0][x] = 1<<6; - } -} - -static void init_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s){ - hc->non_zero_count_top = non_zero_count_table[0]; - hc->non_zero_count = non_zero_count_table[1]; - hc->mvd_top[0] = mvd_table[0][0]; - hc->mvd[0] = mvd_table[0][1]; - hc->mvd_top[1] = mvd_table[1][0]; - hc->mvd[1] = mvd_table[1][1]; - hc->direct_top = direct_table[0]; - hc->direct = direct_table[1]; - hc->chroma_pred_mode_top = chroma_pred_mode_table[0]; - hc->chroma_pred_mode = chroma_pred_mode_table[1]; - hc->intra4x4_pred_mode_top = intra4x4_pred_mode_table[0]; - hc->intra4x4_pred_mode = intra4x4_pred_mode_table[1]; - hc->cbp_top = cbp_table[0]; - hc->cbp = cbp_table[1]; - hc->qscale_top = qscale_table[0] +1; - hc->qscale = qscale_table[1] +1; - - hc->mb_type_top = mb_type_table[0]+1; - hc->mb_type = mb_type_table[1]+1; - hc->ref_index_top[0] = ref_index_table[0][0]; - 
hc->ref_index_top[1] = ref_index_table[1][0]; - hc->ref_index[0] = ref_index_table[0][1]; - hc->ref_index[1] = ref_index_table[1][1]; - hc->motion_val_top[0] = motion_val_table[0][0]; - hc->motion_val_top[1] = motion_val_table[1][0]; - hc->motion_val[0] = motion_val_table[0][1]; - hc->motion_val[1] = motion_val_table[1][1]; - - int mb_stride = hc->mb_stride; - - if (s->slice_type_nos == FF_B_TYPE){ - while(!dep_resolved(&spe)); - spu_dma_get(list1_mb_type_table[0], (unsigned) (s->list1.mb_type -1), mb_stride*sizeof(uint32_t), ED_get); - spu_dma_get(list1_ref_index_table[0][0], (unsigned) s->list1.ref_index[0], mb_stride*4*sizeof(int8_t), ED_get); - spu_dma_get(list1_ref_index_table[0][1], (unsigned) s->list1.ref_index[1], mb_stride*4*sizeof(int8_t), ED_get); - wait_dma_id(ED_get); - spu_dma_get(list1_mb_type_table[1], (unsigned) (s->list1.mb_type -1 + mb_stride), mb_stride*sizeof(uint32_t), ED_get); - spu_dma_get(list1_ref_index_table[1][0], (unsigned) (s->list1.ref_index[0] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); - spu_dma_get(list1_ref_index_table[1][1], (unsigned) (s->list1.ref_index[1] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); - hc->list1_mb_type = list1_mb_type_table[0]+1; - hc->list1_ref_index[0] = list1_ref_index_table[0][0]; - hc->list1_ref_index[1] = list1_ref_index_table[0][1]; - } - -} - -static void update_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s, int line){ - int mb_stride = hc->mb_stride; - int mb_width = hc->mb_width; - int top = (line+1)%2; - int cur = line%2; - int bottom = (line+1)%2; //same as top, but to identify prebuffering of next line. 
- - hc->non_zero_count_top = non_zero_count_table[top]; - hc->non_zero_count = non_zero_count_table[cur]; - hc->mvd_top[0] = mvd_table[0][top]; - hc->mvd[0] = mvd_table[0][cur]; - hc->mvd_top[1] = mvd_table[1][top]; - hc->mvd[1] = mvd_table[1][cur]; - hc->direct_top = direct_table[top]; - hc->direct = direct_table[cur]; - hc->chroma_pred_mode_top = chroma_pred_mode_table[top]; - hc->chroma_pred_mode = chroma_pred_mode_table[cur]; - hc->intra4x4_pred_mode_top = intra4x4_pred_mode_table[top]; - hc->intra4x4_pred_mode = intra4x4_pred_mode_table[cur]; - hc->cbp_top = cbp_table[top]; - hc->cbp = cbp_table[cur]; - hc->qscale_top = qscale_table[top] +1; - hc->qscale = qscale_table[cur] +1; - - hc->mb_type_top = mb_type_table[top]+1; - hc->mb_type = mb_type_table[cur]+1; - hc->ref_index_top[0] = ref_index_table[0][top]; - hc->ref_index_top[1] = ref_index_table[1][top]; - hc->ref_index[0] = ref_index_table[0][cur]; - hc->ref_index[1] = ref_index_table[1][cur]; - hc->motion_val_top[0] = motion_val_table[0][top]; - hc->motion_val_top[1] = motion_val_table[1][top]; - hc->motion_val[0] = motion_val_table[0][cur]; - hc->motion_val[1] = motion_val_table[1][cur]; - - wait_dma_id(ED_put); - - spu_dma_put(mb_type_table[top], (unsigned) (s->pic.mb_type -1 + line*mb_stride), mb_stride*sizeof(uint32_t), ED_put); - spu_dma_put(ref_index_table[0][top], (unsigned) (s->pic.ref_index[0] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put); - spu_dma_put(ref_index_table[1][top], (unsigned) (s->pic.ref_index[1] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put); - spu_dma_put(motion_val_table[0][top], (unsigned) (s->pic.motion_val[0]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put); - spu_dma_put(motion_val_table[1][top], (unsigned) (s->pic.motion_val[1]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put); - - if (s->slice_type_nos == FF_B_TYPE){ - update_tgt_spe_dep(&spe, 0); - wait_dma_id(ED_get); - - if (line + 2 < hc->mb_height){ - while(!dep_resolved(&spe)); - 
spu_dma_get(list1_mb_type_table[cur], (unsigned) (s->list1.mb_type -1 + (line+2)*mb_stride), mb_stride*sizeof(uint32_t), ED_get); - spu_dma_get(list1_ref_index_table[cur][0], (unsigned) (s->list1.ref_index[0] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); - spu_dma_get(list1_ref_index_table[cur][1], (unsigned) (s->list1.ref_index[1] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); - } - hc->list1_mb_type = list1_mb_type_table[bottom]+1; - hc->list1_ref_index[0] = list1_ref_index_table[bottom][0]; - hc->list1_ref_index[1] = list1_ref_index_table[bottom][1]; - } - -} - -// void printmbdiff(EDSlice_spu *s, H264Cabac_spu *hc, H264Mb *mp, H264Mb *ms){ -// -// printf("mb_x %d, %d\n", mp->mb_x, ms->mb_x); -// printf("mb_y %d, %d\n", mp->mb_y, ms->mb_y); -// printf("mb_xy %d, %d\n", mp->mb_xy, ms->mb_xy); -// printf("top_mb_xy %d, %d\n", mp->top_mb_xy, ms->top_mb_xy); -// printf("left_mb_xy %d, %d\n", mp->left_mb_xy, ms->left_mb_xy); -// printf("chroma_pred_mode %d, %d\n", mp->chroma_pred_mode, ms->chroma_pred_mode); -// printf("intra16x16_pred_mode %d, %d\n", mp->intra16x16_pred_mode, ms->intra16x16_pred_mode); -// printf("topleft_samples %d, %d\n", mp->topleft_samples_available, ms->topleft_samples_available); -// printf("topright_samples %d, %d\n", mp->topright_samples_available, ms->topright_samples_available); -// printf("top_samples %d, %d\n", mp->top_samples_available, ms->top_samples_available); -// printf("left_samples %d, %d\n", mp->left_samples_available, ms->left_samples_available); -// -// if (memcmp(mp->intra4x4_pred_mode_cache, ms->intra4x4_pred_mode_cache, 40)){ -// for (int i=0; i<5; i++){ -// for (int j=0; j<8; j++){ -// printf("%d, %d\t", mp->intra4x4_pred_mode_cache[i*8+j],ms->intra4x4_pred_mode_cache[i*8+j]); -// } -// printf("\n"); -// } -// } -// -// if (memcmp(mp->non_zero_count_cache, ms->non_zero_count_cache, 48)){ -// for (int i=0; i<6; i++){ -// for (int j=0; j<8; j++){ -// printf("%u, %u\t", 
mp->non_zero_count_cache[i*8+j],ms->non_zero_count_cache[i*8+j]); -// } -// printf("\n"); -// } -// } -// -// if (memcmp(mp->sub_mb_type, ms->sub_mb_type, 8)){ -// for (int i=0; i<4; i++){ -// printf("%u, %u\t", mp->sub_mb_type[i], mp->sub_mb_type[i]); -// printf("\n"); -// } -// } -// -// if (memcmp(mp->mv_cache, ms->mv_cache, 320)){ -// for (int k=0; k<2; k++){ -// for (int i=0; i<5; i++){ -// for (int j=0; j<8; j++){ -// printf("%d, %d, %d, %d\t", mp->mv_cache[k][i*8+j][0], mp->mv_cache[k][i*8+j][1], ms->mv_cache[k][i*8+j][0], ms->mv_cache[k][i*8+j][1]); -// } -// printf("\n"); -// } -// } -// } -// -// if (memcmp(mp->ref_cache, ms->ref_cache, 80)){ -// for (int k=0; k<2; k++){ -// for (int i=0; i<5; i++){ -// for (int j=0; j<8; j++){ -// printf("%d, %d\t", mp->ref_cache[k][i*8+j], ms->ref_cache[k][i*8+j]); -// } -// printf("\n"); -// } -// } -// } -// -// printf("cbp %d, %d\n", mp->cbp, ms->cbp); -// for (int i=0; imb_stride; i++){ -// printf("%d, ", hc->cbp[i]); fflush(0); -// } -// printf("\n"); -// -// printf("mb_type %x, %x\n", mp->mb_type, ms->mb_type); -// printf("mb_type IS_INTRA %d, IS_INTRA16x16 %d, IS_DIRECT %d\n", IS_INTRA(ms->mb_type), IS_INTRA16x16(ms->mb_type), IS_DIRECT(ms->mb_type) ); -// printf("left_type %d, %d\n", mp->left_type, ms->left_type); -// printf("top_type %d, %d\n", mp->top_type, ms->top_type); -// printf("qscale_mb_xy %d, %d\n", mp->qscale_mb_xy, ms->qscale_mb_xy); -// printf("qscale_left_mb_xy %d, %d\n", mp->qscale_left_mb_xy, ms->qscale_left_mb_xy); -// printf("qscale_top_mb_xy %d, %d\n", mp->qscale_top_mb_xy, ms->qscale_top_mb_xy); -// // for (int i=0; imb_stride; i++){ -// // printf("%d, ", qscale_table[0][i]); fflush(0); -// // } -// -// if (memcmp(mp->mb, ms->mb, 768)){ -// for (int i=0; i<16; i++){ -// for (int j=0; j<16; j++){ -// printf("%d, %d\t", mp->mb[j + i*16], ms->ref_cache[j + i*16]); -// } -// printf("\n"); -// } -// for (int i=0; i<8; i++){ -// for (int j=0; j<8; j++){ -// printf("%d, %d\t", mp->mb[256 + j + i*8], 
ms->ref_cache[j + i*8]); -// } -// printf("\n"); -// } -// for (int i=0; i<8; i++){ -// for (int j=0; j<8; j++){ -// printf("%d, %d\t", mp->mb[320+ j + i*8], ms->ref_cache[j + i*8]); -// } -// printf("\n"); -// } -// } -// -// if (memcmp(mp->bS, ms->bS, 32)){ -// for (int k=0; k<2; k++){ -// for (int i=0; i<4; i++){ -// for (int j=0; j<4; j++){ -// printf("%d, %d\t", mp->bS[k][i][j], mp->mv_cache[k][i][j]); -// } -// printf("\n"); -// } -// } -// } -// if (memcmp(mp->edges, ms->edges, 4)){ -// printf("edges %d, %d, %d, %d\n", mp->edges[0], ms->edges[0], mp->edges[1], ms->edges[1]); -// printf("deblock %d, %d\n", mp->deblock_mb, ms->deblock_mb); -// } -// -// printf("dequant4_coeff_y %d, %d\n", mp->dequant4_coeff_y, ms->dequant4_coeff_y); -// printf("dequant4_coeff_cb %d, %d\n", mp->dequant4_coeff_cb, ms->dequant4_coeff_cb); -// printf("dequant4_coeff_cr %d, %d\n", mp->dequant4_coeff_cr, ms->dequant4_coeff_cr); -// } -// DECLARE_ALIGNED_16(H264Mb, tmp); - - -int main(unsigned long long id, unsigned long long argp){ - EDSlice_spu *s; - H264Cabac_spu *hc = &hcabac; - CABACContext *c = &cabac; - H264spe *p = &spe; - - spu_write_out_mbox((unsigned) slice); - spu_dma_get(p, (unsigned) argp, sizeof(H264spe), ED_spe); //ID_slice is used out of convienience - wait_dma_id(ED_spe); - - ff_init_cabac_states(); - init_cabac(p, hc); - hc->blocking=0; - for(;;){ - spu_read_in_mbox(); - s = &slice[0]; - reset_cabac_buffers(); - init_entropy_buf(hc, s); - - if (hc->blocking) wait_dma_id(ED_get); - //printf("framesize %d\n", s->byte_bufsize);fflush(0); - init_dequant_tables(s, hc); - ff_init_cabac_decoder( c, s->bytestream_start, s->byte_bufsize ); - ff_h264_init_cabac_states(s, c); - - int mb_slot=0; - for(int j=0; jmb_height; j++){ - for(int i=0; imb_width; i++){ - int eos,ret; - H264Mb *m = &mb[mb_slot]; - m->mb_x=i; - m->mb_y=j; - s->m = m; - - ret = ff_h264_decode_mb_cabac(hc, s, c); - -// spu_dma_get(&tmp, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_get); -// 
wait_dma_id(ED_get); -// if (memcmp(&tmp, m, sizeof(H264Mb))){ -// printf("coded pic num %d\n", s->coded_pic_num); -// printmbdiff(s, hc,&tmp, m); -// return 0; -// } - //printf("qscale %d\n", m->qscale_mb_xy); - if (!hc->blocking){ - if (mb_slot){ - spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb1); - wait_dma_id(ED_putmb0); - }else { - spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0); - wait_dma_id(ED_putmb1); - } - mb_slot++; mb_slot%=2; - }else { - spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0); - wait_dma_id(ED_putmb0); - } - - - eos = get_cabac_terminate( c); - - if( ret < 0) { - fprintf(stderr, "error at %d bytecount\n", bytecount); - return -1; - } - } - update_entropy_buf(hc, s, j); - if (hc->blocking){ wait_dma_id(ED_get); wait_dma_id(ED_put);} - } - wait_dma_id(ED_put); - spu_write_out_mbox(1); - - } - - return 0; - - -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/spe_mbd.c --- a/ffmpeg_smp/h264dec/libavcodec/cell/spe_mbd.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2009 TUDelft - * - * Cell Parallel SPU - 2DWave Macroblock Decoding. 
- */ - -/** - * @file libavcodec/cell/spu/h264_main_spu.c - * Cell Parallel SPU - 2DWave Macroblock Decoding - * @author C C Chi - * - * SIMD kernels - * H.264/AVC motion compensation - * @author Mauricio Alvarez - * @author Albert Paradis - */ - - -/* Enable this lines to enable simulator statistic or generate traces */ - -//#define ENABLE_SIMULATOR -//#define ENABLE_PARAVER_TRACING_CELL - -#ifdef ENABLE_SIMULATOR - #include "/opt/ibm/systemsim-cell/include/callthru/spu/profile.h" -#endif - -#ifdef ENABLE_TRACES - #include "spu_trace.h" -#endif -#include -#include -#include -#include -#include -#include -#include - -//#include "dsputil_cell.h" -#include "types_spu.h" -#include "h264_intra_spu.h" -#include "h264_decode_mb_spu.h" -#include "h264_mc_spu.h" -#include "h264_tables.h" -#include "h264_dma.h" - - -/** functions for supporting tracing with paraver for the SPU - * - */ -inline void trace_init_SPU(){ -#ifdef ENABLE_PARAVER_TRACING_CELL - SPUtrace_init (); -#endif -} - -inline void trace_fini_SPU(){ -#ifdef ENABLE_PARAVER_TRACING_CELL - SPUtrace_fini (); -#endif -} - -inline void trace_event_SPU(int event, int id){ -#ifdef ENABLE_PARAVER_TRACING_CELL - SPUtrace_event (event, id); -#else - (void) event; - (void) id; -#endif -} - -// for simulator statistic -inline void clear_statistic(){ -#ifdef ENABLE_SIMULATOR - prof_clear(); -#endif -} - -inline void start_statistic(){ -#ifdef ENABLE_SIMULATOR - prof_start(); -#endif -} - -inline void stop_statistic(){ -#ifdef ENABLE_SIMULATOR - prof_stop(); -#endif -} - -H264Context_spu h_context; // struct that contain all the params to decode a macroblock - -DECLARE_ALIGNED_16(spe_pos, dma_temp); //dma temp for sending -//mb position of neighbouring spes -DECLARE_ALIGNED_16(volatile spe_pos, src_spe); //written by SPE_ID -1 -//DECLARE_ALIGNED_16(spe_pos, tgt_spe); //written by SPE_ID +1 - -/** -* Initializes the buffering of the mb data and associated mc data. 
The init_mb_buffer needs to -* be called before any get_next_mb and only once at the beginning of the slice. -* -* Note: init_mc_buffer and get_next_mb expect the width of the picture to be more than 2 mb's -*/ -#define TAG_OFFSET_MB MBD_buf1 -#define TAG_OFFSET_MC MBD_mc_buf1 -static void init_mb_buffer(H264Context_spu* h){ - H264slice *s = h->s; - H264Mb *next_mb; - int mb_height = s->mb_height; - int mb_width = s->mb_width; - - h->mc_idx =0; - - h->mb_dec = 0; - h->mb_mc = 0; - h->mb_dma = 0; - - h->curr_line %= mb_height; - h->next_mb_idx = h->curr_line * mb_width; - h->mb_id = h->curr_line * mb_width; - h->n_mc= h->curr_line * mb_width; - - next_mb = s->blocks + h->mb_id; - spu_dma_get(&h->mb_buf[h->mb_dma], (unsigned) next_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB); - h->mb_dma++; - h->mb_id++; - - next_mb = s->blocks + h->mb_id; - spu_dma_get(&h->mb_buf[h->mb_dma], (unsigned) next_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB); - h->mb_dma++; - h->mb_id++; - wait_dma_id(0 + TAG_OFFSET_MB); - - H264Mb *mb = &h->mb_buf[0]; - H264mc *mc = &h->mc_buf[0]; - if(!IS_INTRA(mb->mb_type)){ - calc_mc_params(mb, mc); - fill_ref_buf(h, mb, mc); - } - h->n_mc++; - h->mb_mc++; -} - -static void *get_next_mb(H264Context_spu *h){ - H264slice *s = h->s; - H264spe *spe = &h->spe; - H264Mb *mb_buf = h->mb_buf; - H264mc *mc_buf = h->mc_buf; - H264Mb *next_mb; - H264Mb *next_dma_mb; - - if (h->curr_line >= s->mb_height) - return NULL; - - if (h->mb_id < h->mb_total){ - next_dma_mb = s->blocks + h->mb_id; - spu_dma_get(&mb_buf[h->mb_dma], (unsigned) next_dma_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB); - h->mb_dma = (h->mb_dma+1)%3; - h->mb_id++; - if (h->mb_id%s->mb_width ==0){ - h->mb_id+=(spe->spe_total-1)*s->mb_width; - } - } - - h->mc = &mc_buf[h->mc_idx]; - wait_dma_id(h->mc_idx + TAG_OFFSET_MC); - h->mc_idx = (h->mc_idx+1)%2; - if (h->n_mc < h->mb_total){ - wait_dma_id(h->mb_mc + TAG_OFFSET_MB); - H264Mb *mb = &mb_buf[h->mb_mc]; - H264mc *mc = &mc_buf[h->mc_idx]; - 
if(!IS_INTRA(mb->mb_type)){ - calc_mc_params(mb, mc); - fill_ref_buf(h, mb, mc); - } - h->n_mc++; - if (h->n_mc%s->mb_width ==0){ - h->n_mc+=(spe->spe_total-1)*s->mb_width; - } - } - h->next_mb_idx++; - if (h->next_mb_idx % s->mb_width ==0){ - h->next_mb_idx+=(spe->spe_total-1)*s->mb_width; - h->curr_line+=spe->spe_total; - } - - h->mb_mc = (h->mb_mc+1)%3; - next_mb = &mb_buf[h->mb_dec]; - h->mb_dec = (h->mb_dec+1)%3; - return next_mb; -} - -static void *get_next_mb_blocking(H264Context_spu *h){ - H264slice *s = h->s; - H264spe *spe = &h->spe; - H264Mb *mb_buf = h->mb_buf; - H264mc *mc_buf = h->mc_buf; - H264Mb *next_mb; - H264Mb *next_dma_mb; - - if (h->mb_id >= h->mb_total) - return NULL; - - //printf("%d\n", h->mb_id); - next_dma_mb = s->blocks + h->mb_id; - spu_dma_get(&mb_buf[0], (unsigned) next_dma_mb, sizeof(H264Mb), MBD_buf1); - //h->mb_dma = (h->mb_dma+1)%3; - h->mb_id++; - if (h->mb_id%s->mb_width ==0){ - h->mb_id+=(spe->spe_total-1)*s->mb_width; - } - wait_dma_id(MBD_buf1); - - h->mc = &mc_buf[0]; - //h->mc_idx = (h->mc_idx+1)%2; - //if (h->n_mc < h->mb_total){ - H264Mb *mb = &mb_buf[0]; - H264mc *mc = &mc_buf[0]; - if(!IS_INTRA(mb->mb_type)){ - calc_mc_params(mb, mc); - fill_ref_buf(h, mb, mc); - } - //h->n_mc++; - /*if (h->n_mc%s->mb_width ==0){ - h->n_mc+=(spe->spe_total-1)*s->mb_width; - }*/ -// wait_dma_id(MBD_mc_buf1); - -// h->next_mb_idx++; -// if (h->next_mb_idx % s->mb_width ==0){ -// h->next_mb_idx+=(spe->spe_total-1)*s->mb_width; -// h->curr_line+=spe->spe_total; -// } - -// h->mb_mc = (h->mb_mc+1)%3; - next_mb = &mb_buf[0]; -// h->mb_dec = (h->mb_dec+1)%3; - return next_mb; -} - - -#undef TAG_OFFSET_MB -#undef TAG_OFFSET_MC -static inline int dep_resolved(H264Context_spu *h){ - H264slice *s = h->s; - int spe_id = h->spe.spe_id; - volatile int mb_proc_dep = src_spe.count; - if (spe_id==0) - return (h->mb_proc < mb_proc_dep-1 +s->mb_width)? 1:0; - else - return (h->mb_proc < mb_proc_dep-1)? 
1:0; -} - -void update_tgt_spe_dep(H264Context_spu *h, int end){ - H264Mb *mb = h->mb; - H264slice *s = h->s; - H264spe *spe = &h->spe; - int mb_x = mb->mb_x; - - if (end || (mb_x%2==0 && mb_x!=0) || mb_x==s->mb_width-1){ - spe_pos* dma_spe = &dma_temp; - spe_pos* tgt_spe = (spe_pos*) ((unsigned) spe->tgt_spe + (unsigned) &src_spe); //located in target spe local store - dma_spe->count = end? h->mb_proc+1: h->mb_proc; - spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), MBD_put); - } - h->mb_proc++; -} - - -int main(unsigned long long id, unsigned long long argp) -{ - (void) id; - H264Context_spu* h = &h_context; - H264spe *spe_params = (H264spe *) (unsigned) argp; - - spu_dma_get(&h->spe, (unsigned) spe_params, sizeof(H264spe), MBD_slice); //ID_slice is used out of convienience - wait_dma_id(MBD_slice); - - //clear_statistic(); - dsputil_h264_init_cell(&h->dsp); - ff_cropTbl_init(); - init_pred_ptrs(&h->hpc); - - //send slice_buf to ppe - spu_write_out_mbox((unsigned) h->slice_buf); - h->sl_idx=0; - // initialize tracing with paraver - //trace_init_SPU(); - h->frames =0; - src_spe.count =0; - h->mb_proc = 0; - - h->mb_id=0; - h->mc_idx=0; - h->mb_dec=0; - h->mb_mc=0; - h->mb_dma=0; - h->next_mb_idx=0; - - h->blocking=0; - - - H264spe* p = &h->spe; - h->curr_line =p->spe_id; - h->mb_total = p->mb_height*p->mb_width; - int stride_y = 32; - int stride_c = 16; - //init block_offset array - init_block_offset(stride_y, stride_c); - for(;;){ - spu_read_in_mbox(); - - h->s = &h->slice_buf[h->sl_idx]; - h->sl_idx++; h->sl_idx%=2; - - if (h->s->state< 0){ - break; - } - - { - if(!h->blocking){ - init_mb_buffer(h); - while((h->mb=(H264Mb *)get_next_mb(h))){ - while(!dep_resolved(h)); - //printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p- >spe_id); - hl_decode_mb_internal(h, stride_y, stride_c); - } - update_tgt_spe_dep(h, 1); - }else{ - h->mb_id=0; - while((h->mb=(H264Mb *)get_next_mb_blocking(h))){ - while(!dep_resolved(h)); - 
//printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p- >spe_id); - hl_decode_mb_internal(h, stride_y, stride_c); - } - update_tgt_spe_dep(h, 1); - } - - } - - h->frames++; - - if (p->spe_id == ((h->frames*p->mb_height -1)%p->spe_total)){ - //printf("spe %d, %d\n", atomic_read(p->rl_cnt), h->frames); - //MBSlice is copied beforehand. - //only inc cnt. - atomic_inc(p->rl_cnt); - } - { - atomic_dec(p->cnt); - } - } - - return 0; -} - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/cell/types_spu.h --- a/ffmpeg_smp/h264dec/libavcodec/cell/types_spu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2006 Guillaume Poirier - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef TYPES_SPU_H -#define TYPES_SPU_H - -/*********************************************************************** - * Scalar types - **********************************************************************/ - typedef signed char int8_t; - typedef signed short int16_t; - typedef signed int int32_t; - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; - typedef unsigned long long uint64_t; - -// typedef short DCTELEM; // transform coeficients of dct - -/*********************************************************************** - * Vector types - **********************************************************************/ - typedef vector signed int vsint32_t; - typedef vector unsigned int vuint32_t; - typedef vector signed short vsint16_t; - typedef vector unsigned short vuint16_t; - typedef vector signed char vsint8_t; - typedef vector unsigned char vuint8_t; - -/*********************************************************************** - * Functions - **********************************************************************/ - typedef void (*qpel_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h); - typedef void (*h264_chroma_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h, int x, int y); - typedef void (*h264_idct_func)(uint8_t *dst, short *block, int stride); - typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); - typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd, - int weights, int offset); - typedef void(* intra_pred4x4)(uint8_t *src, uint8_t *topright, int stride); - typedef void(* intra_pred16x16)(uint8_t *src, int stride); - typedef void(* 
intra_pred8x8)(uint8_t *src, int stride); - typedef void(* intra_pred8x8l)(uint8_t *src, int topleft, int topright, int stride); - - -#define AVV(x...) {x} - - -#endif // AVCODEC_TYPES_SPU_H - - - - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/dsputil.c --- a/ffmpeg_smp/h264dec/libavcodec/dsputil.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1057 +0,0 @@ -/* - * DSP utils - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * DSP utils - */ - -#include "libavutil/log.h" -#include "dsputil.h" -#include "simple_idct.h" -#include "mathops.h" -#include "config.h" - -uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; -uint32_t ff_squareTbl[512] = {0, }; - -const uint8_t ff_zigzag_direct[64] = { - 0, 1, 8, 16, 9, 2, 3, 10, - 17, 24, 32, 25, 18, 11, 4, 5, - 12, 19, 26, 33, 40, 48, 41, 34, - 27, 20, 13, 6, 7, 14, 21, 28, - 35, 42, 49, 56, 57, 50, 43, 36, - 29, 22, 15, 23, 30, 37, 44, 51, - 58, 59, 52, 45, 38, 31, 39, 46, - 53, 60, 61, 54, 47, 55, 62, 63 -}; - - -#define PIXOP2(OPNAME, OP) \ -static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ - int i;\ - for(i=0; i>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - l1= (c&0x03030303UL)\ - + (d&0x03030303UL);\ - h1= ((c&0xFCFCFCFCUL)>>2)\ - + ((d&0xFCFCFCFCUL)>>2);\ - OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ - a= AV_RN32(&src1[i*src_stride1+4]);\ - b= AV_RN32(&src2[i*src_stride2+4]);\ - c= AV_RN32(&src3[i*src_stride3+4]);\ - d= AV_RN32(&src4[i*src_stride4+4]);\ - l0= (a&0x03030303UL)\ - + (b&0x03030303UL)\ - + 0x02020202UL;\ - h0= ((a&0xFCFCFCFCUL)>>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - l1= (c&0x03030303UL)\ - + (d&0x03030303UL);\ - h1= ((c&0xFCFCFCFCUL)>>2)\ - + ((d&0xFCFCFCFCUL)>>2);\ - OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ - }\ -}\ -\ -static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ - OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ -}\ -\ -static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ - OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, 
line_size, line_size, h);\ -}\ -\ -static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ - OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ -}\ -\ -static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ - OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ -}\ -\ -static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ - int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ - int i;\ - for(i=0; i>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - l1= (c&0x03030303UL)\ - + (d&0x03030303UL);\ - h1= ((c&0xFCFCFCFCUL)>>2)\ - + ((d&0xFCFCFCFCUL)>>2);\ - OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ - a= AV_RN32(&src1[i*src_stride1+4]);\ - b= AV_RN32(&src2[i*src_stride2+4]);\ - c= AV_RN32(&src3[i*src_stride3+4]);\ - d= AV_RN32(&src4[i*src_stride4+4]);\ - l0= (a&0x03030303UL)\ - + (b&0x03030303UL)\ - + 0x01010101UL;\ - h0= ((a&0xFCFCFCFCUL)>>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - l1= (c&0x03030303UL)\ - + (d&0x03030303UL);\ - h1= ((c&0xFCFCFCFCUL)>>2)\ - + ((d&0xFCFCFCFCUL)>>2);\ - OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ - }\ -}\ -static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ - int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ - OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ - OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ -}\ -static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ - int dst_stride, 
int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ - OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ - OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ -}\ -\ -static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ -{\ - int i, a0, b0, a1, b1;\ - a0= pixels[0];\ - b0= pixels[1] + 2;\ - a0 += b0;\ - b0 += pixels[2];\ -\ - pixels+=line_size;\ - for(i=0; i>2; /* FIXME non put */\ - block[1]= (b1+b0)>>2;\ -\ - pixels+=line_size;\ - block +=line_size;\ -\ - a0= pixels[0];\ - b0= pixels[1] + 2;\ - a0 += b0;\ - b0 += pixels[2];\ -\ - block[0]= (a1+a0)>>2;\ - block[1]= (b1+b0)>>2;\ - pixels+=line_size;\ - block +=line_size;\ - }\ -}\ -\ -static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ -{\ - int i;\ - const uint32_t a= AV_RN32(pixels );\ - const uint32_t b= AV_RN32(pixels+1);\ - uint32_t l0= (a&0x03030303UL)\ - + (b&0x03030303UL)\ - + 0x02020202UL;\ - uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - uint32_t l1,h1;\ -\ - pixels+=line_size;\ - for(i=0; i>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ - pixels+=line_size;\ - block +=line_size;\ - a= AV_RN32(pixels );\ - b= AV_RN32(pixels+1);\ - l0= (a&0x03030303UL)\ - + (b&0x03030303UL)\ - + 0x02020202UL;\ - h0= ((a&0xFCFCFCFCUL)>>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ - pixels+=line_size;\ - block +=line_size;\ - }\ -}\ -\ -static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ -{\ - int j;\ - for(j=0; j<2; j++){\ - int i;\ - const uint32_t a= AV_RN32(pixels );\ - const uint32_t b= AV_RN32(pixels+1);\ - uint32_t l0= (a&0x03030303UL)\ - + (b&0x03030303UL)\ - 
+ 0x02020202UL;\ - uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - uint32_t l1,h1;\ -\ - pixels+=line_size;\ - for(i=0; i>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ - pixels+=line_size;\ - block +=line_size;\ - a= AV_RN32(pixels );\ - b= AV_RN32(pixels+1);\ - l0= (a&0x03030303UL)\ - + (b&0x03030303UL)\ - + 0x02020202UL;\ - h0= ((a&0xFCFCFCFCUL)>>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ - pixels+=line_size;\ - block +=line_size;\ - }\ - pixels+=4-line_size*(h+1);\ - block +=4-line_size*h;\ - }\ -}\ -\ -static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ -{\ - int j;\ - for(j=0; j<2; j++){\ - int i;\ - const uint32_t a= AV_RN32(pixels );\ - const uint32_t b= AV_RN32(pixels+1);\ - uint32_t l0= (a&0x03030303UL)\ - + (b&0x03030303UL)\ - + 0x01010101UL;\ - uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - uint32_t l1,h1;\ -\ - pixels+=line_size;\ - for(i=0; i>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ - pixels+=line_size;\ - block +=line_size;\ - a= AV_RN32(pixels );\ - b= AV_RN32(pixels+1);\ - l0= (a&0x03030303UL)\ - + (b&0x03030303UL)\ - + 0x01010101UL;\ - h0= ((a&0xFCFCFCFCUL)>>2)\ - + ((b&0xFCFCFCFCUL)>>2);\ - OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ - pixels+=line_size;\ - block +=line_size;\ - }\ - pixels+=4-line_size*(h+1);\ - block +=4-line_size*h;\ - }\ -}\ -\ -CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\ - -#define op_avg(a, b) a = rnd_avg32(a, b) - -#define op_put(a, b) a = b - -PIXOP2(avg, op_avg) -PIXOP2(put, op_put) -#undef op_avg -#undef op_put - - -#define H264_CHROMA_MC(OPNAME, OP)\ -static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ - const int A=(8-x)*(8-y);\ - const int B=( x)*(8-y);\ - 
const int C=(8-x)*( y);\ - const int D=( x)*( y);\ - int i;\ - \ - assert(x<8 && y<8 && x>=0 && y>=0);\ -\ - if(D){\ - for(i=0; i=0 && y>=0);\ -\ - if(D){\ - for(i=0; i=0 && y>=0);\ -\ - if(D){\ - for(i=0; i>6)+1)>>1) -#define op_put(a, b) a = (((b) + 32)>>6) - -H264_CHROMA_MC(put_ , op_put) -H264_CHROMA_MC(avg_ , op_avg) -#undef op_avg -#undef op_put - - -#define H264_LOWPASS(OPNAME, OP, OP2) \ -static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - const int h=2;\ - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ - int i;\ - for(i=0; i>5]+1)>>1) -#define op_put(a, b) a = cm[((b) + 16)>>5] -#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1) -#define op2_put(a, b) a = cm[((b) + 512)>>10] - -H264_LOWPASS(put_ , op_put, op2_put) -H264_LOWPASS(avg_ , op_avg, op2_avg) -H264_MC(put_, 2) -H264_MC(put_, 4) -H264_MC(put_, 8) -H264_MC(put_, 16) -H264_MC(avg_, 4) -H264_MC(avg_, 8) -H264_MC(avg_, 16) - -#undef op_avg -#undef op_put -#undef op2_avg -#undef op2_put - -static void clear_block_c(DCTELEM *block) -{ - memset(block, 0, sizeof(DCTELEM)*64); -} - -/** - * memset(blocks, 0, sizeof(DCTELEM)*6*64) - */ -static void clear_blocks_c(DCTELEM *blocks) -{ - memset(blocks, 0, sizeof(DCTELEM)*6*64); -} - -static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; } - -/* init static data */ -av_cold void dsputil_static_init(void) -{ - int i; - - for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; - for(i=0;i= 4.2.\n" - "Do not report crashes to FFmpeg developers.\n"); -#endif - did_fail=1; - } - return -1; - } - return 0; -} - -av_cold void dsputil_init(DSPContext* c) -{ - (void) avg_pixels2_c; // kill a warning, avg_pixels2_c is a macro created function. 
- ff_check_alignment(); - dsputil_static_init(); - - c->idct_put= ff_simple_idct_put; - c->idct_add= ff_simple_idct_add; - c->idct = ff_simple_idct; - - c->clear_block = clear_block_c; - c->clear_blocks = clear_blocks_c; - -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \ - c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \ - c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \ - c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \ - c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \ - c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \ - c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \ - c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \ - c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \ - c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \ - c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \ - c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \ - c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \ - c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \ - c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \ - c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c - - - dspfunc(put_h264_qpel, 0, 16); - dspfunc(put_h264_qpel, 1, 8); - dspfunc(put_h264_qpel, 2, 4); - dspfunc(put_h264_qpel, 3, 2); - dspfunc(avg_h264_qpel, 0, 16); - dspfunc(avg_h264_qpel, 1, 8); - dspfunc(avg_h264_qpel, 2, 4); - -#undef dspfunc - c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c; - c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c; - c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c; - c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c; - c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c; - c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c; - - - c->prefetch= just_return; - - if (HAVE_MMX) dsputil_init_mmx (c); - if (ARCH_ARM) dsputil_init_arm (c); - if (HAVE_ALTIVEC) dsputil_init_ppc (c); //fixme PPC prefetch -} - diff 
-r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/dsputil.h --- a/ffmpeg_smp/h264dec/libavcodec/dsputil.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,465 +0,0 @@ -/* - * DSP utils - * Copyright (c) 2000, 2001, 2002 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * DSP utils. 
- * note, many functions in here may use MMX which trashes the FPU state, it is - * absolutely necessary to call emms_c() between dsp & float/double code - */ - -#ifndef AVCODEC_DSPUTIL_H -#define AVCODEC_DSPUTIL_H - -#include "libavutil/intreadwrite.h" -#include "avcodec.h" -#include "h264_idct.h" -// -void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, - const float *win, float add_bias, int len); -void ff_float_to_int16_c(int16_t *dst, const float *src, long len); -void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels); - -/* encoding scans */ -extern const uint8_t ff_alternate_horizontal_scan[64]; -extern const uint8_t ff_alternate_vertical_scan[64]; -extern const uint8_t ff_zigzag_direct[64]; -extern const uint8_t ff_zigzag248_direct[64]; - -/* pixel operations */ -#define MAX_NEG_CROP 1024 - -/* temporary */ -extern uint32_t ff_squareTbl[512]; -extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP]; - -/* VP3 DSP functions */ -void ff_vp3_idct_c(DCTELEM *block/* align 16*/); -void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); -void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); -void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/); - -void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values); -void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values); - -/* VP6 DSP functions */ -void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride, - const int16_t *h_weights, const int16_t *v_weights); - -/* Bink functions */ -void ff_bink_idct_c (DCTELEM *block); -void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block); -void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block); - -/* CAVS functions */ -void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride); -void ff_avg_cavs_qpel8_mc00_c(uint8_t 
*dst, uint8_t *src, int stride); -void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride); -void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride); - -/* VC1 functions */ -void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd); -void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd); - -/* EA functions */ -void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block); - -/* 1/2^n downscaling functions from imgconvert.c */ -void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); -void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); -void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); -void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); - -void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height); - -/* minimum alignment rules ;) -If you notice errors in the align stuff, need more alignment for some ASM code -for some CPU or need to use a function with less aligned data then send a mail -to the ffmpeg-devel mailing list, ... - -!warning These alignments might not match reality, (missing attribute((align)) -stuff somewhere possible). -I (Michael) did not check them, these are just the alignments which I think -could be reached easily ... 
- -!future video codecs might need functions with less strict alignment -*/ - -/* -void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size); -void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride); -void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size); -void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size); -void clear_blocks_c(DCTELEM *blocks); -*/ - -/* add and put pixel (decoding) */ -// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16 -//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller then 4 -typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h); -typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h); -typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride); -typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); - -typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h); - -#define DEF_OLD_QPEL(name)\ -void ff_put_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\ -void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\ -void ff_avg_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride); - -DEF_OLD_QPEL(qpel16_mc11_old_c) -DEF_OLD_QPEL(qpel16_mc31_old_c) -DEF_OLD_QPEL(qpel16_mc12_old_c) -DEF_OLD_QPEL(qpel16_mc32_old_c) -DEF_OLD_QPEL(qpel16_mc13_old_c) -DEF_OLD_QPEL(qpel16_mc33_old_c) -DEF_OLD_QPEL(qpel8_mc11_old_c) -DEF_OLD_QPEL(qpel8_mc31_old_c) -DEF_OLD_QPEL(qpel8_mc12_old_c) -DEF_OLD_QPEL(qpel8_mc32_old_c) -DEF_OLD_QPEL(qpel8_mc13_old_c) -DEF_OLD_QPEL(qpel8_mc33_old_c) - -#define 
CALL_2X_PIXELS(a, b, n)\ -static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ - b(block , pixels , line_size, h);\ - b(block+n, pixels+n, line_size, h);\ -} - -/* motion estimation */ -// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller then 2 -// although currently h<4 is not used as functions with width <8 are neither used nor implemented -typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/; - -/** - * Scantable. - */ -typedef struct ScanTable{ - const uint8_t *scantable; - uint8_t permutated[64]; - uint8_t raster_end[64]; -#if ARCH_PPC - /** Used by dct_quantize_altivec to find last-non-zero */ - DECLARE_ALIGNED(16, uint8_t, inverse)[64]; -#endif -} ScanTable; - -void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable); - -void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, - int block_w, int block_h, - int src_x, int src_y, int w, int h); - - -/** - * DSPContext. 
- */ -typedef struct DSPContext { - /* pixel ops : interface with DCT */ - void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size); - void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride); - void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); - void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); - void (*put_pixels_nonclamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); - void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); - void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size); - void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size); - - void (*clear_block)(DCTELEM *block/*align 16*/); - void (*clear_blocks)(DCTELEM *blocks/*align 16*/); - - - /** - * Halfpel motion compensation with rounding (a+b+1)>>1. - * this is an array[4][4] of motion compensation functions for 4 - * horizontal blocksizes (8,16) and the 4 halfpel positions
- * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] - * @param block destination where the result is stored - * @param pixels source - * @param line_size number of bytes in a horizontal line of block - * @param h height - */ - op_pixels_func put_pixels_tab[4][4]; - - /** - * Halfpel motion compensation with rounding (a+b+1)>>1. - * This is an array[4][4] of motion compensation functions for 4 - * horizontal blocksizes (8,16) and the 4 halfpel positions
- * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] - * @param block destination into which the result is averaged (a+b+1)>>1 - * @param pixels source - * @param line_size number of bytes in a horizontal line of block - * @param h height - */ - op_pixels_func avg_pixels_tab[4][4]; - - /** - * Halfpel motion compensation with no rounding (a+b)>>1. - * this is an array[2][4] of motion compensation functions for 2 - * horizontal blocksizes (8,16) and the 4 halfpel positions
- * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] - * @param block destination where the result is stored - * @param pixels source - * @param line_size number of bytes in a horizontal line of block - * @param h height - */ - op_pixels_func put_no_rnd_pixels_tab[4][4]; - - /** - * Halfpel motion compensation with no rounding (a+b)>>1. - * this is an array[2][4] of motion compensation functions for 2 - * horizontal blocksizes (8,16) and the 4 halfpel positions
- * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] - * @param block destination into which the result is averaged (a+b)>>1 - * @param pixels source - * @param line_size number of bytes in a horizontal line of block - * @param h height - */ - op_pixels_func avg_no_rnd_pixels_tab[4][4]; - - void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h); - - - qpel_mc_func put_qpel_pixels_tab[2][16]; - qpel_mc_func avg_qpel_pixels_tab[2][16]; - qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16]; - qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16]; - qpel_mc_func put_mspel_pixels_tab[8]; - - /** - * h264 Chroma MC - */ - h264_chroma_mc_func put_h264_chroma_pixels_tab[3]; - h264_chroma_mc_func avg_h264_chroma_pixels_tab[3]; - /* This is really one func used in VC-1 decoding */ - h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3]; - h264_chroma_mc_func avg_no_rnd_vc1_chroma_pixels_tab[3]; - - qpel_mc_func put_h264_qpel_pixels_tab[4][16]; - qpel_mc_func avg_h264_qpel_pixels_tab[4][16]; - - qpel_mc_func put_2tap_qpel_pixels_tab[4][16]; - qpel_mc_func avg_2tap_qpel_pixels_tab[4][16]; - - - /* (I)DCT */ - void (*fdct)(DCTELEM *block/* align 16*/); - void (*fdct248)(DCTELEM *block/* align 16*/); - - /* IDCT really*/ - void (*idct)(DCTELEM *block/* align 16*/); - - /** - * block -> idct -> clip to unsigned 8 bit -> dest. - * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...) - * @param line_size size in bytes of a horizontal line of dest - */ - void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); - - /** - * block -> idct -> add dest -> clip to unsigned 8 bit -> dest. 
- * @param line_size size in bytes of a horizontal line of dest - */ - void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); - - void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w); -#define EDGE_WIDTH 32 - - void (*prefetch)(void *mem, int stride, int h); - -} DSPContext; - -void dsputil_static_init(void); -void dsputil_init(DSPContext* p); - -int ff_check_alignment(void); - -/** - * permute block according to permuatation. - * @param last last non zero element in scantable order - */ -void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last); - -void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type); - -#define BYTE_VEC32(c) ((c)*0x01010101UL) - -static inline uint32_t rnd_avg32(uint32_t a, uint32_t b) -{ - return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1); -} - -static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b) -{ - return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1); -} - - -/** - * Empty mmx state. - * this must be called between any dsp function and float/double code. 
- * for example sin(); dsp->idct_put(); emms_c(); cos() - */ -#define emms_c() - -/* should be defined by architectures supporting - one or more MultiMedia extension */ -int mm_support(void); -extern int mm_flags; - -void dsputil_init_arm(DSPContext* c); -void dsputil_init_mmx(DSPContext* c); -void dsputil_init_ppc(DSPContext* c); - -void ff_dsputil_init_dwt(DSPContext *c); - -#if HAVE_MMX - -#undef emms_c - -static inline void emms(void) -{ - __asm__ volatile ("emms;":::"memory"); -} - - -#define emms_c() \ -{\ - if (mm_flags & FF_MM_MMX)\ - emms();\ -} - -#elif ARCH_ARM - -#if HAVE_NEON -# define STRIDE_ALIGN 16 -#endif - -#elif ARCH_PPC || ARCH_PPC64 || ARCH_CELL - -#define STRIDE_ALIGN 16 - -#endif - -#ifndef STRIDE_ALIGN -# define STRIDE_ALIGN 8 -#endif - -#define WRAPPER8_16(name8, name16)\ -static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\ - return name8(s, dst , src , stride, h)\ - +name8(s, dst+8 , src+8 , stride, h);\ -} - -#define WRAPPER8_16_SQ(name8, name16)\ -static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\ - int score=0;\ - score +=name8(s, dst , src , stride, 8);\ - score +=name8(s, dst+8 , src+8 , stride, 8);\ - if(h==16){\ - dst += 8*stride;\ - src += 8*stride;\ - score +=name8(s, dst , src , stride, 8);\ - score +=name8(s, dst+8 , src+8 , stride, 8);\ - }\ - return score;\ -} - -static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h) -{ - int i; - for(i=0; i - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * bitstream reader API header. - */ - -#ifndef AVCODEC_GET_BITS_H -#define AVCODEC_GET_BITS_H - -#include -#include -#include -#include "libavutil/bswap.h" -#include "libavutil/common.h" -#include "libavutil/intreadwrite.h" -#include "libavutil/log.h" -#include "mathops.h" - - -typedef struct GetBitContext { - uint8_t *rbsp; - unsigned int rbsp_size; - uint8_t *raw; - const uint8_t *buffer, *buffer_end; - unsigned int alloc_size; - unsigned int buf_size; - uint32_t *buffer_ptr; - uint32_t cache0; - uint32_t cache1; - int bit_count; - int size_in_bits; -} GetBitContext; - -/* Bitstream reader API docs: -name - arbitrary name which is used as prefix for the internal variables - -gb - getbitcontext - -OPEN_READER(name, gb) - loads gb into local variables - -CLOSE_READER(name, gb) - stores local vars in gb - -UPDATE_CACHE(name, gb) - refills the internal cache from the bitstream - after this call at least MIN_CACHE_BITS will be available, - -GET_CACHE(name, gb) - will output the contents of the internal cache, next bit is MSB of 32 or 64 bit (FIXME 64bit) - -SHOW_UBITS(name, gb, num) - will return the next num bits - -SHOW_SBITS(name, gb, num) - will return the next num bits and do sign extension - -SKIP_BITS(name, gb, num) - will skip over the next num bits - note, this is equivalent to SKIP_CACHE; SKIP_COUNTER - -SKIP_CACHE(name, gb, num) - will remove the next num bits from the cache (note SKIP_COUNTER MUST be called before UPDATE_CACHE / CLOSE_READER) - -SKIP_COUNTER(name, gb, 
num) - will increment the internal bit counter (see SKIP_CACHE & SKIP_BITS) - -LAST_SKIP_CACHE(name, gb, num) - will remove the next num bits from the cache if it is needed for UPDATE_CACHE otherwise it will do nothing - -LAST_SKIP_BITS(name, gb, num) - is equivalent to LAST_SKIP_CACHE; SKIP_COUNTER - -for examples see get_bits, show_bits, skip_bits, get_vlc -*/ - -#define MIN_CACHE_BITS 32 - -#define OPEN_READER(name, gb)\ - int name##_bit_count=(gb)->bit_count;\ - uint32_t name##_cache0= (gb)->cache0;\ - uint32_t name##_cache1= (gb)->cache1;\ - uint32_t * name##_buffer_ptr=(gb)->buffer_ptr;\ - -#define CLOSE_READER(name, gb)\ - (gb)->bit_count= name##_bit_count;\ - (gb)->cache0= name##_cache0;\ - (gb)->cache1= name##_cache1;\ - (gb)->buffer_ptr= name##_buffer_ptr;\ - -#define UPDATE_CACHE(name, gb)\ - if(name##_bit_count > 0){\ - const uint32_t next= be2me_32( *name##_buffer_ptr );\ - name##_cache0 |= NEG_USR32(next,name##_bit_count);\ - name##_cache1 |= next<buffer_ptr - s->buffer)*8 - 32 + s->bit_count; -} - -static inline void skip_bits_long(GetBitContext *s, int n){ - OPEN_READER(re, s) - re_bit_count += n; - re_buffer_ptr += re_bit_count>>5; - re_bit_count &= 31; - re_cache0 = be2me_32( re_buffer_ptr[-1] ) << re_bit_count; - re_cache1 = 0; - UPDATE_CACHE(re, s) - CLOSE_READER(re, s) -} - -/** - * read mpeg1 dc style vlc (sign bit + mantisse with no MSB). 
- * if MSB not set it is negative - * @param n length in bits - * @author BERO - */ -static inline int get_xbits(GetBitContext *s, int n){ - register int sign; - register int32_t cache; - OPEN_READER(re, s) - UPDATE_CACHE(re, s) - cache = GET_CACHE(re,s); - sign=(~cache)>>31; - LAST_SKIP_BITS(re, s, n) - CLOSE_READER(re, s) - return (NEG_USR32(sign ^ cache, n) ^ sign) - sign; -} - -static inline int get_sbits(GetBitContext *s, int n){ - register int tmp; - OPEN_READER(re, s) - UPDATE_CACHE(re, s) - tmp= SHOW_SBITS(re, s, n); - LAST_SKIP_BITS(re, s, n) - CLOSE_READER(re, s) - return tmp; -} - -/** - * reads 1-17 bits. - * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't - */ -static inline unsigned int get_bits(GetBitContext *s, int n){ - register int tmp; - OPEN_READER(re, s) - UPDATE_CACHE(re, s) - tmp= SHOW_UBITS(re, s, n); - LAST_SKIP_BITS(re, s, n) - CLOSE_READER(re, s) - return tmp; -} - -/** - * shows 1-17 bits. - * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't - */ -static inline unsigned int show_bits(GetBitContext *s, int n){ - register int tmp; - OPEN_READER(re, s) - UPDATE_CACHE(re, s) - tmp= SHOW_UBITS(re, s, n); -// CLOSE_READER(re, s) - return tmp; -} - -static inline void skip_bits(GetBitContext *s, int n){ - //Note gcc seems to optimize this to s->index+=n for the ALT_READER :)) - OPEN_READER(re, s) - UPDATE_CACHE(re, s) - LAST_SKIP_BITS(re, s, n) - CLOSE_READER(re, s) -} - -static inline unsigned int get_bits1(GetBitContext *s){ - return get_bits(s, 1); -} - -static inline unsigned int show_bits1(GetBitContext *s){ - return show_bits(s, 1); -} - -static inline void skip_bits1(GetBitContext *s){ - skip_bits(s, 1); -} - -/** - * reads 0-32 bits. 
- */ -static inline unsigned int get_bits_long(GetBitContext *s, int n){ - if(n<=MIN_CACHE_BITS) return get_bits(s, n); - else{ - int ret= get_bits(s, 16) << (n-16); - return ret | get_bits(s, n-16); - } -} - -/** - * reads 0-32 bits as a signed integer. - */ -static inline int get_sbits_long(GetBitContext *s, int n) { - return sign_extend(get_bits_long(s, n), n); -} - -/** - * shows 0-32 bits. - */ -static inline unsigned int show_bits_long(GetBitContext *s, int n){ - if(n<=MIN_CACHE_BITS) return show_bits(s, n); - else{ - GetBitContext gb= *s; - return get_bits_long(&gb, n); - } -} - -static inline int check_marker(GetBitContext *s, const char *msg) -{ - int bit= get_bits1(s); - if(!bit) - av_log(AV_LOG_INFO, "Marker bit missing %s\n", msg); - - return bit; -} - -/** - * init GetBitContext. - * @param buffer bitstream buffer, must be FF_INPUT_BUFFER_PADDING_SIZE bytes larger then the actual read bits - * because some optimized bitstream readers read 32 or 64 bit at once and could read over the end - * @param bit_size the size of the buffer in bits - * - * While GetBitContext stores the buffer size, for performance reasons you are - * responsible for checking for the buffer end yourself (take advantage of the padding)! - */ -static inline void init_get_bits(GetBitContext *s, - const uint8_t *buffer, int bit_size) -{ - int buffer_size= (bit_size+7)>>3; - if(buffer_size < 0 || bit_size < 0) { - buffer_size = bit_size = 0; - buffer = NULL; - } - - s->buffer= buffer; - s->size_in_bits= bit_size; - s->buffer_end= buffer + buffer_size; - - s->buffer_ptr = (uint32_t*)((intptr_t)buffer&(~3)); - s->bit_count = 32 + 8*((intptr_t)buffer&3); - skip_bits_long(s, 0); -} - -static inline void align_get_bits(GetBitContext *s) -{ - int n= (-get_bits_count(s)) & 7; - if(n) skip_bits(s, n); -} - -#define tprintf(p, ...) 
{} - -static inline int get_bits_left(GetBitContext *gb) -{ - return gb->size_in_bits - get_bits_count(gb); -} - -#endif /* AVCODEC_GET_BITS_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/golomb.c --- a/ffmpeg_smp/h264dec/libavcodec/golomb.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,184 +0,0 @@ -/* - * exp golomb vlc stuff - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * @brief - * exp golomb vlc stuff - * @author Michael Niedermayer - */ - -#include "libavutil/common.h" - -const uint8_t ff_log2_tab[256]={ - 0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, - 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, - 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, - 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 -}; - -const uint8_t ff_golomb_vlc_len[512]={ 
-14,13,12,12,11,11,11,11,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, -7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, -5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, -5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 -}; - -const uint8_t ff_ue_golomb_vlc_code[512]={ -31,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30, - 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -const int8_t ff_se_golomb_vlc_code[512]={ - 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 8, -8, 9, -9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15, - 4, 4, 4, 4, -4, -4, -4, -4, 5, 5, 5, 5, -5, -5, -5, -5, 6, 6, 6, 6, -6, -6, -6, -6, 7, 7, 7, 7, -7, -7, -7, -7, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - - -const uint8_t ff_ue_golomb_len[256]={ - 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,11, -11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,13, -13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, -13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,15, -15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, -15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, -15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, -15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,17, -}; - -const uint8_t ff_interleaved_golomb_vlc_len[256]={ -9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, -9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, -9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -}; - -const uint8_t ff_interleaved_ue_golomb_vlc_code[256]={ - 15,16,7, 7, 17,18,8, 8, 3, 3, 3, 3, 3, 3, 3, 3, - 
19,20,9, 9, 21,22,10,10,4, 4, 4, 4, 4, 4, 4, 4, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 23,24,11,11,25,26,12,12,5, 5, 5, 5, 5, 5, 5, 5, - 27,28,13,13,29,30,14,14,6, 6, 6, 6, 6, 6, 6, 6, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -const int8_t ff_interleaved_se_golomb_vlc_code[256]={ - 8, -8, 4, 4, 9, -9, -4, -4, 2, 2, 2, 2, 2, 2, 2, 2, - 10,-10, 5, 5, 11,-11, -5, -5, -2, -2, -2, -2, -2, -2, -2, -2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 12,-12, 6, 6, 13,-13, -6, -6, 3, 3, 3, 3, 3, 3, 3, 3, - 14,-14, 7, 7, 15,-15, -7, -7, -3, -3, -3, -3, -3, -3, -3, -3, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -const uint8_t ff_interleaved_dirac_golomb_vlc_code[256]={ -0, 1, 0, 0, 2, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, -4, 5, 2, 2, 6, 7, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 9, 4, 4, 10,11,5, 5, 2, 2, 2, 2, 2, 2, 2, 2, -12,13,6, 
6, 14,15,7, 7, 3, 3, 3, 3, 3, 3, 3, 3, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}; diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/golomb.h --- a/ffmpeg_smp/h264dec/libavcodec/golomb.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,410 +0,0 @@ -/* - * exp golomb vlc stuff - * Copyright (c) 2003 Michael Niedermayer - * Copyright (c) 2004 Alex Beregszaszi - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * @brief - * exp golomb vlc stuff - * @author Michael Niedermayer and Alex Beregszaszi - */ - -#ifndef AVCODEC_GOLOMB_H -#define AVCODEC_GOLOMB_H - -#include -#include "get_bits.h" - -#define INVALID_VLC 0x80000000 - -extern const uint8_t ff_golomb_vlc_len[512]; -extern const uint8_t ff_ue_golomb_vlc_code[512]; -extern const int8_t ff_se_golomb_vlc_code[512]; -extern const uint8_t ff_ue_golomb_len[256]; - -extern const uint8_t ff_interleaved_golomb_vlc_len[256]; -extern const uint8_t ff_interleaved_ue_golomb_vlc_code[256]; -extern const int8_t ff_interleaved_se_golomb_vlc_code[256]; -extern const uint8_t ff_interleaved_dirac_golomb_vlc_code[256]; - - - /** - * read unsigned exp golomb code. - */ -static inline int get_ue_golomb(GetBitContext *gb){ - unsigned int buf; - int log; - - OPEN_READER(re, gb); - UPDATE_CACHE(re, gb); - buf=GET_CACHE(re, gb); - - if(buf >= (1<<27)){ - buf >>= 32 - 9; - LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); - CLOSE_READER(re, gb); - - return ff_ue_golomb_vlc_code[buf]; - }else{ - log= 2*av_log2_c(buf) - 31; - buf>>= log; - buf--; - LAST_SKIP_BITS(re, gb, 32 - log); - CLOSE_READER(re, gb); - - return buf; - } -} - - /** - * read unsigned exp golomb code, constraint to a max of 31. - * the return value is undefined if the stored value exceeds 31. 
- */ -static inline int get_ue_golomb_31(GetBitContext *gb){ - unsigned int buf; - - OPEN_READER(re, gb); - UPDATE_CACHE(re, gb); - buf=GET_CACHE(re, gb); - - buf >>= 32 - 9; - LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); - CLOSE_READER(re, gb); - - return ff_ue_golomb_vlc_code[buf]; -} - -static inline int svq3_get_ue_golomb(GetBitContext *gb){ - uint32_t buf; - - OPEN_READER(re, gb); - UPDATE_CACHE(re, gb); - buf=GET_CACHE(re, gb); - - if(buf&0xAA800000){ - buf >>= 32 - 8; - LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]); - CLOSE_READER(re, gb); - - return ff_interleaved_ue_golomb_vlc_code[buf]; - }else{ - int ret = 1; - - while (1) { - buf >>= 32 - 8; - LAST_SKIP_BITS(re, gb, FFMIN(ff_interleaved_golomb_vlc_len[buf], 8)); - - if (ff_interleaved_golomb_vlc_len[buf] != 9){ - ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1; - ret |= ff_interleaved_dirac_golomb_vlc_code[buf]; - break; - } - ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf]; - UPDATE_CACHE(re, gb); - buf = GET_CACHE(re, gb); - } - - CLOSE_READER(re, gb); - return ret - 1; - } -} - -/** - * read unsigned truncated exp golomb code. - */ -static inline int get_te0_golomb(GetBitContext *gb, int range){ - assert(range >= 1); - - if(range==1) return 0; - else if(range==2) return get_bits1(gb)^1; - else return get_ue_golomb(gb); -} - -/** - * read unsigned truncated exp golomb code. - */ -static inline int get_te_golomb(GetBitContext *gb, int range){ - assert(range >= 1); - - if(range==2) return get_bits1(gb)^1; - else return get_ue_golomb(gb); -} - - -/** - * read signed exp golomb code. 
- */ -static inline int get_se_golomb(GetBitContext *gb){ - unsigned int buf; - int log; - - OPEN_READER(re, gb); - UPDATE_CACHE(re, gb); - buf=GET_CACHE(re, gb); - - if(buf >= (1<<27)){ - buf >>= 32 - 9; - LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); - CLOSE_READER(re, gb); - - return ff_se_golomb_vlc_code[buf]; - }else{ - log= 2*av_log2_c(buf) - 31; - buf>>= log; - - LAST_SKIP_BITS(re, gb, 32 - log); - CLOSE_READER(re, gb); - - if(buf&1) buf= -(buf>>1); - else buf= (buf>>1); - - return buf; - } -} - -static inline int svq3_get_se_golomb(GetBitContext *gb){ - unsigned int buf; - int log; - - OPEN_READER(re, gb); - UPDATE_CACHE(re, gb); - buf=GET_CACHE(re, gb); - - if(buf&0xAA800000){ - buf >>= 32 - 8; - LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]); - CLOSE_READER(re, gb); - - return ff_interleaved_se_golomb_vlc_code[buf]; - }else{ - LAST_SKIP_BITS(re, gb, 8); - UPDATE_CACHE(re, gb); - buf |= 1 | (GET_CACHE(re, gb) >> 8); - - if((buf & 0xAAAAAAAA) == 0) - return INVALID_VLC; - - for(log=31; (buf & 0x80000000) == 0; log--){ - buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30); - } - - LAST_SKIP_BITS(re, gb, 63 - 2*log - 8); - CLOSE_READER(re, gb); - - return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1; - } -} - -static inline int dirac_get_se_golomb(GetBitContext *gb){ - uint32_t buf; - uint32_t ret; - - ret = svq3_get_ue_golomb(gb); - - if (ret) { - OPEN_READER(re, gb); - UPDATE_CACHE(re, gb); - buf = SHOW_SBITS(re, gb, 1); - LAST_SKIP_BITS(re, gb, 1); - ret = (ret ^ buf) - buf; - CLOSE_READER(re, gb); - } - - return ret; -} - -/** - * read unsigned golomb rice code (ffv1). 
- */ -static inline int get_ur_golomb(GetBitContext *gb, int k, int limit, int esc_len){ - unsigned int buf; - int log; - - OPEN_READER(re, gb); - UPDATE_CACHE(re, gb); - buf=GET_CACHE(re, gb); - - log= av_log2_c(buf); - - if(log > 31-limit){ - buf >>= log - k; - buf += (30-log)<= 32-MIN_CACHE_BITS+(MIN_CACHE_BITS==32) && 32-log < limit){ - buf >>= log - k; - buf += (30-log)<>1; - else return -(v>>1); - -// return (v>>1) ^ -(v&1); -} - -/** - * read signed golomb rice code (flac). - */ -static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit, int esc_len){ - int v= get_ur_golomb_jpegls(gb, k, limit, esc_len); - return (v>>1) ^ -(v&1); -} - -/** - * read unsigned golomb rice code (shorten). - */ -static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k){ - return get_ur_golomb_jpegls(gb, k, INT_MAX, 0); -} - -/** - * read signed golomb rice code (shorten). - */ -static inline int get_sr_golomb_shorten(GetBitContext* gb, int k) -{ - int uvar = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0); - if (uvar & 1) - return ~(uvar >> 1); - else - return uvar >> 1; -} - - - -#ifdef TRACE - -static inline int get_ue(GetBitContext *s, char *file, const char *func, int line){ - int show= show_bits(s, 24); - int pos= get_bits_count(s); - int i= get_ue_golomb(s); - int len= get_bits_count(s) - pos; - int bits= show>>(24-len); - - print_bin(bits, len); - - av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d ue @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line); - - return i; -} - -static inline int get_se(GetBitContext *s, char *file, const char *func, int line){ - int show= show_bits(s, 24); - int pos= get_bits_count(s); - int i= get_se_golomb(s); - int len= get_bits_count(s) - pos; - int bits= show>>(24-len); - - print_bin(bits, len); - - av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d se @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line); - - return i; -} - -static inline int get_te(GetBitContext *s, int r, char *file, const char *func, int line){ - int 
show= show_bits(s, 24); - int pos= get_bits_count(s); - int i= get_te0_golomb(s, r); - int len= get_bits_count(s) - pos; - int bits= show>>(24-len); - - print_bin(bits, len); - - av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d te @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line); - - return i; -} - -#define get_ue_golomb(a) get_ue(a, __FILE__, __PRETTY_FUNCTION__, __LINE__) -#define get_se_golomb(a) get_se(a, __FILE__, __PRETTY_FUNCTION__, __LINE__) -#define get_te_golomb(a, r) get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__) -#define get_te0_golomb(a, r) get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__) - -#endif - - -#endif /* AVCODEC_GOLOMB_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264.c --- a/ffmpeg_smp/h264dec/libavcodec/h264.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,215 +0,0 @@ -#include "config.h" -#include "h264.h" -#include "h264_misc.h" -#include - -H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int width, int height, h264_options *opts){ - int i; - const int mb_height = (height + 15) / 16; - const int mb_width = (width + 15) / 16; - const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16 - - ff_init_cabac_states(); - - H264Context *h= av_mallocz(sizeof(H264Context)); - - start_timer(h, TOTAL); - h->file_name = file_name; - h->profile = opts->profile; - for (i=0; itotal_time[i]=0; - - h->ifile=ifile; - h->ofile =ofile; - - h->verbose =opts->verbose; - h->no_mbd =opts->no_mbd; - h->static_3d =opts->static_3d; - h->pipe_bufs = opts->pipe_bufs; - h->slice_bufs = opts->slice_bufs; - - h->ed_ppe_threads =0; - if (opts->ppe_ed){ - h->ed_ppe_threads = (opts->threads >opts->ppe_ed)? 
opts->ppe_ed :opts->threads; - } - - h->threads = opts->threads - h->ed_ppe_threads; - h->smt = opts->smt; - if (h->smt){ - h->threads *= 2; - } - - h->num_frames = opts->numframes; - - h->frame_width = width; - h->frame_height = height; - - while ((width/2) %STRIDE_ALIGN) - width+=STRIDE_ALIGN; - h->width = width; - h->height = mb_height*16; - - h->mb_height = mb_height; - h->mb_width = mb_width; - h->mb_stride = mb_stride; - h->b4_stride = mb_width*4 + 1; - h->b_stride = mb_width*4; - - h->smb_width = opts->smb_size[0]; - h->smb_height = opts->smb_size[1] < h->smb_width ? opts->smb_size[1] : h->smb_width; - h->smbc = getSuperMBContext(h, h->smb_width, h->smb_height); - - h->wave_order = opts->wave_order; - - h->pipe_bufs = opts->pipe_bufs; - - h->max_dpb_cnt = DPB_SIZE + opts->pipe_bufs; - h->free_dpb_cnt = h->max_dpb_cnt; - h->dpb = av_mallocz (h->max_dpb_cnt* sizeof (DecodedPicture)); - - - h->free_sb_cnt = h->threads*opts->slice_bufs + (h->no_mbd != 0) ; //one extra to overlap some latency of signaling/freeing slicebuffers in entropy only mode - h->sb_size = h->free_sb_cnt; - h->sb = av_mallocz(h->sb_size* sizeof(SliceBufferEntry)); - - h->rl_q.size = FFMAX(1, FFMIN( (h->height-3 - 512)/16, h->mb_width/2)) +1; - h->rl_q.free = h->rl_q.size -1; - h->rl_q.ready=0; - h->rl_q.fi = h->rl_q.fo= 0; - h->rl_q.queue = av_malloc(h->rl_q.size* sizeof(RingLineEntry*)); - for (i=0; irl_q.size; i++){ - if( posix_memalign((void**)&h->rl_q.queue[i],64,sizeof(RingLineEntry))) - h->rl_q.queue[i]=NULL; - h->rl_q.queue[i]->top = av_malloc(h->mb_width*sizeof(TopBorder)); - } - - h->rl_q.queue[0]->prev_line = h->rl_q.queue[h->rl_q.size-1]; - for (i=1; irl_q.size; i++){ - h->rl_q.queue[i]->prev_line = h->rl_q.queue[i-1]; - } - - if( HAVE_MMX | HAVE_ALTIVEC| HAVE_NEON ){ - for(i=0; i<16; i++){ - #define T(x) (x>>2) | ((x<<2) & 0xF) - h->zigzag_scan[i] = T(zigzag_scan[i]); - #undef T - } - for(i=0; i<64; i++){ - #define T(x) (x>>3) | ((x&7)<<3) - h->zigzag_scan8x8[i] = 
T(ff_zigzag_direct[i]); - #undef T - } - }else{ - memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t)); - memcpy(h->zigzag_scan8x8, ff_zigzag_direct, 64*sizeof(uint8_t)); - } - - pthread_mutex_init(&h->smb_lock, NULL); - pthread_mutex_init(&h->sdl_lock, NULL); - pthread_cond_init(&h->sdl_cond, NULL); - - ///pthread initialization - pthread_mutex_init(&h->ilock, NULL); - pthread_cond_init(&h->icond, NULL); - pthread_mutex_init(&h->slock, NULL); - pthread_cond_init(&h->scond, NULL); - pthread_mutex_init(&h->tlock, NULL); - pthread_cond_init(&h->tcond, NULL); - pthread_mutex_init(&h->tdlock, NULL); - pthread_cond_init(&h->tdcond, NULL); - h->start =!opts->numamap; //default dont wait for start signal - h->statmbd = opts->statmbd; - h->rl_side_touch= opts->numamap; - h->touch_start=0; - h->setaff =opts->statsched; - h->init_threads=0; - - pthread_mutex_init(&h->task_lock, NULL); - pthread_cond_init(&h->task_cond, NULL); - for (i=0; ilock[i], NULL); - pthread_cond_init (&h->cond[i], NULL); - - pthread_mutex_init (&h->sb_q[i].lock, NULL); - pthread_cond_init (&h->sb_q[i].cond, NULL); - h->sb_q[i].size = h->free_sb_cnt; //change to num threads later - h->sb_q[i].queue = av_malloc(h->free_sb_cnt* sizeof(SliceBufferEntry*)); - h->sb_q[i].cnt = h->sb_q[i].fi = h->sb_q[i].fo =0; - } - -#if HAVE_LIBSDL2 - h->sdlq.size=2; - h->sdlq.ready=2; - h->sdlq.queue = av_malloc(2* sizeof(SDL_Texture*)); - pthread_mutex_init (&h->sdlq.sdl_lock, NULL); - pthread_cond_init (&h->sdlq.sdl_cond, NULL); -#endif - - h->display=opts->display; - h->fullscreen=opts->fullscreen; - - return h; -} - - -void free_h264dec_context(H264Context *h) { - int i; - - for(i=0; imax_dpb_cnt; i++) - free_dp(&h->dpb[i]); - av_free (h->dpb); - - for(i=0; isb_size; i++){ - if (h->sb[i].initialized){ - free_sb_entry(&h->sb[i]); - } - } - av_freep(&h->sb); - - for (i=0; irl_q.size; i++){ - av_freep(&h->rl_q.queue[i]->top); - av_freep(&h->rl_q.queue[i]); - } - av_freep(&h->rl_q.queue); - - ///pthread cleanup - 
pthread_mutex_destroy (&h->task_lock); - pthread_cond_destroy (&h->task_cond); - for (i=0; ilock[i]); - pthread_cond_destroy (&h->cond[i]); - - pthread_mutex_destroy (&h->sb_q[i].lock); - pthread_cond_destroy (&h->sb_q[i].cond); - av_freep( &h->sb_q[i].queue); - } - pthread_mutex_destroy (&h->slock); - pthread_cond_destroy (&h->scond); - pthread_mutex_destroy (&h->ilock); - pthread_cond_destroy (&h->icond); - - pthread_mutex_destroy(&h->smb_lock); - pthread_mutex_destroy (&h->sdl_lock); - pthread_cond_destroy (&h->sdl_cond); -#if HAVE_LIBSDL2 - av_free(h->sdlq.queue); - pthread_mutex_destroy (&h->sdlq.sdl_lock); - pthread_cond_destroy (&h->sdlq.sdl_cond); -#endif - - stop_timer(h, TOTAL); - if (h->threads==0){ - for (i=0; itotal_time[i] /= h->num_frames; - double others = h->total_time[TOTAL]; - for (i=1; itotal_time[i]; - if (h->profile == 1){ - printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [MBREC %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED], h->total_time[REC], others); - }else if (h->profile ==2){ - printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [PRED %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED],h->total_time[REC], others); - } - } - - av_free(h); -} \ No newline at end of file diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264.h --- a/ffmpeg_smp/h264dec/libavcodec/h264.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -/* -* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder -* Copyright (c) 2003 Michael Niedermayer -* -* This file is part of FFmpeg. -* -* FFmpeg is free software; you can redistribute it and/or -* modify it under the terms of the GNU Lesser General Public -* License as published by the Free Software Foundation; either -* version 2.1 of the License, or (at your option) any later version. 
-* -* FFmpeg is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -* Lesser General Public License for more details. -* -* You should have received a copy of the GNU Lesser General Public -* License along with FFmpeg; if not, write to the Free Software -* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -*/ - -/** -* @file -* H.264 / AVC / MPEG4 part10 codec. -* @author Michael Niedermayer -*/ - -#ifndef H264_H -#define H264_H - -#include "h264_entropy.h" -#include "h264_data.h" -#include "h264_mc.h" -#include "h264_misc.h" -#include "h264_dsp.h" -#include "h264_pred.h" -#include "h264_parser.h" -#include "h264_nal.h" -#include "h264_rec.h" -#include "h264_deblock.h" -#include "h264_types.h" - -typedef struct h264_options{ - int statsched; - int statmbd; - int numamap; - int no_mbd; - int numframes; - int display; - int fullscreen; - int verbose; - int ppe_ed; // only useful for Cell - int profile; - int threads; - int smb_size[2]; // only useful for OmpSs - int wave_order; - int static_3d; - int pipe_bufs; - int slice_bufs; - int smt; -}h264_options; - -int h264_decode_cell(H264Context *h); -int h264_decode_cell_seq(H264Context *h); - -int h264_decode_ompss(H264Context *h); - -int h264_decode_pthread(H264Context *h); -int h264_decode_seq(H264Context *h); - - -H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int frame_width, int frame_height, h264_options *opts); -void free_h264dec_context(H264Context *h); - - -#endif /* AVCODEC_H264_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_cell.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_cell.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1242 +0,0 @@ - -#include "h264_types.h" -#include "h264_parser.h" -#include "h264_nal.h" -#include "h264_entropy.h" -#include "h264_rec.h" 
-#include "h264_misc.h" -#include "cell/h264_types_spu.h" -#include "h264_pthread.h" - -#include -#include -#include - -#include -#include -#include -#include - -// spe global variables -unsigned rl_cnt_var, rl_mutex_var, rl_cond_var; -atomic_ea_t rl_cnt; -cond_ea_t rl_cond; -mutex_ea_t rl_lock; - -H264spe * spe_params; -unsigned mutex_var[16]; -unsigned cond_var[16]; -unsigned atomic_var[16]; - -pthread_t * spe_tid; -spe_context_ptr_t *spe_context; -void** spe_control_area; -void** spe_ls_area; -H264slice **spe_slice_buf; - -H264spe * spe_ed_params; -unsigned mutex_ed_var[16]; -unsigned cond_ed_var[16]; -unsigned atomic_ed_var[16]; - -pthread_t * spe_ed_tid; -spe_context_ptr_t *spe_ed_context; -void** spe_ed_control_area; -void** spe_ed_ls_area; -EDSlice_spu **spe_ed_slice_buf; - -//structs to propagate stop signal -MBSlice last_slice; -EDSlice last_ed_slice; -DecodedPicture last_pic; -RawFrame last_frm; - -static int direct_B_resolved(EDSlice *s, int *poc_list, int *poc_cnt){ - int i; - int cnt = *poc_cnt; - for(i=0; iref_list[1][0]->poc){ - *poc_cnt=i+1; - while(++i poc) { i++;} - if ( i< cnt) - memmove(&poc_list[i+1], &poc_list[i], (cnt-i)*sizeof(int)); - - poc_list[i]=poc; - (*poc_cnt)++; -} - -static void *spe_ed_thread(void *arg){ - H264spe *params = (H264spe *)arg; - unsigned int idx = params->idx; - unsigned int runflags = 0; - unsigned int entry = SPE_DEFAULT_ENTRY; - // run SPE context - spe_context_run(spe_ed_context[idx], &entry, runflags, (void*) params, NULL, NULL); - // done - now exit thread - pthread_exit(NULL); -} - -static void create_spe_ED_threads(H264Context *h, int ip_threads, int b_threads) { - int i; - int num_threads = ip_threads+b_threads; - spe_program_handle_t * spe_program = spe_image_open("spe_ed"); - // reserve memory for spe thread id, context and argument addresses - spe_ed_tid = av_malloc(num_threads * sizeof (pthread_t)); - spe_ed_context = av_malloc(num_threads * sizeof (spe_context_ptr_t)); - spe_ed_params = 
av_malloc(num_threads * sizeof (H264spe)); - spe_ed_control_area = av_malloc(num_threads * sizeof (void*)); - spe_ed_ls_area = av_malloc(num_threads * sizeof (void*)); - spe_ed_slice_buf = av_malloc(num_threads * sizeof (void*)); - - if (spe_program == NULL) - av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno)); - - for (i = 0; i < num_threads; i++) { - // create context for spe program - spe_ed_context[i] = spe_context_create(SPE_MAP_PS, NULL); - if (spe_ed_context[i] == NULL) - av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno)); - // load SPE program into main memory - if ((spe_program_load(spe_ed_context[i], spe_program)) == -1) - av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno)); - //get the control_area for fast mailboxing - if ((spe_ed_control_area[i] = spe_ps_area_get(spe_ed_context[i], SPE_CONTROL_AREA)) == NULL) - av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno)); - //get ls area for inter spe communication - if ((spe_ed_ls_area[i] = spe_ls_area_get(spe_ed_context[i])) == NULL) - av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. 
error=%s \n", errno, strerror(errno)); - } - - for (i = 0; i < ip_threads; i++) { - spe_ed_params[i].mb_width = h->mb_width; - spe_ed_params[i].mb_stride = h->mb_stride; - spe_ed_params[i].mb_height = h->mb_height; - spe_ed_params[i].type = EDIP; - spe_ed_params[i].spe_id = i; - spe_ed_params[i].idx = i; - //spe_ed_params[i].spe_total = ip_threads; //not used - //spe_params[i].slice_params= &slice_params; - spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads]; - spe_ed_params[i].tgt_spe = spe_ed_ls_area[(i+1)%num_threads]; - - spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i]; - spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i]; - spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0); - - mutex_init(spe_ed_params[i].lock); - cond_init(spe_ed_params[i].cond); - if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i])) - av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); - - //slicebufaddr - spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]); - av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); - } - for (int j = 0; j < b_threads; j++) { - i = j+ip_threads; - spe_ed_params[i].mb_width = h->mb_width; - spe_ed_params[i].mb_stride = h->mb_stride; - spe_ed_params[i].mb_height = h->mb_height; - spe_ed_params[i].type = EDB; - spe_ed_params[i].idx = i; - spe_ed_params[i].spe_id = j; - spe_ed_params[i].spe_total = b_threads; - //spe_params[i].slice_params= &slice_params; - //spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads]; - spe_ed_params[i].tgt_spe = spe_ed_ls_area[((j+1)%b_threads) + ip_threads]; - - spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i]; - spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i]; - spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0); - - 
mutex_init(spe_ed_params[i].lock); - cond_init(spe_ed_params[i].cond); - if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i])) - av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); - - //slicebufaddr - spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]); - av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); - } - spe_image_close(spe_program); - -} - -static void fill_EDSlice_spu(EDSlice_spu *dst, EDSlice *src){ - dst->pps = src->pps; - dst->mbs = src->mbs; - dst->state = src->state; - dst->qp_thresh = src->qp_thresh; - dst->pic = *src->current_picture; - - dst->ref_count[0] = src->ref_count[0]; - dst->ref_count[1] = src->ref_count[1]; - dst->slice_type = src->slice_type; - dst->slice_type_nos = src->slice_type_nos; - dst->direct_8x8_inference_flag = src->direct_8x8_inference_flag; - dst->list_count = src->list_count; - dst->coded_pic_num = src->coded_pic_num; - - GetBitContext *gb = &src->gb; - align_get_bits( gb); - dst->bytestream_start = gb->buffer + get_bits_count(gb)/8; - dst->byte_bufsize = (get_bits_left(gb) + 7)/8; - - dst->transform_bypass = src->transform_bypass; - dst->direct_spatial_mv_pred = src->direct_spatial_mv_pred; - memcpy(dst->map_col_to_list0, src->map_col_to_list0, 2*16*sizeof(int)); - memcpy(dst->dist_scale_factor, src->dist_scale_factor, 16*sizeof(int)); - dst->cabac_init_idc = src->cabac_init_idc; - memcpy(dst->ref2frm, src->ref2frm, 2*64*sizeof(int)); - dst->chroma_qp[0]= src->chroma_qp[0]; - dst->chroma_qp[1]= src->chroma_qp[1]; - dst->qscale = src->qscale; - dst->last_qscale_diff = src->last_qscale_diff; - - if (src->slice_type_nos == FF_B_TYPE) dst->list1 = *src->ref_list[1][0]; -} - -static void send_slice_to_spe_and_wait(EDSlice_spu *s, int id){ - unsigned status; - - spe_mfcio_get(spe_ed_context[id], (unsigned) spe_ed_slice_buf[id], s, sizeof(EDSlice_spu), 14, 0, 0); - spe_mfcio_tag_status_read(spe_ed_context[id], 1<<14, 
SPE_TAG_ALL, &status); - - - _spe_in_mbox_write(spe_ed_control_area[id], 0); - - while (!spe_out_mbox_status(spe_ed_context[id])){ - //pthread_yield(); - usleep(1000); - } - _spe_out_mbox_read(spe_ed_control_area[id]); -} - -static int decode_slice_entropy_cell(EntropyContext *ec, EDSlice *s, int id){ - int i,j; - - if( !s->pps.cabac ){ - av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); - return -1; - } - DECLARE_ALIGNED(16, EDSlice_spu, slice); - fill_EDSlice_spu(&slice, s); - - send_slice_to_spe_and_wait(&slice, id); - - return 0; -} - -static int decode_slice_entropy_cell_seq(H264Context *h, EntropyContext *ec, EDSlice *s){ - int i,j; - - if( !s->pps.cabac ){ - av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); - return -1; - } - DECLARE_ALIGNED(16, EDSlice_spu, slice); - fill_EDSlice_spu(&slice, s); - - send_slice_to_spe_and_wait(&slice, 0); - - if (s->release_cnt>0) { - for (int i=0; irelease_cnt; i++){ - release_pib_entry(h, s->release_ref[i], 2); - } - s->release_cnt=0; - } - - release_pib_entry(h, s->current_picture, 1); - av_freep(&s->gb.raw); - if (s->gb.rbsp) - av_freep(&s->gb.rbsp); - - return 0; -} - -static void *entr_IP_spe_thread(void *arg){ - EDThreadContext *eip = (EDThreadContext *) arg; - H264Context *h = eip->h; -// printf("eip %d, pid %d\n", eip->thread_num, syscall(SYS_gettid)); - for (int i=0; imbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb)); - } - - EntropyContext *ec = get_entropy_context(h); - EDSlice *s; - - for(;;){ - { - pthread_mutex_lock(&eip->ed_lock); - while (eip->ed_cnt <= 0) - pthread_cond_wait(&eip->ed_cond, &eip->ed_lock); - s = &eip->ed_q[eip->ed_fo]; - eip->ed_fo++; eip->ed_fo %= MAX_SLICE_COUNT; - pthread_mutex_unlock(&eip->ed_lock); - } - - if (s->state<0) - break; - { - pthread_mutex_lock(&eip->mbs_lock); - while (eip->mbs_cnt <= 0) - pthread_cond_wait(&eip->mbs_cond, &eip->mbs_lock); - - s->mbs = eip->mbs[eip->mbs_fo]; - s->ed = eip; - eip->mbs_cnt--; - eip->mbs_fo++; 
eip->mbs_fo%=SLICE_BUFS; - pthread_mutex_unlock(&eip->mbs_lock); - } - if (eip->cell){ - decode_slice_entropy_cell(ec, s, eip->thread_num); - }else{ - decode_slice_entropy(ec, s); - } - -// { -// pthread_mutex_lock(&h->lock[ENTROPY2]); -// h->ed_poc[h->ed_poc_fi++ % MAX_SLICE_COUNT] = s->current_picture->poc; -// while (h->ed_poc_fi > h->ed_poc_fo + MAX_SLICE_COUNT) -// h->ed_poc_fo++; -// -// pthread_cond_signal(&h->cond[ENTROPY2]); -// pthread_mutex_unlock(&h->lock[ENTROPY2]); -// } - - { - pthread_mutex_lock(&h->lock[ENTROPY4]); - while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) - pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); - h->ed_reorder_q[h->ed_reorder_fi] = *s; - h->ed_reorder_cnt++; - h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->cond[ENTROPY4]); - pthread_mutex_unlock(&h->lock[ENTROPY4]); - } - - { - pthread_mutex_lock(&eip->ed_lock); - eip->ed_cnt--; - pthread_cond_signal(&eip->ed_cond); - pthread_mutex_unlock(&eip->ed_lock); - } - } - - free_entropy_context(ec); - - pthread_exit(NULL); - return NULL; -} - -static void *entr_B_spe_thread(void *arg){ - EDThreadContext *eb = (EDThreadContext *) arg; - H264Context *h = eb->h; -// printf("eb %d, pid %d\n", eb->thread_num, syscall(SYS_gettid)); - for (int i=0; imbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb)); - } - - EntropyContext *ec = get_entropy_context(h); - EDSlice *s; - - for(;;){ - { - pthread_mutex_lock(&eb->ed_lock); - while (eb->ed_cnt <= 0) - pthread_cond_wait(&eb->ed_cond, &eb->ed_lock); - s = &eb->ed_q[eb->ed_fo]; - eb->ed_fo++; eb->ed_fo %= MAX_SLICE_COUNT; - pthread_mutex_unlock(&eb->ed_lock); - } - - if (s->state<0) - break; - { - pthread_mutex_lock(&eb->mbs_lock); - while (eb->mbs_cnt <= 0) - pthread_cond_wait(&eb->mbs_cond, &eb->mbs_lock); - s->mbs = eb->mbs[eb->mbs_fo]; - s->ed = eb; - eb->mbs_cnt--; - eb->mbs_fo++; eb->mbs_fo%=SLICE_BUFS; - pthread_mutex_unlock(&eb->mbs_lock); - } - //decode_B_slice_entropy(&hcabac, &cabac, s, eb, 
eb->prev_ed); - decode_slice_entropy_cell(ec, s, eb->thread_num + h->edip_threads); - - { - pthread_mutex_lock(&h->lock[ENTROPY4]); - while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) - pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); - h->ed_reorder_q[h->ed_reorder_fi] = *s; - h->ed_reorder_cnt++; - h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->cond[ENTROPY4]); - pthread_mutex_unlock(&h->lock[ENTROPY4]); - - } - - { - pthread_mutex_lock(&eb->ed_lock); - eb->ed_cnt--; - pthread_cond_signal(&eb->ed_cond); - pthread_mutex_unlock(&eb->ed_lock); - } - } - eb->lines_cnt++; - - free_entropy_context(ec); - - pthread_exit(NULL); - return NULL; -} - -static void *entr_B_distribute(void *arg){ - H264Context *h = (H264Context *) arg; - EDSlice *s; - - int i, n=0, poc; - -// printf("eb dist, pid %d\n", syscall(SYS_gettid)); - - for(i=0; iedb_threads; i++){ - h->b[i].h =h; - h->b[i].thread_num =i; - h->b[i].thread_total =h->edb_threads; - pthread_mutex_init(&h->b[i].mbs_lock, NULL); - pthread_cond_init(&h->b[i].mbs_cond, NULL); - h->b[i].mbs_fo = 0; - h->b[i].mbs_cnt = SLICE_BUFS; - h->b[i].ed_fi =0; - h->b[i].ed_fo =0; - h->b[i].ed_cnt =0; - h->b[i].lines_cnt =0; - h->b[i].prev_ed = &h->b[(i-1 +h->edb_threads) % h->edb_threads]; - pthread_mutex_init(&h->b[i].ed_lock, NULL); - pthread_cond_init(&h->b[i].ed_cond, NULL); - pthread_create(&h->ed_B_thr[i], NULL, entr_B_spe_thread, &h->b[i]); - } - - for(;;){ - { - pthread_mutex_lock(&h->lock[ENTROPY3B]); - while (h->ed_B_cnt<=0) - pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); - s= &h->ed_B_q[h->ed_B_fo]; - h->ed_B_fo++; h->ed_B_fo %= MAX_SLICE_COUNT; - pthread_mutex_unlock(&h->lock[ENTROPY3B]); - - } - if (s->state<0) - break; - - if (s->ref_list[1][0]->slice_type_nos != FF_B_TYPE){ - while (poc < s->ref_list[1][0]->poc){ - pthread_mutex_lock(&h->lock[ENTROPY2]); - while (poc == h->ed_poc) - pthread_cond_wait(&h->cond[ENTROPY2], &h->lock[ENTROPY2]); - poc = h->ed_poc; - 
pthread_mutex_unlock(&h->lock[ENTROPY2]); - } - } - { - pthread_mutex_lock(&h->b[n].ed_lock); - while (h->b[n].ed_cnt >= MAX_SLICE_COUNT) - pthread_cond_wait(&h->b[n].ed_cond, &h->b[n].ed_lock); - h->b[n].ed_q[ h->b[n].ed_fi] = *s; - h->b[n].ed_cnt++; - h->b[n].ed_fi++; h->b[n].ed_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->b[n].ed_cond); - pthread_mutex_unlock(&h->b[n].ed_lock); - - n++; n%=h->edb_threads; - } - { - pthread_mutex_lock(&h->lock[ENTROPY3B]); - h->ed_B_cnt--; - pthread_cond_signal(&h->cond[ENTROPY3B]); - pthread_mutex_unlock(&h->lock[ENTROPY3B]); - - } - - } - - for (i=0; iedb_threads; i++){ - pthread_mutex_lock(&h->b[i].ed_lock); - while (h->b[i].ed_cnt >= MAX_SLICE_COUNT) - pthread_cond_wait(&h->b[i].ed_cond, &h->b[i].ed_lock); - h->b[i].ed_q[ h->b[i].ed_fi] = *s; - h->b[i].ed_cnt++; - h->b[i].ed_fi++; h->b[i].ed_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->b[i].ed_cond); - pthread_mutex_unlock(&h->b[i].ed_lock); - - } - for(int i=0; iedb_threads; i++){ - pthread_join(h->ed_B_thr[i], NULL); - } - pthread_exit(NULL); - return NULL; -} - - -static void *entr_IPB_distribute(void *arg){ - H264Context *h = (H264Context *) arg; - EDSlice *s; - int i,n=0; - - create_spe_ED_threads(h, h->edip_threads, h->edb_threads); - pthread_create(&h->ed_B_dist, NULL, entr_B_distribute, h); - for(i=0; iedip_threads + h->edip_ppe_threads; i++){ - h->ip[i].h =h; - h->ip[i].cell = (i >= h->edip_ppe_threads); - pthread_mutex_init(&h->ip[i].mbs_lock, NULL); - pthread_cond_init(&h->ip[i].mbs_cond, NULL); - h->ip[i].thread_num = i - h->edip_ppe_threads; - h->ip[i].thread_total=h->edip_threads+ h->edip_ppe_threads; - h->ip[i].mbs_fo = 0; - h->ip[i].mbs_cnt = SLICE_BUFS; - h->ip[i].ed_fi =0; - h->ip[i].ed_fo =0; - pthread_mutex_init(&h->ip[i].ed_lock, NULL); - pthread_cond_init(&h->ip[i].ed_cond, NULL); - pthread_create(&h->ed_IP_thr[i], NULL, entr_IP_spe_thread, &h->ip[i]); - } - - for(;;){ - { - pthread_mutex_lock(&h->lock[ENTROPY]); - while (h->ed_cnt<=0) - 
pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]); - s= &h->ed_q[h->ed_fo]; - - pthread_mutex_unlock(&h->lock[ENTROPY]); - h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT; - } - if (s->state<0) - break; - - assert(s->current_picture); - if (s->slice_type_nos == FF_B_TYPE ) - { - pthread_mutex_lock(&h->lock[ENTROPY3B]); - while (h->ed_B_cnt>=MAX_SLICE_COUNT) - pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); - h->ed_B_q[h->ed_B_fi] = *s; - h->ed_B_cnt++; - h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->cond[ENTROPY3B]); - pthread_mutex_unlock(&h->lock[ENTROPY3B]); - }else - { - ///round robin now, change to based on rawframes size. - pthread_mutex_lock(&h->ip[n].ed_lock); - while (h->ip[n].ed_cnt >= MAX_SLICE_COUNT) - pthread_cond_wait(&h->ip[n].ed_cond, &h->ip[n].ed_lock); - h->ip[n].ed_q[ h->ip[n].ed_fi] = *s; - h->ip[n].ed_cnt++; - h->ip[n].ed_fi++; h->ip[n].ed_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->ip[n].ed_cond); - pthread_mutex_unlock(&h->ip[n].ed_lock); - - n++; n %=(h->edip_threads+h->edip_ppe_threads); - } - { - pthread_mutex_lock(&h->lock[ENTROPY]); - h->ed_cnt--; - pthread_cond_signal(&h->cond[ENTROPY]); - pthread_mutex_unlock(&h->lock[ENTROPY]); - - } - } - - { - pthread_mutex_lock(&h->lock[ENTROPY3B]); - while (h->ed_B_cnt>=MAX_SLICE_COUNT) - pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); - h->ed_B_q[h->ed_B_fi] = *s; - h->ed_B_cnt++; - h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->cond[ENTROPY3B]); - pthread_mutex_unlock(&h->lock[ENTROPY3B]); - } - { - for (i=0; iedip_threads + h->edip_ppe_threads; i++){ - pthread_mutex_lock(&h->ip[i].ed_lock); - while (h->ip[i].ed_cnt >= MAX_SLICE_COUNT) - pthread_cond_wait(&h->ip[i].ed_cond, &h->ip[i].ed_lock); - h->ip[i].ed_q[ h->ip[i].ed_fi] = *s; - h->ip[i].ed_cnt++; - h->ip[i].ed_fi++; h->ip[i].ed_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->ip[i].ed_cond); - pthread_mutex_unlock(&h->ip[i].ed_lock); - } - } - { - 
pthread_mutex_lock(&h->lock[ENTROPY4]); - while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) - pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); - h->ed_reorder_q[h->ed_reorder_fi] = *s; - h->ed_reorder_cnt++; - h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->cond[ENTROPY4]); - pthread_mutex_unlock(&h->lock[ENTROPY4]); - - } - pthread_join(h->ed_B_dist, NULL); - for(i=0; iedip_threads; i++){ - pthread_join(h->ed_IP_thr[i], NULL); - } - pthread_exit(NULL); - return NULL; -} - -static pthread_t ed_IPB_dist; -static void *entropy_IPB_cell_thread(void *arg){ - H264Context *h = (H264Context *) arg; - int i; - EDSlice reorder[MAX_SLICE_COUNT]; - int ip_poc[MAX_SLICE_COUNT][2]={0,}; - int next_ip_id=0; - int ip_poc_cnt=0; - EDSlice *s; - int reorder_cnt=0; - unsigned next_pic_num=0; - - pthread_create(&ed_IPB_dist, NULL, entr_IPB_distribute, h); - int count =0; - for(;;){ - //signals received from the entropy decoders - { - pthread_mutex_lock(&h->lock[ENTROPY4]); - while (h->ed_reorder_cnt<=0) - pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); - s= &h->ed_reorder_q[h->ed_reorder_fo]; - h->ed_reorder_fo++; h->ed_reorder_fo %=MAX_SLICE_COUNT; - pthread_mutex_unlock(&h->lock[ENTROPY4]); - } - - if (s->state >=0 && s->slice_type_nos != FF_B_TYPE){ - for (i=0; iip_id < ip_poc[i][0]){ - memmove(ip_poc[i+1], ip_poc[i], 2*(ip_poc_cnt-i)*sizeof(int)); - break; - } - } - ip_poc[i][0]= s->ip_id; - ip_poc[i][1]= s->current_picture->poc; - ip_poc_cnt++; - - while (next_ip_id == ip_poc[0][0]){ - pthread_mutex_lock(&h->lock[ENTROPY2]); - h->ed_poc = ip_poc[0][1]; - - pthread_cond_signal(&h->cond[ENTROPY2]); - pthread_mutex_unlock(&h->lock[ENTROPY2]); - memmove(ip_poc[0], ip_poc[1], 2*(ip_poc_cnt-1)*sizeof(int)); - ip_poc_cnt--; - next_ip_id++; - } - } - - for(i=reorder_cnt; i>0; i--){ - if (s->coded_pic_num < reorder[i-1].coded_pic_num) - break; - reorder[i]=reorder[i-1]; - } - reorder[i]=*s; - - while(reorder_cnt>=0){ - if 
(next_pic_num!=reorder[reorder_cnt].coded_pic_num){ - break; - } - EDSlice *es = &reorder[reorder_cnt]; - - { - pthread_mutex_lock(&h->lock[MBDEC]); - while (h->mbdec_cnt >= MAX_SLICE_COUNT) - pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); - copyEDtoMBSlice(&h->mbdec_q[h->mbdec_fi], es); - - h->mbdec_cnt++; - h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->cond[MBDEC]); - pthread_mutex_unlock(&h->lock[MBDEC]); - - } - - if (es->state<0) - goto end; - - assert(es->current_picture); - for (int i=0; irelease_cnt; i++){ - release_pib_entry(h, es->release_ref[i], 2); - } - release_pib_entry(h, es->current_picture, 1); - av_freep(&es->gb.raw); - if (es->gb.rbsp) - av_freep(&es->gb.rbsp); - - next_pic_num++; - reorder_cnt--; - } - reorder_cnt++; - - { - pthread_mutex_lock(&h->lock[ENTROPY4]); - h->ed_reorder_cnt--; - pthread_cond_signal(&h->cond[ENTROPY4]); - pthread_mutex_unlock(&h->lock[ENTROPY4]); - } - } - -end: - pthread_join(ed_IPB_dist, NULL); - pthread_exit(NULL); - return NULL; -} - - -static void fill_spe_slice(H264slice *dst, const MBSlice *src, H264Context *h){ - dst->deblocking_filter =1; - dst->linesize = src->current_picture->linesize[0]; - dst->uvlinesize = src->current_picture->linesize[1]; - dst->mb_width = h->mb_width; - dst->mb_height = h->mb_height; - dst->use_weight = src->use_weight; - dst->use_weight_chroma = src->use_weight_chroma; - dst->luma_log2_weight_denom = src->luma_log2_weight_denom; - dst->chroma_log2_weight_denom = src->chroma_log2_weight_denom; - - //weights later - memcpy(dst->luma_weight, src->luma_weight, 16*2*2*sizeof(int16_t)); - memcpy(dst->chroma_weight, src->chroma_weight, 16*2*2*2*sizeof(int16_t)); - memcpy(dst->implicit_weight, src->implicit_weight, 16*16*2*sizeof(int16_t)); - - for(int list=0; list<2; list++){ - for (int i=0; iref_count[list]; i++){ - Picture_spu *p_dst = &dst->ref_list[list][i]; - DecodedPicture *p_src = src->ref_list[list][i]; - if (p_src){ - p_dst->data[0] = p_src->data[0]; - 
p_dst->data[1] = p_src->data[1]; - p_dst->data[2] = p_src->data[2]; - } - } - } - dst->state = src->state; - - dst->emu_edge_width =32; - dst->emu_edge_height =32; - dst->slice_type = src->slice_type; - dst->slice_type_nos = src->slice_type_nos; - dst->slice_alpha_c0_offset = src->slice_alpha_c0_offset; - dst->slice_beta_offset = src->slice_beta_offset; - - memcpy(dst->chroma_qp_table, src->pps.chroma_qp_table, 2*64); - - dst->blocks = src->mbs; - dst->dst_y = src->current_picture->data[0]; - dst->dst_cb = src->current_picture->data[1]; - dst->dst_cr = src->current_picture->data[2]; -} - -static void decode_slice_mb_seq_cell(H264Context *h, MBRecContext *d, MBSlice *s, DecodedPicture *tmp){ - static int rl_fi=0; - - DECLARE_ALIGNED(16, H264slice, spe_slice); - H264spe *p=&spe_params[0]; - unsigned status; - uint8_t *dst_y, *dst_cb, *dst_cr; - - DecodedPicture *dp; - - for (int i=0; i<2; i++){ - for(int j=0; j< s->ref_count[i]; j++){ - if (s->ref_list_cpn[i][j] ==-1) - continue; - int k; - for (k=0; kdpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ - s->ref_list[i][j] = &h->dpb[k]; - break; - } - } - } - } - - dp = get_dpb_entry(h); - init_dpb_entry(dp, s, d->width, d->height); - - if (h->no_mbd) - return; - - - fill_spe_slice(&spe_slice, s, h); - spe_mfcio_get(spe_context[0], (unsigned) (spe_slice_buf[0] + rl_fi), &spe_slice, sizeof(H264slice), 15, 0, 0); - spe_mfcio_tag_status_read(spe_context[0], 1<<15, SPE_TAG_ALL, &status); - rl_fi++; rl_fi %= 2; - - _spe_in_mbox_write(spe_control_area[0], 0); - while (atomic_read(rl_cnt)<=0){ - //pthread_yield(); - usleep(1000); - } - atomic_dec(rl_cnt); - - -/** This is error free, no visual artifacts, however, md5sum fails.... 
(WTF) **/ -// memcpy(tmp->data[0], s->current_picture->data[0], tmp->linesize[0]*h->mb_height*16); -// memcpy(tmp->data[1], s->current_picture->data[1], tmp->linesize[1]*h->mb_height*8); -// memcpy(tmp->data[2], s->current_picture->data[2], tmp->linesize[1]*h->mb_height*8); -// -// memset(s->current_picture->data[0], 0, tmp->linesize[0]*h->mb_height*16); -// memset(s->current_picture->data[1], 0, tmp->linesize[1]*h->mb_height*8); -// memset(s->current_picture->data[2], 0, tmp->linesize[1]*h->mb_height*8); -// -// decode_slice_mb_seq(d, s); -// -// for (int i=0; imb_height*16; i++){ -// for (int j=0; jwidth; j++){ -// if (tmp->data[0][j + i*tmp->linesize[0]] != s->current_picture->data[0][j + i*tmp->linesize[0]]){ -// printf("%d, %d, %d, %d\n", j, i, tmp->data[0][j + i*tmp->linesize[0]], s->current_picture->data[0][j + i*tmp->linesize[0]]); -// return; -// } -// } -// } -// -// for (int i=0; imb_height*8; i++){ -// for (int j=0; jwidth/2; j++){ -// if (tmp->data[1][j + i*tmp->linesize[1]] != s->current_picture->data[1][j + i*tmp->linesize[1]]){ -// printf("%d, %d, %d, %d\n", j, i, tmp->data[1][j + i*tmp->linesize[1]], s->current_picture->data[1][j + i*tmp->linesize[1]]); -// return; -// } -// } -// } -// -// for (int i=0; imb_height*8; i++){ -// for (int j=0; jwidth/2; j++){ -// if (tmp->data[2][j + i*tmp->linesize[1]] != s->current_picture->data[2][j + i*tmp->linesize[1]]){ -// printf("%d, %d, %d, %d\n", j, i, tmp->data[2][j + i*tmp->linesize[1]], s->current_picture->data[2][j + i*tmp->linesize[1]]); -// return; -// } -// } -// } - - - //printf("dst_y %p\n", dst_y); - - - for (int i=0; irelease_cnt; i++){ - for(int j=0; jdpb[j].cpn== s->release_ref_cpn[i]){ - release_dpb_entry(h, &h->dpb[j], 2); - break; - } - } - } - s->release_cnt=0; - -} - -static void *h264_spe_thread(void * thread_args ) { - H264spe *params = (H264spe *)thread_args; - unsigned int spe_id = params->spe_id; - unsigned int runflags = 0; - unsigned int entry = SPE_DEFAULT_ENTRY; - // run SPE 
context - spe_context_run(spe_context[spe_id], &entry, runflags, (void*) params, NULL, NULL); - // done - now exit thread - pthread_exit(NULL); -} - -static int create_spe_MBR_threads(H264Context *h, int num_threads) { - int i; - - // reserve memory for spe thread id, context and argument addresses - spe_tid = av_malloc(num_threads * sizeof (pthread_t)); - spe_context = av_malloc(num_threads * sizeof (spe_context_ptr_t)); - spe_params = av_malloc(num_threads * sizeof (H264spe)); - spe_control_area = av_malloc(num_threads * sizeof (void*)); - spe_ls_area = av_malloc(num_threads * sizeof (void*)); - spe_slice_buf = av_malloc(num_threads * sizeof (void*)); - - spe_program_handle_t *spe_program = spe_image_open("spe_mbd"); - - if (spe_program == NULL) - av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno)); - - for (i = 0; i < num_threads; i++) { - // create context for spe program - spe_context[i] = spe_context_create(SPE_MAP_PS, NULL); - if (spe_context[i] == NULL) - av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno)); - // load SPE program into main memory - if ((spe_program_load(spe_context[i], spe_program)) == -1) - av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno)); - //get the control_area for fast mailboxing - if ((spe_control_area[i] = spe_ps_area_get(spe_context[i], SPE_CONTROL_AREA)) == NULL) - av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno)); - //get ls area for inter spe communication - if ((spe_ls_area[i] = spe_ls_area_get(spe_context[i])) == NULL) - av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. 
error=%s \n", errno, strerror(errno)); - } - - for (i = 0; i < num_threads; i++) { - spe_params[i].mb_width = h->mb_width; - spe_params[i].mb_height = h->mb_height; - spe_params[i].mb_stride = h->mb_stride; - spe_params[i].spe_id = i; - spe_params[i].spe_total = num_threads; - //spe_params[i].slice_params= &slice_params; - spe_params[i].src_spe = spe_ls_area[(i-1+num_threads)%num_threads]; - spe_params[i].tgt_spe = spe_ls_area[(i+1)%num_threads]; - - spe_params[i].rl_lock = rl_lock; - spe_params[i].rl_cond = rl_cond; - spe_params[i].rl_cnt = rl_cnt; - spe_params[i].lock = (mutex_ea_t) (unsigned) &mutex_var[i]; - spe_params[i].cond = (cond_ea_t) (unsigned) &cond_var[i]; - spe_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_var[i]; atomic_set(spe_params[i].cnt, 0); - - mutex_init(spe_params[i].lock); - cond_init(spe_params[i].cond); - if (pthread_create(&spe_tid[i], NULL, h264_spe_thread, (void *) &spe_params[i])) - av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); - - //slicebufaddr - spe_slice_buf[i] = (H264slice *) _spe_out_mbox_read(spe_control_area[i]); - - av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); - } - spe_image_close(spe_program); - return 0; -} - -//_spe_out_mbox_read(spe_control_area[i]); -/** -* joins all the spe worker threads. -*/ -static void join_spe_worker_threads(H264slice *s, int num_threads, int *rl_fi) { - int i; - ///just to keep coding consistency. 
- { - for (i=0; icnt)>=2) {//double buffered - usleep(1000);//cond_wait(p->cond, p->lock); - } - - spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), s, sizeof(H264slice), 15, 0, 0); - spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status); - //mutex_unlock(p->lock); - _spe_in_mbox_write(spe_control_area[i], 0); - } - } - - for (i=0; irl_threads); - for(;;){ - { - pthread_mutex_lock(&h->lock[MBDEC]); - while (h->mbdec_cnt<=0) - pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); - s= &h->mbdec_q[h->mbdec_fo]; - h->mbdec_fo++; h->mbdec_fo %= MAX_SLICE_COUNT; - pthread_mutex_unlock(&h->lock[MBDEC]); - } - - if (s->state<0){ - break; - } - for (int i=0; i<2; i++){ - for(int j=0; j< s->ref_count[i]; j++){ - if (s->ref_list_cpn[i][j] ==-1) - continue; - int k; - for (k=0; kdpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ - s->ref_list[i][j] = &h->dpb[k]; - break; - } - } - - } - } - dp = get_dpb_entry(h); - init_dpb_entry(dp, s, h->width, h->height); - assert(s->current_picture); - { - while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){ - usleep(1000); - } - h->mbrel_q[h->mbrel_fi] = *s; - - h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT; - } - { - if(h->no_mbd){ - atomic_inc(rl_cnt); - }else { - fill_spe_slice(&spe_slice, s, h); - for (i=0; irl_threads; i++){ - H264spe *p=&spe_params[i]; - unsigned status; - while (atomic_read(p->cnt)>=2){ //double buffered - usleep(1000); - //cond_wait(p->cond, p->lock); - } - spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), &spe_slice, sizeof(H264slice), 15, 0, 0); - spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status); - rl_fi[i]++; rl_fi[i] %= 2; - atomic_inc(p->cnt); - - _spe_in_mbox_write(spe_control_area[i], 0); - } - } - } - - { - pthread_mutex_lock(&h->lock[MBDEC]); - h->mbdec_cnt--; - pthread_cond_signal(&h->cond[MBDEC]); - pthread_mutex_unlock(&h->lock[MBDEC]); - } - - } - - { - while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){ - 
usleep(1000); - } - h->mbrel_q[h->mbrel_fi] = *s; - - h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT; - } - spe_slice.state=-1; - join_spe_worker_threads(&spe_slice, h->rl_threads, rl_fi); - pthread_exit(NULL); - return NULL; -} - -static void *mbdec_cell_thread(void *arg){ - H264Context *h = (H264Context *) arg; - - rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var; - rl_cond = (cond_ea_t) (unsigned) &rl_cond_var; - rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var; - atomic_set(rl_cnt, 0); - mutex_init(rl_lock); - cond_init(rl_cond); -// printf("mbdec, pid %d\n", syscall(SYS_gettid)); - pthread_create(&h->rl_dist_thr, NULL, rl_dist_thread, h); - - for(;;){ - MBSlice *s=NULL; - { - while (atomic_read(rl_cnt)<=0){ - usleep(1000); - } - s= &h->mbrel_q[h->mbrel_fo]; - h->mbrel_fo++; h->mbrel_fo %= MAX_SLICE_COUNT; - } - - if (s->state<0) - break; - - for (int i=0; irelease_cnt; i++){ - for(int j=0; jdpb[j].cpn== s->release_ref_cpn[i]){ - release_dpb_entry(h, &h->dpb[j], 2); - break; - } - } - } - - { - EDThreadContext *ed = s->ed; - pthread_mutex_lock(&ed->mbs_lock); - ed->mbs_cnt++; - pthread_cond_signal(&ed->mbs_cond); - pthread_mutex_unlock(&ed->mbs_lock); - } - - { - pthread_mutex_lock(&h->lock[WRITE]); - while (h->write_cnt>= DPB_SIZE) - pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]); - assert(s); - assert(s->current_picture); - h->write_q[h->write_fi]= s->current_picture; - h->write_cnt++; - h->write_fi++; h->write_fi %= DPB_SIZE; - pthread_cond_signal(&h->cond[WRITE]); - pthread_mutex_unlock(&h->lock[WRITE]); - - } - { - atomic_dec(rl_cnt); - } - - } - - {//propagate exit - pthread_mutex_lock(&h->lock[WRITE]); - while (h->write_cnt>= DPB_SIZE) - pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]); - last_pic.reference = -1; - h->write_q[h->write_fi] = &last_pic; - h->write_cnt++; - h->write_fi++; h->write_fi %= DPB_SIZE; - pthread_cond_signal(&h->cond[WRITE]); - pthread_mutex_unlock(&h->lock[WRITE]); - - } - pthread_join(h->rl_dist_thr, NULL); - 
pthread_exit(NULL); - return NULL; -} - -/* -* The following code is the main loop of the file converter -*/ -int h264_decode_cell(H264Context *h) { - - pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr; - - start_timer(); - - pthread_create(&read_thr, NULL, read_thread, h); - pthread_create(&parsenal_thr, NULL, parsenal_thread, h); - pthread_create(&entropy_thr, NULL, entropy_IPB_cell_thread, h); - pthread_create(&mbdec_thr, NULL, mbdec_cell_thread, h); - pthread_create(&write_thr, NULL, write_thread, h); - - pthread_join(read_thr, NULL); - pthread_join(parsenal_thr, NULL); - pthread_join(entropy_thr, NULL); - pthread_join(mbdec_thr, NULL); - pthread_join(write_thr, NULL); - - return 0; -} - -/* -* The following code is the main loop of the file converter -*/ -int h264_decode_cell_seq(H264Context *h) { -ParserContext *pc; - NalContext *nc; - EntropyContext *ec; - MBRecContext *rc; - OutputContext *oc; - - RawFrame frm; - EDSlice slice, *s=&slice; - MBSlice mbslice, *s2=&mbslice; - PictureInfo *pic=NULL; - DecodedPicture *out; - int size; - int frames=0; - - pc = get_parse_context(h->ifile); - nc = get_nal_context(h->width, h->height); - ec = get_entropy_context( h ); - rc = get_mbrec_context(h); - oc = get_output_context( h ); - - rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var; - rl_cond = (cond_ea_t) (unsigned) &rl_cond_var; - rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var; - atomic_set(rl_cnt, 0); - mutex_init(rl_lock); - cond_init(rl_cond); - - memset(s, 0, sizeof(EDSlice)); - ff_init_slice(nc, s); - s->mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb)); - - DecodedPicture tmp; - tmp.base[0]=0; - ///fix this when want to debug the Cell errors - //init_dpb_entry(&tmp, h->width, h->height); - - create_spe_ED_threads(h, 1, 0); - create_spe_MBR_threads(h, 1); - - start_timer(); - - while(!pc->final_frame && frames++ < h->num_frames){ - - av_read_frame_internal(pc, &frm); - - PictureInfo *pic=get_pib_entry(h); - ff_alloc_picture_info(nc, 
s, pic); - decode_nal_units(nc, s, &frm); - - copyEDtoMBSlice(s2, s); - decode_slice_entropy_cell_seq(h, ec, s); - - decode_slice_mb_seq_cell(h, rc, s2, &tmp); - - out =output_frame(h, oc, s2->current_picture, h->ofile, h->frame_width, h->frame_height); - - if (out){ - release_dpb_entry(h, out, 1); - } - print_report(oc->frame_number, oc->video_size, 0, h->verbose); - } - while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; - - print_report(oc->frame_number, oc->video_size, 1, h->verbose); - - /* finished ! */ - av_freep(&s->mbs); - - free_parse_context(pc); - free_nal_context (nc); - free_entropy_context(ec); - free_mbrec_context(rc); - free_output_context(oc); - return 0; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_data.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_data.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,243 +0,0 @@ -/* - * H26L/H264/AVC/JVT/14496-10/... encoder/decoder - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * @brief - * H264 / AVC / MPEG4 part10 codec data table - * @author Michael Niedermayer - */ - -#ifndef AVCODEC_H264DATA_H -#define AVCODEC_H264DATA_H - -#include -#include "avcodec.h" -//#include "h264.h" - -/* -o-o o-o - / / / -o-o o-o - ,---' -o-o o-o - / / / -o-o o-o -*/ -//This table must be here because scan8[constant] must be known at compiletime -static const uint8_t scan8[16 + 2*4]={ - 4+1*8, 5+1*8, 4+2*8, 5+2*8, - 6+1*8, 7+1*8, 6+2*8, 7+2*8, - 4+3*8, 5+3*8, 4+4*8, 5+4*8, - 6+3*8, 7+3*8, 6+4*8, 7+4*8, - 1+1*8, 2+1*8, - 1+2*8, 2+2*8, - 1+4*8, 2+4*8, - 1+5*8, 2+5*8, -}; - -static const uint8_t golomb_to_pict_type[5]= -{FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE}; - -static const uint8_t golomb_to_intra4x4_cbp[48]={ - 47, 31, 15, 0, 23, 27, 29, 30, 7, 11, 13, 14, 39, 43, 45, 46, - 16, 3, 5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44, 1, 2, 4, - 8, 17, 18, 20, 24, 6, 9, 22, 25, 32, 33, 34, 36, 40, 38, 41 -}; - -static const uint8_t golomb_to_inter_cbp[48]={ - 0, 16, 1, 2, 4, 8, 32, 3, 5, 10, 12, 15, 47, 7, 11, 13, - 14, 6, 9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46, - 17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41 -}; - -static const uint8_t zigzag_scan[16]={ - 0+0*4, 1+0*4, 0+1*4, 0+2*4, - 1+1*4, 2+0*4, 3+0*4, 2+1*4, - 1+2*4, 0+3*4, 1+3*4, 2+2*4, - 3+1*4, 3+2*4, 2+3*4, 3+3*4, -}; - -static const uint8_t field_scan[16]={ - 0+0*4, 0+1*4, 1+0*4, 0+2*4, - 0+3*4, 1+1*4, 1+2*4, 1+3*4, - 2+0*4, 2+1*4, 2+2*4, 2+3*4, - 3+0*4, 3+1*4, 3+2*4, 3+3*4, -}; - -static const uint8_t luma_dc_zigzag_scan[16]={ - 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64, - 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64, - 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64, - 3*16 + 1*64, 1*16 + 3*64, 2*16 + 
3*64, 3*16 + 3*64, -}; - -static const uint8_t luma_dc_field_scan[16]={ - 0*16 + 0*64, 2*16 + 0*64, 1*16 + 0*64, 0*16 + 2*64, - 2*16 + 2*64, 3*16 + 0*64, 1*16 + 2*64, 3*16 + 2*64, - 0*16 + 1*64, 2*16 + 1*64, 0*16 + 3*64, 2*16 + 3*64, - 1*16 + 1*64, 3*16 + 1*64, 1*16 + 3*64, 3*16 + 3*64, -}; - -static const uint8_t chroma_dc_scan[4]={ - (0+0*2)*16, (1+0*2)*16, - (0+1*2)*16, (1+1*2)*16, //FIXME -}; - - -static const uint8_t field_scan8x8[64]={ - 0+0*8, 0+1*8, 0+2*8, 1+0*8, - 1+1*8, 0+3*8, 0+4*8, 1+2*8, - 2+0*8, 1+3*8, 0+5*8, 0+6*8, - 0+7*8, 1+4*8, 2+1*8, 3+0*8, - 2+2*8, 1+5*8, 1+6*8, 1+7*8, - 2+3*8, 3+1*8, 4+0*8, 3+2*8, - 2+4*8, 2+5*8, 2+6*8, 2+7*8, - 3+3*8, 4+1*8, 5+0*8, 4+2*8, - 3+4*8, 3+5*8, 3+6*8, 3+7*8, - 4+3*8, 5+1*8, 6+0*8, 5+2*8, - 4+4*8, 4+5*8, 4+6*8, 4+7*8, - 5+3*8, 6+1*8, 6+2*8, 5+4*8, - 5+5*8, 5+6*8, 5+7*8, 6+3*8, - 7+0*8, 7+1*8, 6+4*8, 6+5*8, - 6+6*8, 6+7*8, 7+2*8, 7+3*8, - 7+4*8, 7+5*8, 7+6*8, 7+7*8, -}; - -typedef struct IMbInfo{ - uint16_t type; - uint8_t pred_mode; - uint8_t cbp; -} IMbInfo; - -static const IMbInfo i_mb_type_info[26]={ -{MB_TYPE_INTRA4x4 , -1, -1}, -{MB_TYPE_INTRA16x16, 2, 0}, -{MB_TYPE_INTRA16x16, 1, 0}, -{MB_TYPE_INTRA16x16, 0, 0}, -{MB_TYPE_INTRA16x16, 3, 0}, -{MB_TYPE_INTRA16x16, 2, 16}, -{MB_TYPE_INTRA16x16, 1, 16}, -{MB_TYPE_INTRA16x16, 0, 16}, -{MB_TYPE_INTRA16x16, 3, 16}, -{MB_TYPE_INTRA16x16, 2, 32}, -{MB_TYPE_INTRA16x16, 1, 32}, -{MB_TYPE_INTRA16x16, 0, 32}, -{MB_TYPE_INTRA16x16, 3, 32}, -{MB_TYPE_INTRA16x16, 2, 15+0}, -{MB_TYPE_INTRA16x16, 1, 15+0}, -{MB_TYPE_INTRA16x16, 0, 15+0}, -{MB_TYPE_INTRA16x16, 3, 15+0}, -{MB_TYPE_INTRA16x16, 2, 15+16}, -{MB_TYPE_INTRA16x16, 1, 15+16}, -{MB_TYPE_INTRA16x16, 0, 15+16}, -{MB_TYPE_INTRA16x16, 3, 15+16}, -{MB_TYPE_INTRA16x16, 2, 15+32}, -{MB_TYPE_INTRA16x16, 1, 15+32}, -{MB_TYPE_INTRA16x16, 0, 15+32}, -{MB_TYPE_INTRA16x16, 3, 15+32}, -{MB_TYPE_INTRA_PCM , -1, -1}, -}; - -typedef struct PMbInfo{ - uint16_t type; - uint8_t partition_count; -} PMbInfo; - -static const PMbInfo 
p_mb_type_info[5]={ -{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, -{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, -{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, -{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 4}, -{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4}, -}; - -static const PMbInfo p_sub_mb_type_info[4]={ -{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, -{MB_TYPE_16x8 |MB_TYPE_P0L0 , 2}, -{MB_TYPE_8x16 |MB_TYPE_P0L0 , 2}, -{MB_TYPE_8x8 |MB_TYPE_P0L0 , 4}, -}; - -static const PMbInfo b_mb_type_info[23]={ -{MB_TYPE_DIRECT2|MB_TYPE_L0L1 , 1, }, -{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, -{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, -{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, -}; - -static const PMbInfo b_sub_mb_type_info[13]={ -{MB_TYPE_DIRECT2 , 1, }, -{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, -{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, -{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, 
-{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, -{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, -{MB_TYPE_8x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 4, }, -{MB_TYPE_8x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 4, }, -{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, -}; - -static const uint8_t dequant4_coeff_init[6][3]={ - {10,13,16}, - {11,14,18}, - {13,16,20}, - {14,18,23}, - {16,20,25}, - {18,23,29}, -}; - -static const uint8_t dequant8_coeff_init_scan[16] = { - 0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1 -}; -static const uint8_t dequant8_coeff_init[6][6]={ - {20,18,32,19,25,24}, - {22,19,35,21,28,26}, - {26,23,42,24,33,31}, - {28,25,45,26,35,33}, - {32,28,51,30,40,38}, - {36,32,58,34,46,43}, -}; - -#endif /* AVCODEC_H264DATA_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_deblock.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_deblock.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,507 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... loop filter - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 loop filter. - * @author Michael Niedermayer - */ - -#include "dsputil.h" -#include "mathops.h" -#include "rectangle.h" -#include "h264_types.h" -#include "h264_misc.h" -#include "h264_data.h" -//#undef NDEBUG -#include - -/* Deblocking filter (p153) */ -static const uint8_t alpha_table[52*3] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 4, 4, 5, 6, - 7, 8, 9, 10, 12, 13, 15, 17, 20, 22, - 25, 28, 32, 36, 40, 45, 50, 56, 63, 71, - 80, 90,101,113,127,144,162,182,203,226, - 255,255, - 255,255,255,255,255,255,255,255,255,255,255,255,255, - 255,255,255,255,255,255,255,255,255,255,255,255,255, - 255,255,255,255,255,255,255,255,255,255,255,255,255, - 255,255,255,255,255,255,255,255,255,255,255,255,255, -}; -static const uint8_t beta_table[52*3] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, - 3, 3, 3, 4, 4, 4, 6, 6, 7, 7, - 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, - 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, - 18, 18, - 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, - 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, - 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, - 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, -}; -static const uint8_t tc0_table[52*3][4] = { - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 
0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, - {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 }, - {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 }, - {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, - {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 }, - {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 }, - {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 }, - {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 
{-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, - {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, -}; - -av_always_inline static void filter_mb_edgev( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s) { - const unsigned int index_a = qp + s->slice_alpha_c0_offset; - const int alpha = alpha_table[index_a]; - const int beta = beta_table[qp + s->slice_beta_offset]; - if (alpha ==0 || beta == 0) return; - - if( bS[0] < 4 ) { - int8_t tc[4]; - tc[0] = tc0_table[index_a][bS[0]]; - tc[1] = tc0_table[index_a][bS[1]]; - tc[2] = tc0_table[index_a][bS[2]]; - tc[3] = tc0_table[index_a][bS[3]]; - mrc->hdsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc); - } else { - mrc->hdsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta); - } -} - -av_always_inline static void filter_mb_edgecv( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) { - const unsigned int index_a = qp + s->slice_alpha_c0_offset; - const int alpha = alpha_table[index_a]; - const int beta = beta_table[qp + s->slice_beta_offset]; - if (alpha ==0 || beta == 0) return; - - if( bS[0] < 4 ) { - int8_t tc[4]; - tc[0] = tc0_table[index_a][bS[0]]+1; - tc[1] = tc0_table[index_a][bS[1]]+1; - tc[2] = tc0_table[index_a][bS[2]]+1; - tc[3] = tc0_table[index_a][bS[3]]+1; - mrc->hdsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc); - } else { - mrc->hdsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta); - } -} - - -av_always_inline static void filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) { - const unsigned int index_a = qp + s->slice_alpha_c0_offset; - const int alpha = 
alpha_table[index_a]; - const int beta = beta_table[qp + s->slice_beta_offset]; - if (alpha ==0 || beta == 0) return; - - if( bS[0] < 4 ) { - int8_t tc[4]; - tc[0] = tc0_table[index_a][bS[0]]; - tc[1] = tc0_table[index_a][bS[1]]; - tc[2] = tc0_table[index_a][bS[2]]; - tc[3] = tc0_table[index_a][bS[3]]; - mrc->hdsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc); - } else { - mrc->hdsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta); - } -} - -av_always_inline static void filter_mb_edgech( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) { - const unsigned int index_a = qp + s->slice_alpha_c0_offset; - const int alpha = alpha_table[index_a]; - const int beta = beta_table[qp + s->slice_beta_offset]; - if (alpha ==0 || beta == 0) return; - - if( bS[0] < 4 ) { - int8_t tc[4]; - tc[0] = tc0_table[index_a][bS[0]]+1; - tc[1] = tc0_table[index_a][bS[1]]+1; - tc[2] = tc0_table[index_a][bS[2]]+1; - tc[3] = tc0_table[index_a][bS[3]]+1; - mrc->hdsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc); - } else { - mrc->hdsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta); - } -} - -static av_always_inline void filter_mb_dir(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, int dir) { - const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type; - const int qp_xy= m->qscale_mb_xy; - const int qp_dir = dir == 0 ? m->qscale_left_mb_xy : m->qscale_top_mb_xy; - const int linesize = mrc->linesize; - const int uvlinesize = mrc->uvlinesize; - const int mb_type = m->mb_type; - int edge; - const int edges = mrs->edges[dir]; - - if(mbm_type){ - int16_t* bS=mrs->bS[dir][0]; - /* Filter edge */ - // Do not use s->qscale as luma quantizer because it has not the same - // value in IPCM macroblocks. 
- if(bS[0]+bS[1]+bS[2]+bS[3]){ - int qp = ( qp_xy + qp_dir + 1 ) >> 1; - if( dir == 0 ) { - filter_mb_edgev( &img_y[0], linesize, bS, qp, mrc, s ); - { - int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; - filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp, mrc, s); - filter_mb_edgecv( &img_cr[0], uvlinesize, bS, qp, mrc, s); - } - } else { - filter_mb_edgeh( &img_y[0], linesize, bS, qp, mrc, s ); - { - int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; - filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp, mrc, s); - filter_mb_edgech( &img_cr[0], uvlinesize, bS, qp, mrc, s); - } - } - } - } - - for( edge = 1; edge < edges; edge++ ) { - int16_t* bS=mrs->bS[dir][edge]; - int qp = qp_xy; - - if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) - continue; - - if(bS[0]+bS[1]+bS[2]+bS[3] == 0) - continue; - - /* Filter edge */ - // Do not use s->qscale as luma quantizer because it has not the same - // value in IPCM macroblocks. - - if( dir == 0 ) { - filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, mrc, s); - if( (edge&1) == 0 ) { - filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s); - filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s); - } - } else { - filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, mrc, s ); - if( (edge&1) == 0 ) { - filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s); - filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s); - } - } - } -} - -static int check_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, long b_idx, long bn_idx, int mvy_limit){ - int v; - v= mrs->ref_cache[0][b_idx] != mrs->ref_cache[0][bn_idx]; - if(!v && mrs->ref_cache[0][b_idx]!=-1) - // absolute value >= 7 | ... 
- v= ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) | - ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit); - - if(s->list_count==2){ - if(!v) - v = (mrs->ref_cache[1][b_idx] != mrs->ref_cache[1][bn_idx]) | - ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) | - ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit); - - if(v){ - if((mrs->ref_cache[0][b_idx] != mrs->ref_cache[1][bn_idx]) | - (mrs->ref_cache[1][b_idx] != mrs->ref_cache[0][bn_idx])) - return 1; - return - ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) | - ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit) | - ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) | - ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit); - } - } - - return v; -} - -static void calc_bS_values(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mvy_limit, int dir) { - int mb_type = m->mb_type; - int edge; - const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type; - - // how often to recheck mv-based bS when iterating between edges - static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1}, - {0,3,1,1,3,3,3,3}}; - const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7]; - const int edges = mask_edge== 3 && !(m->cbp&15) ? 1 : 4; - // how often to recheck mv-based bS when iterating along each edge - const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); - - mrs->edges[dir]= edges; - - if(mbm_type){ - int16_t* bS=mrs->bS[dir][0]; - if( IS_INTRA(mb_type|mbm_type)) { - AV_WN64A(bS, 0x0004000400040004ULL); - } else { - int i; - int mv_done; - if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { - int b_idx= 8 + 4; - int bn_idx= b_idx - (dir ? 
8:1); - - bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, 8 + 4, bn_idx, mvy_limit); - mv_done = 1; - } - else - mv_done = 0; - - for( i = 0; i < 4; i++ ) { - int x = dir == 0 ? 0 : i; - int y = dir == 0 ? i : 0; - int b_idx= 8 + 4 + x + 8*y; - int bn_idx= b_idx - (dir ? 8:1); - - if( mrs->non_zero_count_cache[b_idx] | - mrs->non_zero_count_cache[bn_idx] ) { - bS[i] = 2; - } - else if(!mv_done) - { - bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit); - } - } - } - } - - /* Calculate bS */ - for( edge = 1; edge < edges; edge++ ) { - int16_t* bS=mrs->bS[dir][edge]; - - if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) - continue; - - if( IS_INTRA(mb_type)) { - AV_WN64A(bS, 0x0003000300030003ULL); - } else { - int i; - int mv_done; - - if( edge & mask_edge ) { - AV_ZERO64(bS); - mv_done = 1; - } - else if( mask_par0 ) { - int b_idx= 8 + 4 + edge * (dir ? 8:1); - int bn_idx= b_idx - (dir ? 8:1); - - bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit); - mv_done = 1; - } - else - mv_done = 0; - - for( i = 0; i < 4; i++ ) { - int x = dir == 0 ? edge : i; - int y = dir == 0 ? i : edge; - int b_idx= 8 + 4 + x + 8*y; - int bn_idx= b_idx - (dir ? 
8:1); - - if( mrs->non_zero_count_cache[b_idx] | - mrs->non_zero_count_cache[bn_idx] ) { - bS[i] = 2; - } - else if(!mv_done) - { - bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit); - } - } - - if(bS[0]+bS[1]+bS[2]+bS[3] == 0) - continue; - } - - } -} - - -/** -* -* @return zero if the loop filter can be skiped -*/ -static int fill_filter_caches(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){ - H264Mb *m_top = m - mrc->mb_width; - H264Mb *m_left = m - 1; - const int mb_x = m->mb_x; - const int mb_y = m->mb_y; - int top_type, left_type; - int qp, top_qp, left_qp; - int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice - - qp = m->qscale_mb_xy ; - left_qp = m->qscale_left_mb_xy ; - top_qp = m->qscale_top_mb_xy ; - - //for sufficiently low qp, filtering wouldn't do anything - //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp - if(qp <= qp_thresh - && (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh) - && ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){ - return 0; - } - - if(IS_INTRA(mb_type)){ - return 1; - } - - { - int list; - for(list=0; listlist_count; list++){ - int8_t *ref; - - if(!USES_LIST(mb_type, list)){ - fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); - fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); - AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u); - AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u); - AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u); - AV_WN32A(&mrs->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u); - continue; - } - - ref = &mrs->ref_index[list][4*mb_x]; - { - int (*ref2frm)[64] =(void *) (s->ref2frm[0] + 2); - AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); - 
AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); - ref += 2; - - AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); - AV_WN32A(&mrs->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); - } - } - } - - /* - 0 . T T. T T T T - 1 L . .L . . . . - 2 L . .L . . . . - 3 . T TL . . . . - 4 L . .L . . . . - 5 L . .. . . . . - */ - - if (IS_SKIP(mb_type)){ - memset(mrs->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui - } - - //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) - top_type = mrs->top_type; - left_type = mrs->left_type; - if(top_type){ - AV_COPY32(&mrs->non_zero_count_cache[4+8*0], &m_top->non_zero_count[3*4]); - } - - if(left_type){ - mrs->non_zero_count_cache[3+8*1]= m_left->non_zero_count[3+0*4]; - mrs->non_zero_count_cache[3+8*2]= m_left->non_zero_count[3+1*4]; - mrs->non_zero_count_cache[3+8*3]= m_left->non_zero_count[3+2*4]; - mrs->non_zero_count_cache[3+8*4]= m_left->non_zero_count[3+3*4]; - } - - if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){ - int list; - for(list=0; listlist_count; list++){ - if(USES_LIST(top_type, list)){ - const int b_xy= 4*mb_x + 3*mrc->b_stride; - const int b8_x= 4*mb_x + 2; - int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); - AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]); - - mrs->ref_cache[list][scan8[0] + 0 - 1*8]= - mrs->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 0]]; - mrs->ref_cache[list][scan8[0] + 2 - 1*8]= - mrs->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 1]]; - }else{ - AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]); - AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u); - } - - if(USES_LIST(left_type, list)){ 
- const int b_x = 4*(mb_x-1) + 3; - const int b8_x= 4*(mb_x-1) + 1; - int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); - AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 0 ], mrs->motion_val[list][b_x + mrc->b_stride*0]); - AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 8 ], mrs->motion_val[list][b_x + mrc->b_stride*1]); - AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +16 ], mrs->motion_val[list][b_x + mrc->b_stride*2]); - AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +24 ], mrs->motion_val[list][b_x + mrc->b_stride*3]); - - mrs->ref_cache[list][scan8[0] - 1 + 0 ]= - mrs->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*0]]; - mrs->ref_cache[list][scan8[0] - 1 +16 ]= - mrs->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*1]]; - - }else{ - AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 0 ]); - AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 8 ]); - AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +16 ]); - AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +24 ]); - - mrs->ref_cache[list][scan8[0] - 1 + 0 ]= - mrs->ref_cache[list][scan8[0] - 1 + 8 ]= - mrs->ref_cache[list][scan8[0] - 1 + 16 ]= - mrs->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED; - } - } - } - return 1; -} - -void ff_h264_filter_mb(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr) { - if (fill_filter_caches(mrc, mrs, s, m, m->mb_type)){ - calc_bS_values(mrc, mrs, s, m, 4, 0); - calc_bS_values(mrc, mrs, s, m, 4, 1); - filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 0); - filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 1); - } -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_deblock.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_deblock.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ -#ifndef H264_LOOPFILTER_H -#define H264_LOOPFILTER_H - -#include "h264_types.h" - -void ff_h264_filter_mb(MBRecContext *d, MBRecState *mrs, 
H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_dsp.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_dsp.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,320 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder - * Copyright (c) 2003-2010 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 DSP functions. 
- * @author Michael Niedermayer - */ - -#include -#include "avcodec.h" -#include "h264_dsp.h" - -#define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom ) -#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) -#define H264_WEIGHT(W,H) \ -static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ - int y; \ - offset <<= log2_denom; \ - if(log2_denom) offset += 1<<(log2_denom-1); \ - for(y=0; y> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); - tc++; - } - if( FFABS( q2 - q0 ) < beta ) { - if(tc0[i]) - pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); - tc++; - } - - i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */ - pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ - } - pix += ystride; - } - } -} -static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0); -} -static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0); -} - -static av_always_inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) -{ - int d; - for( d = 0; d < 16; d++ ) { - const int p2 = pix[-3*xstride]; - const int p1 = pix[-2*xstride]; - const int p0 = pix[-1*xstride]; - - const int q0 = pix[ 0*xstride]; - const int q1 = pix[ 1*xstride]; - const int q2 = pix[ 2*xstride]; - - if( FFABS( p0 - q0 ) < alpha && - FFABS( p1 - p0 ) < beta && - FFABS( q1 - q0 ) < beta ) { - - if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ - if( FFABS( p2 - p0 ) < beta) - { - const int p3 = pix[-4*xstride]; - /* p0', p1', p2' */ - pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; - pix[-2*xstride] = ( p2 
+ p1 + p0 + q0 + 2 ) >> 2; - pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; - } else { - /* p0' */ - pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; - } - if( FFABS( q2 - q0 ) < beta) - { - const int q3 = pix[3*xstride]; - /* q0', q1', q2' */ - pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; - pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; - pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; - } else { - /* q0' */ - pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; - } - }else{ - /* p0', q0' */ - pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; - pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; - } - } - pix += ystride; - } -} -static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) -{ - h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta); -} -static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) -{ - h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta); -} - -static av_always_inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) -{ - int i, d; - for( i = 0; i < 4; i++ ) { - const int tc = tc0[i]; - if( tc <= 0 ) { - pix += 2*ystride; - continue; - } - for( d = 0; d < 2; d++ ) { - const int p0 = pix[-1*xstride]; - const int p1 = pix[-2*xstride]; - const int q0 = pix[0]; - const int q1 = pix[1*xstride]; - - if( FFABS( p0 - q0 ) < alpha && - FFABS( p1 - p0 ) < beta && - FFABS( q1 - q0 ) < beta ) { - - int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); - - pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */ - pix[0] = av_clip_uint8( q0 - delta ); /* q0' */ - } - pix += ystride; - } - } -} -static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0); -} -static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - 
h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0); -} - -static av_always_inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) -{ - int d; - for( d = 0; d < 8; d++ ) { - const int p0 = pix[-1*xstride]; - const int p1 = pix[-2*xstride]; - const int q0 = pix[0]; - const int q1 = pix[1*xstride]; - - if( FFABS( p0 - q0 ) < alpha && - FFABS( p1 - p0 ) < beta && - FFABS( q1 - q0 ) < beta ) { - - pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ - pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ - } - pix += ystride; - } -} -static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) -{ - h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta); -} -static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) -{ - h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta); -} - -void ff_h264dsp_init(H264DSPContext *c) -{ - c->h264_idct_add= ff_h264_idct_add_c; - c->h264_idct8_add= ff_h264_idct8_add_c; - c->h264_idct_dc_add= ff_h264_idct_dc_add_c; - c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c; - c->h264_idct_add16 = ff_h264_idct_add16_c; - c->h264_idct8_add4 = ff_h264_idct8_add4_c; - c->h264_idct_add8 = ff_h264_idct_add8_c; - c->h264_idct_add16intra= ff_h264_idct_add16intra_c; - - c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; - c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; - c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c; - c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c; - c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c; - c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c; - c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c; - c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c; - c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c; - c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c; - c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; - c->biweight_h264_pixels_tab[1]= 
biweight_h264_pixels16x8_c; - c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c; - c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c; - c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c; - c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c; - c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c; - c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c; - c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c; - c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c; - - c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; - c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c; - c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c; - c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c; - c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c; - c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; - c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; - c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; - c->h264_loop_filter_strength= NULL; - - if (ARCH_ARM) ff_h264dsp_init_arm(c); - if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c); - if (HAVE_MMX) ff_h264dsp_init_x86(c); -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_dsp.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_dsp.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2003-2010 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 DSP functions. - * @author Michael Niedermayer - */ - -#ifndef AVCODEC_H264DSP_H -#define AVCODEC_H264DSP_H - -#include -#include "dsputil.h" - -//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); -typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); -typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); - -/** - * Context for storing H.264 DSP functions - */ -typedef struct H264DSPContext{ - /* weighted MC */ - h264_weight_func weight_h264_pixels_tab[10]; - h264_biweight_func biweight_h264_pixels_tab[10]; - - /* loop filter */ - void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); - void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0); - /* v/h_loop_filter_luma_intra: align 16 */ - void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); - void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); - void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0); - void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0); - void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); - void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); - // h264_loop_filter_strength: simd only. 
the C version is inlined in h264.c - void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], - int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field); - - /* IDCT */ - /* NOTE!!! if you implement any of h264_idct8_add, h264_idct8_add4 then you must implement all of them - NOTE!!! if you implement any of h264_idct_add, h264_idct_add16, h264_idct_add16intra, h264_idct_add8 then you must implement all of them - The reason for above, is that no 2 out of one list may use a different permutation. - */ - void (*h264_idct_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride); - void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); - void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride); - void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); - void (*h264_dct)(DCTELEM block[4][4]); - void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); - void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); - void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); - void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); - - qpel_mc_func (*qpel_put)[16]; - qpel_mc_func (*qpel_avg)[16]; -}H264DSPContext; - -void ff_h264dsp_init(H264DSPContext *c); -void ff_h264dsp_init_arm(H264DSPContext *c); -void ff_h264dsp_init_ppc(H264DSPContext *c); -void ff_h264dsp_init_x86(H264DSPContext *c); - -#endif /* AVCODEC_H264DSP_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_entropy.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_entropy.c Mon Aug 27 
12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2065 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 cabac decoding. 
- * @author Michael Niedermayer - */ - -#include "avcodec.h" -#include "h264_types.h" -#include "h264_data.h" -#include "cabac.h" -#include "rectangle.h" -#include "h264_misc.h" - -// #undef NDEBUG -#include - -/* Cabac pre state table */ - -static const int8_t cabac_context_init_I[460][2] = -{ - /* 0 - 10 */ - { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, - { 2, 54 }, { 3, 74 }, { -28,127 }, { -23, 104 }, - { -6, 53 }, { -1, 54 }, { 7, 51 }, - - /* 11 - 23 unsused for I */ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, - - /* 24- 39 */ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - - /* 40 - 53 */ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, - - /* 54 - 59 */ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 }, - - /* 60 - 69 */ - { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, - { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, - { 13, 41 }, { 3, 62 }, - - /* 70 -> 87 */ - { 0, 11 }, { 1, 55 }, { 0, 69 }, { -17, 127 }, - { -13, 102 },{ 0, 82 }, { -7, 74 }, { -21, 107 }, - { -27, 127 },{ -31, 127 },{ -24, 127 }, { -18, 95 }, - { -27, 127 },{ -21, 114 },{ -30, 127 }, { -17, 123 }, - { -12, 115 },{ -16, 122 }, - - /* 88 -> 104 */ - { -11, 115 },{ -12, 63 }, { -2, 68 }, { -15, 84 }, - { -13, 104 },{ -3, 70 }, { -8, 93 }, { -10, 90 }, - { -30, 127 },{ -1, 74 }, { -6, 97 }, { -7, 91 }, - { -20, 127 },{ -4, 56 }, { -5, 82 }, { -7, 76 }, - { -22, 125 }, - - /* 105 -> 135 */ - { -7, 93 }, { -11, 87 }, { -3, 77 }, { -5, 71 }, - { -4, 63 }, { -4, 68 }, { -12, 84 }, { -7, 62 }, - { -7, 65 }, { 8, 61 }, { 5, 56 }, { -2, 66 }, - { 1, 64 }, { 0, 61 }, { -2, 78 }, { 1, 50 }, - { 7, 52 }, { 10, 35 }, { 0, 44 }, { 11, 38 }, - { 1, 45 }, { 0, 46 }, { 5, 44 }, { 31, 
17 }, - { 1, 51 }, { 7, 50 }, { 28, 19 }, { 16, 33 }, - { 14, 62 }, { -13, 108 },{ -15, 100 }, - - /* 136 -> 165 */ - { -13, 101 },{ -13, 91 }, { -12, 94 }, { -10, 88 }, - { -16, 84 }, { -10, 86 }, { -7, 83 }, { -13, 87 }, - { -19, 94 }, { 1, 70 }, { 0, 72 }, { -5, 74 }, - { 18, 59 }, { -8, 102 }, { -15, 100 }, { 0, 95 }, - { -4, 75 }, { 2, 72 }, { -11, 75 }, { -3, 71 }, - { 15, 46 }, { -13, 69 }, { 0, 62 }, { 0, 65 }, - { 21, 37 }, { -15, 72 }, { 9, 57 }, { 16, 54 }, - { 0, 62 }, { 12, 72 }, - - /* 166 -> 196 */ - { 24, 0 }, { 15, 9 }, { 8, 25 }, { 13, 18 }, - { 15, 9 }, { 13, 19 }, { 10, 37 }, { 12, 18 }, - { 6, 29 }, { 20, 33 }, { 15, 30 }, { 4, 45 }, - { 1, 58 }, { 0, 62 }, { 7, 61 }, { 12, 38 }, - { 11, 45 }, { 15, 39 }, { 11, 42 }, { 13, 44 }, - { 16, 45 }, { 12, 41 }, { 10, 49 }, { 30, 34 }, - { 18, 42 }, { 10, 55 }, { 17, 51 }, { 17, 46 }, - { 0, 89 }, { 26, -19 }, { 22, -17 }, - - /* 197 -> 226 */ - { 26, -17 }, { 30, -25 }, { 28, -20 }, { 33, -23 }, - { 37, -27 }, { 33, -23 }, { 40, -28 }, { 38, -17 }, - { 33, -11 }, { 40, -15 }, { 41, -6 }, { 38, 1 }, - { 41, 17 }, { 30, -6 }, { 27, 3 }, { 26, 22 }, - { 37, -16 }, { 35, -4 }, { 38, -8 }, { 38, -3 }, - { 37, 3 }, { 38, 5 }, { 42, 0 }, { 35, 16 }, - { 39, 22 }, { 14, 48 }, { 27, 37 }, { 21, 60 }, - { 12, 68 }, { 2, 97 }, - - /* 227 -> 251 */ - { -3, 71 }, { -6, 42 }, { -5, 50 }, { -3, 54 }, - { -2, 62 }, { 0, 58 }, { 1, 63 }, { -2, 72 }, - { -1, 74 }, { -9, 91 }, { -5, 67 }, { -5, 27 }, - { -3, 39 }, { -2, 44 }, { 0, 46 }, { -16, 64 }, - { -8, 68 }, { -10, 78 }, { -6, 77 }, { -10, 86 }, - { -12, 92 }, { -15, 55 }, { -10, 60 }, { -6, 62 }, - { -4, 65 }, - - /* 252 -> 275 */ - { -12, 73 }, { -8, 76 }, { -7, 80 }, { -9, 88 }, - { -17, 110 },{ -11, 97 }, { -20, 84 }, { -11, 79 }, - { -6, 73 }, { -4, 74 }, { -13, 86 }, { -13, 96 }, - { -11, 97 }, { -19, 117 },{ -8, 78 }, { -5, 33 }, - { -4, 48 }, { -2, 53 }, { -3, 62 }, { -13, 71 }, - { -10, 79 }, { -12, 86 }, { -13, 90 }, { -14, 97 }, - - /* 276 a bit special 
(not used, bypass is used instead) */ - { 0, 0 }, - - /* 277 -> 307 */ - { -6, 93 }, { -6, 84 }, { -8, 79 }, { 0, 66 }, - { -1, 71 }, { 0, 62 }, { -2, 60 }, { -2, 59 }, - { -5, 75 }, { -3, 62 }, { -4, 58 }, { -9, 66 }, - { -1, 79 }, { 0, 71 }, { 3, 68 }, { 10, 44 }, - { -7, 62 }, { 15, 36 }, { 14, 40 }, { 16, 27 }, - { 12, 29 }, { 1, 44 }, { 20, 36 }, { 18, 32 }, - { 5, 42 }, { 1, 48 }, { 10, 62 }, { 17, 46 }, - { 9, 64 }, { -12, 104 },{ -11, 97 }, - - /* 308 -> 337 */ - { -16, 96 }, { -7, 88 }, { -8, 85 }, { -7, 85 }, - { -9, 85 }, { -13, 88 }, { 4, 66 }, { -3, 77 }, - { -3, 76 }, { -6, 76 }, { 10, 58 }, { -1, 76 }, - { -1, 83 }, { -7, 99 }, { -14, 95 }, { 2, 95 }, - { 0, 76 }, { -5, 74 }, { 0, 70 }, { -11, 75 }, - { 1, 68 }, { 0, 65 }, { -14, 73 }, { 3, 62 }, - { 4, 62 }, { -1, 68 }, { -13, 75 }, { 11, 55 }, - { 5, 64 }, { 12, 70 }, - - /* 338 -> 368 */ - { 15, 6 }, { 6, 19 }, { 7, 16 }, { 12, 14 }, - { 18, 13 }, { 13, 11 }, { 13, 15 }, { 15, 16 }, - { 12, 23 }, { 13, 23 }, { 15, 20 }, { 14, 26 }, - { 14, 44 }, { 17, 40 }, { 17, 47 }, { 24, 17 }, - { 21, 21 }, { 25, 22 }, { 31, 27 }, { 22, 29 }, - { 19, 35 }, { 14, 50 }, { 10, 57 }, { 7, 63 }, - { -2, 77 }, { -4, 82 }, { -3, 94 }, { 9, 69 }, - { -12, 109 },{ 36, -35 }, { 36, -34 }, - - /* 369 -> 398 */ - { 32, -26 }, { 37, -30 }, { 44, -32 }, { 34, -18 }, - { 34, -15 }, { 40, -15 }, { 33, -7 }, { 35, -5 }, - { 33, 0 }, { 38, 2 }, { 33, 13 }, { 23, 35 }, - { 13, 58 }, { 29, -3 }, { 26, 0 }, { 22, 30 }, - { 31, -7 }, { 35, -15 }, { 34, -3 }, { 34, 3 }, - { 36, -1 }, { 34, 5 }, { 32, 11 }, { 35, 5 }, - { 34, 12 }, { 39, 11 }, { 30, 29 }, { 34, 26 }, - { 29, 39 }, { 19, 66 }, - - /* 399 -> 435 */ - { 31, 21 }, { 31, 31 }, { 25, 50 }, - { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11, 85 }, - { -15, 92 }, { -14, 89 }, { -26, 71 }, { -15, 81 }, - { -14, 80 }, { 0, 68 }, { -14, 70 }, { -24, 56 }, - { -23, 68 }, { -24, 50 }, { -11, 74 }, { 23, -13 }, - { 26, -13 }, { 40, -15 }, { 49, -14 }, { 44, 3 }, - { 45, 6 }, { 44, 
34 }, { 33, 54 }, { 19, 82 }, - { -3, 75 }, { -1, 23 }, { 1, 34 }, { 1, 43 }, - { 0, 54 }, { -2, 55 }, { 0, 61 }, { 1, 64 }, - { 0, 68 }, { -9, 92 }, - - /* 436 -> 459 */ - { -14, 106 }, { -13, 97 }, { -15, 90 }, { -12, 90 }, - { -18, 88 }, { -10, 73 }, { -9, 79 }, { -14, 86 }, - { -10, 73 }, { -10, 70 }, { -10, 69 }, { -5, 66 }, - { -9, 64 }, { -5, 58 }, { 2, 59 }, { 21, -10 }, - { 24, -11 }, { 28, -8 }, { 28, -1 }, { 29, 3 }, - { 29, 9 }, { 35, 20 }, { 29, 36 }, { 14, 67 } -}; - -static const int8_t cabac_context_init_PB[3][460][2] = -{ - /* i_cabac_init_idc == 0 */ - { - /* 0 - 10 */ - { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, - { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, - { -6, 53 }, { -1, 54 }, { 7, 51 }, - - /* 11 - 23 */ - { 23, 33 }, { 23, 2 }, { 21, 0 }, { 1, 9 }, - { 0, 49 }, { -37, 118 }, { 5, 57 }, { -13, 78 }, - { -11, 65 }, { 1, 62 }, { 12, 49 }, { -4, 73 }, - { 17, 50 }, - - /* 24 - 39 */ - { 18, 64 }, { 9, 43 }, { 29, 0 }, { 26, 67 }, - { 16, 90 }, { 9, 104 }, { -46, 127 }, { -20, 104 }, - { 1, 67 }, { -13, 78 }, { -11, 65 }, { 1, 62 }, - { -6, 86 }, { -17, 95 }, { -6, 61 }, { 9, 45 }, - - /* 40 - 53 */ - { -3, 69 }, { -6, 81 }, { -11, 96 }, { 6, 55 }, - { 7, 67 }, { -5, 86 }, { 2, 88 }, { 0, 58 }, - { -3, 76 }, { -10, 94 }, { 5, 54 }, { 4, 69 }, - { -3, 81 }, { 0, 88 }, - - /* 54 - 59 */ - { -7, 67 }, { -5, 74 }, { -4, 74 }, { -5, 80 }, - { -7, 72 }, { 1, 58 }, - - /* 60 - 69 */ - { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, - { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, - { 13, 41 }, { 3, 62 }, - - /* 70 - 87 */ - { 0, 45 }, { -4, 78 }, { -3, 96 }, { -27, 126 }, - { -28, 98 }, { -25, 101 }, { -23, 67 }, { -28, 82 }, - { -20, 94 }, { -16, 83 }, { -22, 110 }, { -21, 91 }, - { -18, 102 }, { -13, 93 }, { -29, 127 }, { -7, 92 }, - { -5, 89 }, { -7, 96 }, { -13, 108 }, { -3, 46 }, - { -1, 65 }, { -1, 57 }, { -9, 93 }, { -3, 74 }, - { -9, 92 }, { -8, 87 }, { -23, 126 }, { 5, 54 }, - { 6, 60 }, { 6, 59 }, { 6, 69 }, { -1, 48 }, - { 0, 68 }, { 
-4, 69 }, { -8, 88 }, - - /* 105 -> 165 */ - { -2, 85 }, { -6, 78 }, { -1, 75 }, { -7, 77 }, - { 2, 54 }, { 5, 50 }, { -3, 68 }, { 1, 50 }, - { 6, 42 }, { -4, 81 }, { 1, 63 }, { -4, 70 }, - { 0, 67 }, { 2, 57 }, { -2, 76 }, { 11, 35 }, - { 4, 64 }, { 1, 61 }, { 11, 35 }, { 18, 25 }, - { 12, 24 }, { 13, 29 }, { 13, 36 }, { -10, 93 }, - { -7, 73 }, { -2, 73 }, { 13, 46 }, { 9, 49 }, - { -7, 100 }, { 9, 53 }, { 2, 53 }, { 5, 53 }, - { -2, 61 }, { 0, 56 }, { 0, 56 }, { -13, 63 }, - { -5, 60 }, { -1, 62 }, { 4, 57 }, { -6, 69 }, - { 4, 57 }, { 14, 39 }, { 4, 51 }, { 13, 68 }, - { 3, 64 }, { 1, 61 }, { 9, 63 }, { 7, 50 }, - { 16, 39 }, { 5, 44 }, { 4, 52 }, { 11, 48 }, - { -5, 60 }, { -1, 59 }, { 0, 59 }, { 22, 33 }, - { 5, 44 }, { 14, 43 }, { -1, 78 }, { 0, 60 }, - { 9, 69 }, - - /* 166 - 226 */ - { 11, 28 }, { 2, 40 }, { 3, 44 }, { 0, 49 }, - { 0, 46 }, { 2, 44 }, { 2, 51 }, { 0, 47 }, - { 4, 39 }, { 2, 62 }, { 6, 46 }, { 0, 54 }, - { 3, 54 }, { 2, 58 }, { 4, 63 }, { 6, 51 }, - { 6, 57 }, { 7, 53 }, { 6, 52 }, { 6, 55 }, - { 11, 45 }, { 14, 36 }, { 8, 53 }, { -1, 82 }, - { 7, 55 }, { -3, 78 }, { 15, 46 }, { 22, 31 }, - { -1, 84 }, { 25, 7 }, { 30, -7 }, { 28, 3 }, - { 28, 4 }, { 32, 0 }, { 34, -1 }, { 30, 6 }, - { 30, 6 }, { 32, 9 }, { 31, 19 }, { 26, 27 }, - { 26, 30 }, { 37, 20 }, { 28, 34 }, { 17, 70 }, - { 1, 67 }, { 5, 59 }, { 9, 67 }, { 16, 30 }, - { 18, 32 }, { 18, 35 }, { 22, 29 }, { 24, 31 }, - { 23, 38 }, { 18, 43 }, { 20, 41 }, { 11, 63 }, - { 9, 59 }, { 9, 64 }, { -1, 94 }, { -2, 89 }, - { -9, 108 }, - - /* 227 - 275 */ - { -6, 76 }, { -2, 44 }, { 0, 45 }, { 0, 52 }, - { -3, 64 }, { -2, 59 }, { -4, 70 }, { -4, 75 }, - { -8, 82 }, { -17, 102 }, { -9, 77 }, { 3, 24 }, - { 0, 42 }, { 0, 48 }, { 0, 55 }, { -6, 59 }, - { -7, 71 }, { -12, 83 }, { -11, 87 }, { -30, 119 }, - { 1, 58 }, { -3, 29 }, { -1, 36 }, { 1, 38 }, - { 2, 43 }, { -6, 55 }, { 0, 58 }, { 0, 64 }, - { -3, 74 }, { -10, 90 }, { 0, 70 }, { -4, 29 }, - { 5, 31 }, { 7, 42 }, { 1, 59 }, { -2, 58 }, - { 
-3, 72 }, { -3, 81 }, { -11, 97 }, { 0, 58 }, - { 8, 5 }, { 10, 14 }, { 14, 18 }, { 13, 27 }, - { 2, 40 }, { 0, 58 }, { -3, 70 }, { -6, 79 }, - { -8, 85 }, - - /* 276 a bit special (not used, bypass is used instead) */ - { 0, 0 }, - - /* 277 - 337 */ - { -13, 106 }, { -16, 106 }, { -10, 87 }, { -21, 114 }, - { -18, 110 }, { -14, 98 }, { -22, 110 }, { -21, 106 }, - { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 }, - { -10, 96 }, { -12, 95 }, { -5, 91 }, { -9, 93 }, - { -22, 94 }, { -5, 86 }, { 9, 67 }, { -4, 80 }, - { -10, 85 }, { -1, 70 }, { 7, 60 }, { 9, 58 }, - { 5, 61 }, { 12, 50 }, { 15, 50 }, { 18, 49 }, - { 17, 54 }, { 10, 41 }, { 7, 46 }, { -1, 51 }, - { 7, 49 }, { 8, 52 }, { 9, 41 }, { 6, 47 }, - { 2, 55 }, { 13, 41 }, { 10, 44 }, { 6, 50 }, - { 5, 53 }, { 13, 49 }, { 4, 63 }, { 6, 64 }, - { -2, 69 }, { -2, 59 }, { 6, 70 }, { 10, 44 }, - { 9, 31 }, { 12, 43 }, { 3, 53 }, { 14, 34 }, - { 10, 38 }, { -3, 52 }, { 13, 40 }, { 17, 32 }, - { 7, 44 }, { 7, 38 }, { 13, 50 }, { 10, 57 }, - { 26, 43 }, - - /* 338 - 398 */ - { 14, 11 }, { 11, 14 }, { 9, 11 }, { 18, 11 }, - { 21, 9 }, { 23, -2 }, { 32, -15 }, { 32, -15 }, - { 34, -21 }, { 39, -23 }, { 42, -33 }, { 41, -31 }, - { 46, -28 }, { 38, -12 }, { 21, 29 }, { 45, -24 }, - { 53, -45 }, { 48, -26 }, { 65, -43 }, { 43, -19 }, - { 39, -10 }, { 30, 9 }, { 18, 26 }, { 20, 27 }, - { 0, 57 }, { -14, 82 }, { -5, 75 }, { -19, 97 }, - { -35, 125 }, { 27, 0 }, { 28, 0 }, { 31, -4 }, - { 27, 6 }, { 34, 8 }, { 30, 10 }, { 24, 22 }, - { 33, 19 }, { 22, 32 }, { 26, 31 }, { 21, 41 }, - { 26, 44 }, { 23, 47 }, { 16, 65 }, { 14, 71 }, - { 8, 60 }, { 6, 63 }, { 17, 65 }, { 21, 24 }, - { 23, 20 }, { 26, 23 }, { 27, 32 }, { 28, 23 }, - { 28, 24 }, { 23, 40 }, { 24, 32 }, { 28, 29 }, - { 23, 42 }, { 19, 57 }, { 22, 53 }, { 22, 61 }, - { 11, 86 }, - - /* 399 - 435 */ - { 12, 40 }, { 11, 51 }, { 14, 59 }, - { -4, 79 }, { -7, 71 }, { -5, 69 }, { -9, 70 }, - { -8, 66 }, { -10, 68 }, { -19, 73 }, { -12, 69 }, - { -16, 70 }, { -15, 67 
}, { -20, 62 }, { -19, 70 }, - { -16, 66 }, { -22, 65 }, { -20, 63 }, { 9, -2 }, - { 26, -9 }, { 33, -9 }, { 39, -7 }, { 41, -2 }, - { 45, 3 }, { 49, 9 }, { 45, 27 }, { 36, 59 }, - { -6, 66 }, { -7, 35 }, { -7, 42 }, { -8, 45 }, - { -5, 48 }, { -12, 56 }, { -6, 60 }, { -5, 62 }, - { -8, 66 }, { -8, 76 }, - - /* 436 - 459 */ - { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, - { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, - { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, - { -14, 66 }, { 0, 59 }, { 2, 59 }, { 21, -13 }, - { 33, -14 }, { 39, -7 }, { 46, -2 }, { 51, 2 }, - { 60, 6 }, { 61, 17 }, { 55, 34 }, { 42, 62 }, - }, - - /* i_cabac_init_idc == 1 */ - { - /* 0 - 10 */ - { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, - { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, - { -6, 53 }, { -1, 54 }, { 7, 51 }, - - /* 11 - 23 */ - { 22, 25 }, { 34, 0 }, { 16, 0 }, { -2, 9 }, - { 4, 41 }, { -29, 118 }, { 2, 65 }, { -6, 71 }, - { -13, 79 }, { 5, 52 }, { 9, 50 }, { -3, 70 }, - { 10, 54 }, - - /* 24 - 39 */ - { 26, 34 }, { 19, 22 }, { 40, 0 }, { 57, 2 }, - { 41, 36 }, { 26, 69 }, { -45, 127 }, { -15, 101 }, - { -4, 76 }, { -6, 71 }, { -13, 79 }, { 5, 52 }, - { 6, 69 }, { -13, 90 }, { 0, 52 }, { 8, 43 }, - - /* 40 - 53 */ - { -2, 69 },{ -5, 82 },{ -10, 96 },{ 2, 59 }, - { 2, 75 },{ -3, 87 },{ -3, 100 },{ 1, 56 }, - { -3, 74 },{ -6, 85 },{ 0, 59 },{ -3, 81 }, - { -7, 86 },{ -5, 95 }, - - /* 54 - 59 */ - { -1, 66 },{ -1, 77 },{ 1, 70 },{ -2, 86 }, - { -5, 72 },{ 0, 61 }, - - /* 60 - 69 */ - { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, - { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, - { 13, 41 }, { 3, 62 }, - - /* 70 - 104 */ - { 13, 15 }, { 7, 51 }, { 2, 80 }, { -39, 127 }, - { -18, 91 }, { -17, 96 }, { -26, 81 }, { -35, 98 }, - { -24, 102 }, { -23, 97 }, { -27, 119 }, { -24, 99 }, - { -21, 110 }, { -18, 102 }, { -36, 127 }, { 0, 80 }, - { -5, 89 }, { -7, 94 }, { -4, 92 }, { 0, 39 }, - { 0, 65 }, { -15, 84 }, { -35, 127 }, { -2, 73 }, - { -12, 104 }, { -9, 91 }, { -31, 
127 }, { 3, 55 }, - { 7, 56 }, { 7, 55 }, { 8, 61 }, { -3, 53 }, - { 0, 68 }, { -7, 74 }, { -9, 88 }, - - /* 105 -> 165 */ - { -13, 103 }, { -13, 91 }, { -9, 89 }, { -14, 92 }, - { -8, 76 }, { -12, 87 }, { -23, 110 }, { -24, 105 }, - { -10, 78 }, { -20, 112 }, { -17, 99 }, { -78, 127 }, - { -70, 127 }, { -50, 127 }, { -46, 127 }, { -4, 66 }, - { -5, 78 }, { -4, 71 }, { -8, 72 }, { 2, 59 }, - { -1, 55 }, { -7, 70 }, { -6, 75 }, { -8, 89 }, - { -34, 119 }, { -3, 75 }, { 32, 20 }, { 30, 22 }, - { -44, 127 }, { 0, 54 }, { -5, 61 }, { 0, 58 }, - { -1, 60 }, { -3, 61 }, { -8, 67 }, { -25, 84 }, - { -14, 74 }, { -5, 65 }, { 5, 52 }, { 2, 57 }, - { 0, 61 }, { -9, 69 }, { -11, 70 }, { 18, 55 }, - { -4, 71 }, { 0, 58 }, { 7, 61 }, { 9, 41 }, - { 18, 25 }, { 9, 32 }, { 5, 43 }, { 9, 47 }, - { 0, 44 }, { 0, 51 }, { 2, 46 }, { 19, 38 }, - { -4, 66 }, { 15, 38 }, { 12, 42 }, { 9, 34 }, - { 0, 89 }, - - /* 166 - 226 */ - { 4, 45 }, { 10, 28 }, { 10, 31 }, { 33, -11 }, - { 52, -43 }, { 18, 15 }, { 28, 0 }, { 35, -22 }, - { 38, -25 }, { 34, 0 }, { 39, -18 }, { 32, -12 }, - { 102, -94 }, { 0, 0 }, { 56, -15 }, { 33, -4 }, - { 29, 10 }, { 37, -5 }, { 51, -29 }, { 39, -9 }, - { 52, -34 }, { 69, -58 }, { 67, -63 }, { 44, -5 }, - { 32, 7 }, { 55, -29 }, { 32, 1 }, { 0, 0 }, - { 27, 36 }, { 33, -25 }, { 34, -30 }, { 36, -28 }, - { 38, -28 }, { 38, -27 }, { 34, -18 }, { 35, -16 }, - { 34, -14 }, { 32, -8 }, { 37, -6 }, { 35, 0 }, - { 30, 10 }, { 28, 18 }, { 26, 25 }, { 29, 41 }, - { 0, 75 }, { 2, 72 }, { 8, 77 }, { 14, 35 }, - { 18, 31 }, { 17, 35 }, { 21, 30 }, { 17, 45 }, - { 20, 42 }, { 18, 45 }, { 27, 26 }, { 16, 54 }, - { 7, 66 }, { 16, 56 }, { 11, 73 }, { 10, 67 }, - { -10, 116 }, - - /* 227 - 275 */ - { -23, 112 }, { -15, 71 }, { -7, 61 }, { 0, 53 }, - { -5, 66 }, { -11, 77 }, { -9, 80 }, { -9, 84 }, - { -10, 87 }, { -34, 127 }, { -21, 101 }, { -3, 39 }, - { -5, 53 }, { -7, 61 }, { -11, 75 }, { -15, 77 }, - { -17, 91 }, { -25, 107 }, { -25, 111 }, { -28, 122 }, - { -11, 76 }, { 
-10, 44 }, { -10, 52 }, { -10, 57 }, - { -9, 58 }, { -16, 72 }, { -7, 69 }, { -4, 69 }, - { -5, 74 }, { -9, 86 }, { 2, 66 }, { -9, 34 }, - { 1, 32 }, { 11, 31 }, { 5, 52 }, { -2, 55 }, - { -2, 67 }, { 0, 73 }, { -8, 89 }, { 3, 52 }, - { 7, 4 }, { 10, 8 }, { 17, 8 }, { 16, 19 }, - { 3, 37 }, { -1, 61 }, { -5, 73 }, { -1, 70 }, - { -4, 78 }, - - /* 276 a bit special (not used, bypass is used instead) */ - { 0, 0 }, - - /* 277 - 337 */ - { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 }, - { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 }, - { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 }, - { -10, 95 }, { -14, 100 }, { -8, 95 }, { -17, 111 }, - { -28, 114 }, { -6, 89 }, { -2, 80 }, { -4, 82 }, - { -9, 85 }, { -8, 81 }, { -1, 72 }, { 5, 64 }, - { 1, 67 }, { 9, 56 }, { 0, 69 }, { 1, 69 }, - { 7, 69 }, { -7, 69 }, { -6, 67 }, { -16, 77 }, - { -2, 64 }, { 2, 61 }, { -6, 67 }, { -3, 64 }, - { 2, 57 }, { -3, 65 }, { -3, 66 }, { 0, 62 }, - { 9, 51 }, { -1, 66 }, { -2, 71 }, { -2, 75 }, - { -1, 70 }, { -9, 72 }, { 14, 60 }, { 16, 37 }, - { 0, 47 }, { 18, 35 }, { 11, 37 }, { 12, 41 }, - { 10, 41 }, { 2, 48 }, { 12, 41 }, { 13, 41 }, - { 0, 59 }, { 3, 50 }, { 19, 40 }, { 3, 66 }, - { 18, 50 }, - - /* 338 - 398 */ - { 19, -6 }, { 18, -6 }, { 14, 0 }, { 26, -12 }, - { 31, -16 }, { 33, -25 }, { 33, -22 }, { 37, -28 }, - { 39, -30 }, { 42, -30 }, { 47, -42 }, { 45, -36 }, - { 49, -34 }, { 41, -17 }, { 32, 9 }, { 69, -71 }, - { 63, -63 }, { 66, -64 }, { 77, -74 }, { 54, -39 }, - { 52, -35 }, { 41, -10 }, { 36, 0 }, { 40, -1 }, - { 30, 14 }, { 28, 26 }, { 23, 37 }, { 12, 55 }, - { 11, 65 }, { 37, -33 }, { 39, -36 }, { 40, -37 }, - { 38, -30 }, { 46, -33 }, { 42, -30 }, { 40, -24 }, - { 49, -29 }, { 38, -12 }, { 40, -10 }, { 38, -3 }, - { 46, -5 }, { 31, 20 }, { 29, 30 }, { 25, 44 }, - { 12, 48 }, { 11, 49 }, { 26, 45 }, { 22, 22 }, - { 23, 22 }, { 27, 21 }, { 33, 20 }, { 26, 28 }, - { 30, 24 }, { 27, 34 }, { 18, 42 }, { 25, 39 }, - { 18, 50 }, { 12, 70 }, { 21, 54 
}, { 14, 71 }, - { 11, 83 }, - - /* 399 - 435 */ - { 25, 32 }, { 21, 49 }, { 21, 54 }, - { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, - { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, - { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, - { -14, 66 }, { 0, 59 }, { 2, 59 }, { 17, -10 }, - { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, - { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, - { -5, 71 }, { 0, 24 }, { -1, 36 }, { -2, 42 }, - { -2, 52 }, { -9, 57 }, { -6, 63 }, { -4, 65 }, - { -4, 67 }, { -7, 82 }, - - /* 436 - 459 */ - { -3, 81 }, { -3, 76 }, { -7, 72 }, { -6, 78 }, - { -12, 72 }, { -14, 68 }, { -3, 70 }, { -6, 76 }, - { -5, 66 }, { -5, 62 }, { 0, 57 }, { -4, 61 }, - { -9, 60 }, { 1, 54 }, { 2, 58 }, { 17, -10 }, - { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, - { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, - }, - - /* i_cabac_init_idc == 2 */ - { - /* 0 - 10 */ - { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, - { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, - { -6, 53 }, { -1, 54 }, { 7, 51 }, - - /* 11 - 23 */ - { 29, 16 }, { 25, 0 }, { 14, 0 }, { -10, 51 }, - { -3, 62 }, { -27, 99 }, { 26, 16 }, { -4, 85 }, - { -24, 102 }, { 5, 57 }, { 6, 57 }, { -17, 73 }, - { 14, 57 }, - - /* 24 - 39 */ - { 20, 40 }, { 20, 10 }, { 29, 0 }, { 54, 0 }, - { 37, 42 }, { 12, 97 }, { -32, 127 }, { -22, 117 }, - { -2, 74 }, { -4, 85 }, { -24, 102 }, { 5, 57 }, - { -6, 93 }, { -14, 88 }, { -6, 44 }, { 4, 55 }, - - /* 40 - 53 */ - { -11, 89 },{ -15, 103 },{ -21, 116 },{ 19, 57 }, - { 20, 58 },{ 4, 84 },{ 6, 96 },{ 1, 63 }, - { -5, 85 },{ -13, 106 },{ 5, 63 },{ 6, 75 }, - { -3, 90 },{ -1, 101 }, - - /* 54 - 59 */ - { 3, 55 },{ -4, 79 },{ -2, 75 },{ -12, 97 }, - { -7, 50 },{ 1, 60 }, - - /* 60 - 69 */ - { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, - { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, - { 13, 41 }, { 3, 62 }, - - /* 70 - 104 */ - { 7, 34 }, { -9, 88 }, { -20, 127 }, { -36, 127 }, - { -17, 91 }, { -14, 95 }, { -25, 84 }, { -25, 86 }, - { -12, 89 }, { -17, 91 }, 
{ -31, 127 }, { -14, 76 }, - { -18, 103 }, { -13, 90 }, { -37, 127 }, { 11, 80 }, - { 5, 76 }, { 2, 84 }, { 5, 78 }, { -6, 55 }, - { 4, 61 }, { -14, 83 }, { -37, 127 }, { -5, 79 }, - { -11, 104 }, { -11, 91 }, { -30, 127 }, { 0, 65 }, - { -2, 79 }, { 0, 72 }, { -4, 92 }, { -6, 56 }, - { 3, 68 }, { -8, 71 }, { -13, 98 }, - - /* 105 -> 165 */ - { -4, 86 }, { -12, 88 }, { -5, 82 }, { -3, 72 }, - { -4, 67 }, { -8, 72 }, { -16, 89 }, { -9, 69 }, - { -1, 59 }, { 5, 66 }, { 4, 57 }, { -4, 71 }, - { -2, 71 }, { 2, 58 }, { -1, 74 }, { -4, 44 }, - { -1, 69 }, { 0, 62 }, { -7, 51 }, { -4, 47 }, - { -6, 42 }, { -3, 41 }, { -6, 53 }, { 8, 76 }, - { -9, 78 }, { -11, 83 }, { 9, 52 }, { 0, 67 }, - { -5, 90 }, { 1, 67 }, { -15, 72 }, { -5, 75 }, - { -8, 80 }, { -21, 83 }, { -21, 64 }, { -13, 31 }, - { -25, 64 }, { -29, 94 }, { 9, 75 }, { 17, 63 }, - { -8, 74 }, { -5, 35 }, { -2, 27 }, { 13, 91 }, - { 3, 65 }, { -7, 69 }, { 8, 77 }, { -10, 66 }, - { 3, 62 }, { -3, 68 }, { -20, 81 }, { 0, 30 }, - { 1, 7 }, { -3, 23 }, { -21, 74 }, { 16, 66 }, - { -23, 124 }, { 17, 37 }, { 44, -18 }, { 50, -34 }, - { -22, 127 }, - - /* 166 - 226 */ - { 4, 39 }, { 0, 42 }, { 7, 34 }, { 11, 29 }, - { 8, 31 }, { 6, 37 }, { 7, 42 }, { 3, 40 }, - { 8, 33 }, { 13, 43 }, { 13, 36 }, { 4, 47 }, - { 3, 55 }, { 2, 58 }, { 6, 60 }, { 8, 44 }, - { 11, 44 }, { 14, 42 }, { 7, 48 }, { 4, 56 }, - { 4, 52 }, { 13, 37 }, { 9, 49 }, { 19, 58 }, - { 10, 48 }, { 12, 45 }, { 0, 69 }, { 20, 33 }, - { 8, 63 }, { 35, -18 }, { 33, -25 }, { 28, -3 }, - { 24, 10 }, { 27, 0 }, { 34, -14 }, { 52, -44 }, - { 39, -24 }, { 19, 17 }, { 31, 25 }, { 36, 29 }, - { 24, 33 }, { 34, 15 }, { 30, 20 }, { 22, 73 }, - { 20, 34 }, { 19, 31 }, { 27, 44 }, { 19, 16 }, - { 15, 36 }, { 15, 36 }, { 21, 28 }, { 25, 21 }, - { 30, 20 }, { 31, 12 }, { 27, 16 }, { 24, 42 }, - { 0, 93 }, { 14, 56 }, { 15, 57 }, { 26, 38 }, - { -24, 127 }, - - /* 227 - 275 */ - { -24, 115 }, { -22, 82 }, { -9, 62 }, { 0, 53 }, - { 0, 59 }, { -14, 85 }, { -13, 89 }, { -13, 
94 }, - { -11, 92 }, { -29, 127 }, { -21, 100 }, { -14, 57 }, - { -12, 67 }, { -11, 71 }, { -10, 77 }, { -21, 85 }, - { -16, 88 }, { -23, 104 }, { -15, 98 }, { -37, 127 }, - { -10, 82 }, { -8, 48 }, { -8, 61 }, { -8, 66 }, - { -7, 70 }, { -14, 75 }, { -10, 79 }, { -9, 83 }, - { -12, 92 }, { -18, 108 }, { -4, 79 }, { -22, 69 }, - { -16, 75 }, { -2, 58 }, { 1, 58 }, { -13, 78 }, - { -9, 83 }, { -4, 81 }, { -13, 99 }, { -13, 81 }, - { -6, 38 }, { -13, 62 }, { -6, 58 }, { -2, 59 }, - { -16, 73 }, { -10, 76 }, { -13, 86 }, { -9, 83 }, - { -10, 87 }, - - /* 276 a bit special (not used, bypass is used instead) */ - { 0, 0 }, - - /* 277 - 337 */ - { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 }, - { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 }, - { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 }, - { -10, 94 }, { -15, 102 }, { -10, 99 }, { -13, 106 }, - { -50, 127 }, { -5, 92 }, { 17, 57 }, { -5, 86 }, - { -13, 94 }, { -12, 91 }, { -2, 77 }, { 0, 71 }, - { -1, 73 }, { 4, 64 }, { -7, 81 }, { 5, 64 }, - { 15, 57 }, { 1, 67 }, { 0, 68 }, { -10, 67 }, - { 1, 68 }, { 0, 77 }, { 2, 64 }, { 0, 68 }, - { -5, 78 }, { 7, 55 }, { 5, 59 }, { 2, 65 }, - { 14, 54 }, { 15, 44 }, { 5, 60 }, { 2, 70 }, - { -2, 76 }, { -18, 86 }, { 12, 70 }, { 5, 64 }, - { -12, 70 }, { 11, 55 }, { 5, 56 }, { 0, 69 }, - { 2, 65 }, { -6, 74 }, { 5, 54 }, { 7, 54 }, - { -6, 76 }, { -11, 82 }, { -2, 77 }, { -2, 77 }, - { 25, 42 }, - - /* 338 - 398 */ - { 17, -13 }, { 16, -9 }, { 17, -12 }, { 27, -21 }, - { 37, -30 }, { 41, -40 }, { 42, -41 }, { 48, -47 }, - { 39, -32 }, { 46, -40 }, { 52, -51 }, { 46, -41 }, - { 52, -39 }, { 43, -19 }, { 32, 11 }, { 61, -55 }, - { 56, -46 }, { 62, -50 }, { 81, -67 }, { 45, -20 }, - { 35, -2 }, { 28, 15 }, { 34, 1 }, { 39, 1 }, - { 30, 17 }, { 20, 38 }, { 18, 45 }, { 15, 54 }, - { 0, 79 }, { 36, -16 }, { 37, -14 }, { 37, -17 }, - { 32, 1 }, { 34, 15 }, { 29, 15 }, { 24, 25 }, - { 34, 22 }, { 31, 16 }, { 35, 18 }, { 31, 28 }, - { 33, 41 }, { 36, 28 }, { 27, 47 
}, { 21, 62 }, - { 18, 31 }, { 19, 26 }, { 36, 24 }, { 24, 23 }, - { 27, 16 }, { 24, 30 }, { 31, 29 }, { 22, 41 }, - { 22, 42 }, { 16, 60 }, { 15, 52 }, { 14, 60 }, - { 3, 78 }, { -16, 123 }, { 21, 53 }, { 22, 56 }, - { 25, 61 }, - - /* 399 - 435 */ - { 21, 33 }, { 19, 50 }, { 17, 61 }, - { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, - { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, - { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, - { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, - { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, - { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, - { -9, 71 }, { -7, 37 }, { -8, 44 }, { -11, 49 }, - { -10, 56 }, { -12, 59 }, { -8, 63 }, { -9, 67 }, - { -6, 68 }, { -10, 79 }, - - /* 436 - 459 */ - { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, - { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, - { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, - { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, - { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, - { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, - } -}; - -static const uint8_t left_block_options[4][16]={ - {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, - {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, - {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}, - {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8} -}; - -static const uint8_t rem6[52]={ -0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, -}; - -static const uint8_t div6[52]={ -0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, -}; - -static void init_dequant8_coeff_table(H264Slice *s, EntropyContext *ec){ - int i,q,x; - const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; - ec->dequant8_coeff[0] = 
ec->dequant8_buffer[0]; - ec->dequant8_coeff[1] = ec->dequant8_buffer[1]; - - for(i=0; i<2; i++){ - if(i && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){ - ec->dequant8_coeff[1] = ec->dequant8_buffer[0]; - break; - } - - for(q=0; q<52; q++){ - int shift = div6[q]; - int idx = rem6[q]; - for(x=0; x<64; x++) - ec->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] = - ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] * - s->pps.scaling_matrix8[i][x]) << shift; - } - } -} - -static void init_dequant4_coeff_table(H264Slice *s, EntropyContext *ec){ - int i,j,q,x; - const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; - for(i=0; i<6; i++ ){ - ec->dequant4_coeff[i] = ec->dequant4_buffer[i]; - for(j=0; jpps.scaling_matrix4[j], s->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){ - ec->dequant4_coeff[i] = ec->dequant4_buffer[j]; - break; - } - } - if(jdequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] = - ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] * - s->pps.scaling_matrix4[i][x]) << shift; - } - } -} - -void init_dequant_tables(H264Slice *s, EntropyContext *ec){ - int i,x; - - init_dequant4_coeff_table(s, ec); - if(s->pps.transform_8x8_mode) - init_dequant8_coeff_table(s, ec); - if(s->transform_bypass){ - for(i=0; i<6; i++) - for(x=0; x<16; x++) - ec->dequant4_coeff[i][0][x] = 1<<6; - if(s->pps.transform_8x8_mode) - for(i=0; i<2; i++) - for(x=0; x<64; x++) - ec->dequant8_coeff[i][0][x] = 1<<6; - } -} - -void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c) { - int i; - const int8_t (*tab)[2]; - - if( s->slice_type_nos == FF_I_TYPE ) tab = cabac_context_init_I; - else tab = cabac_context_init_PB[s->cabac_init_idc]; - - /* calculate pre-state */ - for( i= 0; i < 460; i++ ) { - int pre = 2*(((tab[i][0] * ec->curr_qscale) >>4 ) + tab[i][1]) - 127; - - pre^= pre>>31; - if(pre > 124) - pre= 124 + (pre&1); - - c->cabac_state[i] = pre; - } -} - -static 
void fill_decode_neighbors(EntropyContext *ec, H264Slice *s){ - H264Mb *m = ec->m; - const int mb_x = m->mb_x; - - if (m->mb_y){ - ec->top_type = ec->mb_type_top[mb_x]; - ec->topright_type= ec->mb_type_top[mb_x+1]; - ec->topleft_type = ec->mb_type_top[mb_x-1]; - m->qscale_top_mb_xy = ec->qscale_top[mb_x]; - } else { - ec->top_type = 0; - ec->topright_type= 0; - ec->topleft_type = 0; - m->qscale_top_mb_xy = 0; - } - - ec->left_type = ec->mb_type[mb_x-1] ; - m->qscale_left_mb_xy = ec->qscale[mb_x-1]; - -} - -static void fill_decode_caches(EntropyContext *ec, H264Slice *s, int mb_type){ - H264Mb *m = ec->m; - int topleft_type, top_type, topright_type, left_type; - const uint8_t * left_block= left_block_options[0]; - const int mb_x = m->mb_x; - int i; - - topleft_type = ec->topleft_type; - top_type = ec->top_type; - topright_type= ec->topright_type; - left_type = ec->left_type; - - if(!IS_SKIP(mb_type)){ - if(top_type){ - AV_COPY32(&ec->non_zero_count_cache[4+8*0], &ec->non_zero_count_top[mb_x][0]); - ec->non_zero_count_cache[1+8*0]= ec->non_zero_count_top[mb_x][4]; - ec->non_zero_count_cache[2+8*0]= ec->non_zero_count_top[mb_x][5]; - ec->non_zero_count_cache[1+8*3]= ec->non_zero_count_top[mb_x][6]; - ec->non_zero_count_cache[2+8*3]= ec->non_zero_count_top[mb_x][7]; - - }else { - ec->non_zero_count_cache[1+8*0]= - ec->non_zero_count_cache[2+8*0]= - ec->non_zero_count_cache[1+8*3]= - ec->non_zero_count_cache[2+8*3]= - AV_WN32A(&ec->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 
0 : 0x40404040); - } - - if(left_type){ - for (i=0; i<2; i++) { - ec->non_zero_count_cache[3+8*1 + 2*8*i]= ec->non_zero_count_left[i*2+0]; - ec->non_zero_count_cache[3+8*2 + 2*8*i]= ec->non_zero_count_left[i*2+1]; - ec->non_zero_count_cache[0+8*1 + 3*8*i]= ec->non_zero_count_left[4+i*2+0]; - ec->non_zero_count_cache[0+8*2 + 3*8*i]= ec->non_zero_count_left[4+i*2+1]; - } - } - else{ - for (i=0; i<2; i++) { - ec->non_zero_count_cache[3+8*1 + 2*8*i]= - ec->non_zero_count_cache[3+8*2 + 2*8*i]= - ec->non_zero_count_cache[0+8*1 + 3*8*i]= - ec->non_zero_count_cache[0+8*2 + 3*8*i]= !IS_INTRA(mb_type) ? 0 : 64; - } - } - - // top_cbp - if(top_type) { - ec->top_cbp = ec->cbp_top[mb_x]; - } else { - ec->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F; - } - // left_cbp - if (left_type) { - ec->left_cbp = (ec->cbp[mb_x-1] & 0x1f0) - | ((ec->cbp[mb_x-1]>>(left_block[0]&(~1)))&2) - | (((ec->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2); - } else { - ec->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F; - } - } - - if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){ - int list; - - ec->ref_cache[0][scan8[5 ]+1] = ec->ref_cache[0][scan8[7 ]+1] = ec->ref_cache[0][scan8[13]+1] = - ec->ref_cache[1][scan8[5 ]+1] = ec->ref_cache[1][scan8[7 ]+1] = ec->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE; - - for(list=0; listlist_count; list++){ - if(!USES_LIST(mb_type, list)){ - continue; - } - assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred)); - - if(USES_LIST(top_type, list)){ - ec->ref_cache[list][scan8[0] + 0 - 1*8]= - ec->ref_cache[list][scan8[0] + 1 - 1*8]= ec->ref_index_top[list][4*mb_x + 2]; - ec->ref_cache[list][scan8[0] + 2 - 1*8]= - ec->ref_cache[list][scan8[0] + 3 - 1*8]= ec->ref_index_top[list][4*mb_x + 3]; - }else{ - AV_WN32A(&ec->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101); - } - - if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){ - for(i=0; i<2; i++){ - int cache_idx = scan8[0] - 1 + i*2*8; - if(USES_LIST(left_type, list)){ - const int b8_x= 4*(mb_x-1) + 1; - ec->ref_cache[list][cache_idx ]= ec->ref_index[list][b8_x + (left_block[0+i*2]&~1)]; - ec->ref_cache[list][cache_idx+8]= ec->ref_index[list][b8_x + (left_block[1+i*2]&~1)]; - }else{ - ec->ref_cache[list][cache_idx ]= - ec->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE); - } - } - }else{ - if(USES_LIST(left_type, list)){ - const int b8_x= 4*(mb_x-1) + 1; - ec->ref_cache[list][scan8[0] - 1]= ec->ref_index[list][b8_x + (left_block[0]&~1)]; - }else{ - ec->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; - } - } - - if(USES_LIST(topright_type, list)){ - ec->ref_cache[list][scan8[0] + 4 - 1*8]= ec->ref_index_top[list][4*(mb_x+1) + 2]; - }else{ - ec->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; - } - if(ec->ref_cache[list][scan8[0] + 4 - 1*8] < 0){ - int topleft_partition= -1; - if(USES_LIST(topleft_type, list)){ - const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2); - ec->ref_cache[list][scan8[0] - 1 - 1*8]= ec->ref_index_top[list][b8_x]; - }else{ - ec->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE; - } - } - - if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) - continue; - - if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) { - ec->ref_cache[list][scan8[4 ]] = - ec->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE; - - /* XXX beurk, Load mvd */ - if(USES_LIST(top_type, list)){ - AV_COPY64(ec->mvd_cache[list][scan8[0] + 0 - 1*8], ec->mvd_top[list][8*mb_x + 0]); - }else{ - AV_ZERO64(ec->mvd_cache[list][scan8[0] + 0 - 1*8]); - } - if(USES_LIST(left_type, list)){ - AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 0*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[0]]); - AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 1*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[1]]); - }else{ - AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 0*8]); - AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 1*8]); - } - if(USES_LIST(left_type, list)){ - AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 2*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[2]]); - AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 3*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[3]]); - }else{ - AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 2*8]); - AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 3*8]); - } - AV_ZERO16(ec->mvd_cache [list][scan8[4 ]]); - AV_ZERO16(ec->mvd_cache [list][scan8[12]]); - if(s->slice_type_nos == FF_B_TYPE){ - fill_rectangle(&ec->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1); - - if(IS_DIRECT(top_type)){ - AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1)); - }else if(IS_8X8(top_type)){ - int b8_x = 4*mb_x; - ec->direct_cache[scan8[0] + 0 - 1*8]= ec->direct_top[b8_x + 2]; - ec->direct_cache[scan8[0] + 2 - 1*8]= ec->direct_top[b8_x + 3]; - }else{ - AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1)); - } - - if(IS_DIRECT(left_type)) - ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1; - else if(IS_8X8(left_type)) - ec->direct_cache[scan8[0] - 1 + 0*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)]; - else - 
ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1; - - if(IS_DIRECT(left_type)) - ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1; - else if(IS_8X8(left_type)) - ec->direct_cache[scan8[0] - 1 + 2*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)]; - else - ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1; - } - } - } - } - ec->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type); -} - -static inline void write_back_non_zero_count(EntropyContext *ec, H264Slice *s){ - H264Mb *m = ec->m; - const int mb_x= m->mb_x; - - //bottom nnz - AV_COPY32(&ec->non_zero_count[mb_x][0], &ec->non_zero_count_cache[4+8*4] ); - ec->non_zero_count[mb_x][4] = ec->non_zero_count_cache[1+8*2]; - ec->non_zero_count[mb_x][5] = ec->non_zero_count_cache[2+8*2]; - ec->non_zero_count[mb_x][6] = ec->non_zero_count_cache[1+8*5]; - ec->non_zero_count[mb_x][7] = ec->non_zero_count_cache[2+8*5]; - - for (int i=0; i<2; i++) { - ec->non_zero_count_left[i*2+0] = ec->non_zero_count_cache[7+8*1 + 2*8*i]; - ec->non_zero_count_left[i*2+1] = ec->non_zero_count_cache[7+8*2 + 2*8*i]; - ec->non_zero_count_left[4+i*2+0] = ec->non_zero_count_cache[2+8*1 + 3*8*i]; - ec->non_zero_count_left[4+i*2+1] = ec->non_zero_count_cache[2+8*2 + 3*8*i]; - } - - AV_COPY32(&m->non_zero_count[ 0], &ec->non_zero_count_cache[4+8*1]); - AV_COPY32(&m->non_zero_count[ 4], &ec->non_zero_count_cache[4+8*2]); - AV_COPY32(&m->non_zero_count[ 8], &ec->non_zero_count_cache[4+8*3]); - AV_COPY32(&m->non_zero_count[12], &ec->non_zero_count_cache[4+8*4]); - - for (int i=0; i<2; i++) { - m->non_zero_count[16 + i*2 ] = ec->non_zero_count_cache[8*1 + 8*i + 1]; - m->non_zero_count[16 + i*2 +1] = ec->non_zero_count_cache[8*1 + 8*i + 2]; - m->non_zero_count[20 + i*2 ] = ec->non_zero_count_cache[8*4 + 8*i + 1]; - m->non_zero_count[20 + i*2 +1] = ec->non_zero_count_cache[8*4 + 8*i + 2]; - } -} - -static inline void write_back_motion(EntropyContext *ec, H264Slice *s, int mb_type){ - H264Mb *m = ec->m; - const int 
mb_x = m->mb_x; - const int b_x = 4*m->mb_x; //try mb2b(8)_xy - int list; - - for(list=0; listlist_count; list++){ - if(!USES_LIST(mb_type, list)) - continue; - - { - uint8_t (*mvd_dst)[2] = (void *) ec->mvd[list][8*mb_x]; - uint8_t (*mvd_src)[2] = &ec->mvd_cache[list][scan8[0]]; - if(IS_SKIP(mb_type)) - AV_ZERO128(mvd_dst); - else{ - AV_COPY64(mvd_dst, mvd_src + 8*3); - AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0); - AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1); - AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2); - } - } - int8_t *ref_index = &ec->ref_index[list][b_x]; - { - ref_index[0+0*2]= ec->ref_cache[list][scan8[0]]; - ref_index[1+0*2]= ec->ref_cache[list][scan8[4]]; - ref_index[0+1*2]= ec->ref_cache[list][scan8[8]]; - ref_index[1+1*2]= ec->ref_cache[list][scan8[12]]; - } - } - - if(s->slice_type_nos == FF_B_TYPE){ - if(IS_8X8(mb_type)){ - uint8_t *direct = &ec->direct[4*mb_x]; - direct[1] = m->sub_mb_type[1]>>1; - direct[2] = m->sub_mb_type[2]>>1; - direct[3] = m->sub_mb_type[3]>>1; - } - } -} - -static inline int get_dct8x8_allowed(EntropyContext *ec, H264Slice *s){ - H264Mb *m = ec->m; - if(s->direct_8x8_inference_flag) - return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL)); - else - return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL)); -} - -/** - * decodes a P_SKIP or B_SKIP macroblock - */ -static void decode_mb_skip(EntropyContext *ec, H264Slice *s){ - H264Mb *m = ec->m; - const int mb_x = m->mb_x; - int mb_type; - - if( s->slice_type_nos == FF_B_TYPE ) - mb_type= MB_TYPE_16x16|MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP; - else - mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; - - fill_rectangle(&ec->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); - write_back_motion(ec, s, mb_type); - m->mb_type = ec->mb_type[mb_x] = mb_type; - m->qscale_mb_xy = ec->qscale[mb_x]= ec->curr_qscale; - - AV_ZERO64(ec->non_zero_count[mb_x]); - 
AV_ZERO64(ec->non_zero_count_left); - memset(m->non_zero_count, 0, 24); -} - -static int decode_cabac_intra_mb_type(EntropyContext *ec, H264Slice *s, CABACContext *c, int ctx_base, int intra_slice) { - uint8_t *state= &c->cabac_state[ctx_base]; - int mb_type; - - if(intra_slice){ - int ctx=0; - if( ec->left_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) - ctx++; - if( ec->top_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) - ctx++; - if( get_cabac_noinline( c, &state[ctx] ) == 0 ) - return 0; /* I4x4 */ - state += 2; - }else{ - if( get_cabac_noinline( c, state ) == 0 ) - return 0; /* I4x4 */ - } - - if( get_cabac_terminate( c ) ) - return 25; /* PCM */ - - mb_type = 1; /* I16x16 */ - mb_type += 12 * get_cabac_noinline( c, &state[1] ); /* cbp_luma != 0 */ - if( get_cabac_noinline(c, &state[2] ) ) /* cbp_chroma */ - mb_type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice] ); - mb_type += 2 * get_cabac_noinline(c, &state[3+intra_slice] ); - mb_type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice] ); - return mb_type; -} - -static int decode_cabac_mb_skip(EntropyContext *ec, H264Slice *s, H264Mb *m, CABACContext *c) { - int ctx = 0; - - if( m->mb_x>0 && !IS_SKIP( ec->left_type )) - ctx++; - if( m->mb_y>0 && !IS_SKIP( ec->top_type )) - ctx++; - - if( s->slice_type_nos == FF_B_TYPE ) - ctx += 13; - return get_cabac_noinline(c, &c->cabac_state[11+ctx] ); -} - -static int decode_cabac_mb_intra4x4_pred_mode_delta( CABACContext *c) { - int mode = 0; - - if( get_cabac(c, &c->cabac_state[68] ) ) - return -1; - - mode += 1 * get_cabac(c, &c->cabac_state[69] ); - mode += 2 * get_cabac(c, &c->cabac_state[69] ); - mode += 4 * get_cabac(c, &c->cabac_state[69] ); - - return mode; -} - -static int decode_cabac_mb_chroma_pre_mode(EntropyContext *ec, H264Slice *s, CABACContext *c) { - H264Mb *m = ec->m; - const int mb_x = m->mb_x; - - int ctx = 0; - - /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */ - if( ec->left_type && 
ec->chroma_pred_mode[mb_x-1] != 0 ) - ctx++; - - if( ec->top_type && ec->chroma_pred_mode_top[mb_x] != 0 ) - ctx++; - - if( get_cabac_noinline(c, &c->cabac_state[64+ctx] ) == 0 ) - return 0; - - if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) - return 1; - if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) - return 2; - else - return 3; -} - -static int decode_cabac_mb_cbp_luma(EntropyContext *ec, CABACContext *c) { - int cbp_b, cbp_a, ctx, cbp = 0; - - cbp_a = ec->left_cbp; - cbp_b = ec->top_cbp; - - ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04); - cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]); - ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08); - cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 1; - ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01); - cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 2; - ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02); - cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 3; - return cbp; -} -static int decode_cabac_mb_cbp_chroma(EntropyContext *ec, CABACContext *c) { - int ctx; - int cbp_a, cbp_b; - - cbp_a = (ec->left_cbp>>4)&0x03; - cbp_b = (ec-> top_cbp>>4)&0x03; - - ctx = 0; - if( cbp_a > 0 ) ctx++; - if( cbp_b > 0 ) ctx += 2; - if( get_cabac_noinline(c, &c->cabac_state[77 + ctx] ) == 0 ) - return 0; - - ctx = 4; - if( cbp_a == 2 ) ctx++; - if( cbp_b == 2 ) ctx += 2; - return 1 + get_cabac_noinline(c, &c->cabac_state[77 + ctx] ); -} - -static int decode_cabac_p_mb_sub_type( CABACContext *c) { - if( get_cabac(c, &c->cabac_state[21] ) ) - return 0; /* 8x8 */ - if( !get_cabac(c, &c->cabac_state[22] ) ) - return 1; /* 8x4 */ - if( get_cabac(c, &c->cabac_state[23] ) ) - return 2; /* 4x8 */ - return 3; /* 4x4 */ -} -static int decode_cabac_b_mb_sub_type(CABACContext *c) { - int type; - if( !get_cabac(c, &c->cabac_state[36] ) ) - return 0; /* B_Direct_8x8 */ - if( !get_cabac(c, &c->cabac_state[37] ) ) - return 1 + get_cabac(c, &c->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */ - type = 3; - if( get_cabac(c, 
&c->cabac_state[38] ) ) { - if( get_cabac(c, &c->cabac_state[39] ) ) - return 11 + get_cabac(c, &c->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */ - type += 4; - } - type += 2*get_cabac(c, &c->cabac_state[39] ); - type += get_cabac(c, &c->cabac_state[39] ); - return type; -} - -static int decode_cabac_mb_ref(EntropyContext *ec, H264Slice *s, CABACContext *c, int list, int n ) { - int refa = ec->ref_cache[list][scan8[n] - 1]; - int refb = ec->ref_cache[list][scan8[n] - 8]; - int ref = 0; - int ctx = 0; - - if( s->slice_type_nos == FF_B_TYPE) { - if( refa > 0 && !(ec->direct_cache[scan8[n] - 1]&(MB_TYPE_DIRECT2>>1)) ) - ctx++; - if( refb > 0 && !(ec->direct_cache[scan8[n] - 8]&(MB_TYPE_DIRECT2>>1)) ) - ctx += 2; - } else { - if( refa > 0 ) - ctx++; - if( refb > 0 ) - ctx += 2; - } - - while( get_cabac(c, &c->cabac_state[54+ctx] ) ) { - ref++; - ctx = (ctx>>2)+4; - if(ref >= 32 /*h->ref_list[list]*/){ - return -1; - } - } - return ref; -} - -static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) { - int mvd; - - if(!get_cabac(c, &c->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){ - *mvda= 0; - return 0; - } - - mvd= 1; - ctxbase+= 3; - while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) { - if( mvd < 4 ) - ctxbase++; - mvd++; - } - - if( mvd >= 9 ) { - int k = 3; - while( get_cabac_bypass(c ) ) { - mvd += 1 << k; - k++; - if(k>24){ - av_log(AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n"); - return INT_MIN; - } - } - while( k-- ) { - mvd += get_cabac_bypass(c )<mvd_cache[list][scan8[n] - 1][0] +\ - ec->mvd_cache[list][scan8[n] - 8][0];\ - int amvd1 = ec->mvd_cache[list][scan8[n] - 1][1] +\ - ec->mvd_cache[list][scan8[n] - 8][1];\ -\ - m->mvd[list][mp][0] = decode_cabac_mb_mvd( c, 40, amvd0, &mpx ); \ - m->mvd[list][mp][1] = decode_cabac_mb_mvd( c, 47, amvd1, &mpy ); \ - mp++; \ -} - -static av_always_inline int get_cabac_cbf_ctx(EntropyContext *ec, H264Slice *s, int cat, int idx, int is_dc ) { - int nza, 
nzb; - int ctx = 0; - - if( is_dc ) { - if( cat == 0 ) { - nza = ec->left_cbp&0x100; - nzb = ec-> top_cbp&0x100; - } else { - nza = (ec->left_cbp>>(6+idx))&0x01; - nzb = (ec-> top_cbp>>(6+idx))&0x01; - } - } else { - assert(cat == 1 || cat == 2 || cat == 4); - nza = ec->non_zero_count_cache[scan8[idx] - 1]; - nzb = ec->non_zero_count_cache[scan8[idx] - 8]; - } - - if( nza > 0 ) - ctx++; - - if( nzb > 0 ) - ctx += 2; - - return ctx + 4 * cat; -} - -DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = { - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 -}; - -static const int significant_coeff_flag_offset[2][6] = { - { 105+0, 105+15, 105+29, 105+44, 105+47, 402 }, - { 277+0, 277+15, 277+29, 277+44, 277+47, 436 } -}; -static const int last_coeff_flag_offset[2][6] = { - { 166+0, 166+15, 166+29, 166+44, 166+47, 417 }, - { 338+0, 338+15, 338+29, 338+44, 338+47, 451 } -}; -static const int coeff_abs_level_m1_offset[6] = { - 227+0, 227+10, 227+20, 227+30, 227+39, 426 -}; -static const uint8_t significant_coeff_flag_offset_8x8[2][63] = { - { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, - 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, - 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11, - 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 }, - { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5, - 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11, - 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9, - 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 } -}; -/* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). -* 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). 
-* map node ctx => cabac ctx for level=1 */ -static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; -/* map node ctx => cabac ctx for level>1 */ -static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; -static const uint8_t coeff_abs_level_transition[2][8] = { - /* update node ctx after decoding a level=1 */ - { 1, 2, 3, 3, 4, 5, 6, 7 }, - /* update node ctx after decoding a level>1 */ - { 4, 4, 4, 4, 5, 6, 7, 7 } -}; - -static av_always_inline void decode_cabac_residual_internal(EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) { - H264Mb *m = ec->m; - const int mb_x = m->mb_x; - int index[64]; - - int av_unused last; - int coeff_count = 0; - int node_ctx = 0; - - uint8_t *significant_coeff_ctx_base; - uint8_t *last_coeff_ctx_base; - uint8_t *abs_level_m1_ctx_base; - - /* read coded block flag */ - if( is_dc || cat != 5 ) { - if( get_cabac( c, &c->cabac_state[85 + get_cabac_cbf_ctx( ec, s, cat, n, is_dc ) ] ) == 0 ) { - if( !is_dc ) - ec->non_zero_count_cache[scan8[n]] = 0; - return; - } - } - - significant_coeff_ctx_base = c->cabac_state - + significant_coeff_flag_offset[0][cat]; - last_coeff_ctx_base = c->cabac_state - + last_coeff_flag_offset[0][cat]; - abs_level_m1_ctx_base = c->cabac_state - + coeff_abs_level_m1_offset[cat]; - - if( !is_dc && cat == 5 ) { -#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \ - for(last= 0; last < coefs; last++) { \ - uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \ - if( get_cabac( c, sig_ctx )) { \ - uint8_t *last_ctx = last_coeff_ctx_base + last_off; \ - index[coeff_count++] = last; \ - if( get_cabac( c, last_ctx ) ) { \ - last= max_coeff; \ - break; \ - } \ - } \ - }\ - if( last == max_coeff -1 ) {\ - index[coeff_count++] = last;\ - } - - const uint8_t *sig_off = significant_coeff_flag_offset_8x8[0]; - DECODE_SIGNIFICANCE( 63, sig_off[last], 
last_coeff_flag_offset_8x8[last] ); - } else { - DECODE_SIGNIFICANCE( max_coeff - 1, last, last ); - } - assert(coeff_count > 0); - - if( is_dc ) { - if( cat == 0 ) - ec->cbp[mb_x] |= 0x100; - else - ec->cbp[mb_x] |= 0x40 << n; - } else { - if( cat == 5 ) - fill_rectangle(&ec->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1); - else { - assert( cat == 1 || cat == 2 || cat == 4 ); - ec->non_zero_count_cache[scan8[n]] = coeff_count; - } - } - - do { - uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base; - - int j= scantable[index[--coeff_count]]; - - if( get_cabac( c, ctx ) == 0 ) { - node_ctx = coeff_abs_level_transition[0][node_ctx]; - if( is_dc ) { - block[j] = get_cabac_bypass_sign( c, -1); - }else{ - block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6; - } - } else { - int coeff_abs = 2; - ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; - node_ctx = coeff_abs_level_transition[1][node_ctx]; - - while( coeff_abs < 15 && get_cabac( c, ctx ) ) { - coeff_abs++; - } - - if( coeff_abs >= 15 ) { - int j = 0; - while( get_cabac_bypass( c ) ) { - j++; - } - - coeff_abs=1; - while( j-- ) { - coeff_abs += coeff_abs + get_cabac_bypass( c ); - } - coeff_abs+= 14; - } - - if( is_dc ) { - block[j] = get_cabac_bypass_sign( c, -coeff_abs ); - }else{ - block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6; - } - } - } while( coeff_count ); - -} - -static void decode_cabac_residual_dc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) { - decode_cabac_residual_internal( ec, s, c, block, cat, n, scantable, NULL, max_coeff, 1); -} - -static void decode_cabac_residual_nondc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { - decode_cabac_residual_internal( ec, s, c, block, cat, n, scantable, qmul, max_coeff, 0); -} - -/** - * decodes a macroblock 
- * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed - */ -int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c) { - H264Mb *m = ec->m; - int mb_x = m->mb_x; - int mb_type, partition_count, cbp = 0; - int dct8x8_allowed= s->pps.transform_8x8_mode; - - fill_decode_neighbors(ec, s); - - if( s->slice_type_nos != FF_I_TYPE ) { - int skip; - /* a skipped mb needs the aff flag from the following mb */ - skip = decode_cabac_mb_skip( ec, s, m, c); - - /* read skip flags */ - if( skip ) { - decode_mb_skip(ec, s); - m->cbp = ec->cbp[mb_x] = 0; - ec->chroma_pred_mode[mb_x] = 0; - ec->last_qscale_diff = 0; - return 0; - } - } - - if( s->slice_type_nos == FF_B_TYPE ) { - int ctx = 0; - - if( !IS_DIRECT( ec->left_type-1 ) ) - ctx++; - if( !IS_DIRECT( ec->top_type-1 ) ) - ctx++; - - if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){ - mb_type= 0; /* B_Direct_16x16 */ - }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) { - mb_type= 1 + get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */ - }else{ - int bits; - bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3; - bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2; - bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1; - bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ); - if( bits < 8 ){ - mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */ - }else if( bits == 13 ){ - mb_type= decode_cabac_intra_mb_type(ec, s, c, 32, 0); - goto decode_intra_mb; - }else if( bits == 14 ){ - mb_type= 11; /* B_L1_L0_8x16 */ - }else if( bits == 15 ){ - mb_type= 22; /* B_8x8 */ - }else{ - bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] ); - mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */ - } - } - partition_count= b_mb_type_info[mb_type].partition_count; - mb_type= b_mb_type_info[mb_type].type; - } else if( s->slice_type_nos == FF_P_TYPE ) { - if( get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) { - /* P-type */ - if( 
get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) { - /* P_L0_D16x16, P_8x8 */ - mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] ); - } else { - /* P_L0_D8x16, P_L0_D16x8 */ - mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] ); - } - partition_count= p_mb_type_info[mb_type].partition_count; - mb_type= p_mb_type_info[mb_type].type; - } else { - mb_type= decode_cabac_intra_mb_type(ec, s, c, 17, 0); - goto decode_intra_mb; - } - } else { - mb_type= decode_cabac_intra_mb_type(ec, s ,c, 3, 1); - if(s->slice_type == FF_SI_TYPE && mb_type) - mb_type--; - assert(s->slice_type_nos == FF_I_TYPE); -decode_intra_mb: - partition_count = 0; - cbp= i_mb_type_info[mb_type].cbp; - m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode; - mb_type= i_mb_type_info[mb_type].type; - } - - if(IS_INTRA_PCM(mb_type)) { - const uint8_t *ptr; - // We assume these blocks are very rare so we do not optimize it. - // FIXME The two following lines get the bitstream position in the cabac - // decode, I think it should be done by a function in cabac.h (or cabac.c). - ptr=c->bytestream; - if(c->low&0x1) ptr--; - if(CABAC_BITS==16){ - if(c->low&0x1FF) ptr--; - } - //printf("pcm\n"); - // The pixels are stored in the same order as levels in h->mb array. 
- memcpy(m->mb, ptr, 256); ptr+=256; - memcpy(m->mb+128, ptr, 128); ptr+=128; - - ff_init_cabac_decoder(c, ptr, c->bytestream_end - ptr); - - // All blocks are present - m->cbp= ec->cbp[mb_x] = 0x1ef; - ec->chroma_pred_mode[mb_x] = 0; - // In deblocking, the quantizer is 0 - m->qscale_mb_xy = ec->qscale[mb_x]= 0; - // All coeffs are present - memset(ec->non_zero_count[mb_x], 16, 8); - m->mb_type = ec->mb_type[mb_x]= mb_type; - ec->last_qscale_diff = 0; - - return 0; - } - - fill_decode_caches(ec, s, mb_type); - - int mp = 0; - if( IS_INTRA( mb_type ) ) { - int i, pred_mode; - if( IS_INTRA4x4( mb_type ) ) { - if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] ) ) { - mb_type |= MB_TYPE_8x8DCT; - for( i = 0; i < 16; i+=4 ) { - m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c); - } - } else { - for( i = 0; i < 16; i++ ) { - m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c); - } - } - } - - m->chroma_pred_mode= ec->chroma_pred_mode[mb_x] = - pred_mode = decode_cabac_mb_chroma_pre_mode( ec, s, c ); - - } else if( partition_count == 4 ) { - int i, j, sub_partition_count[4], list; - - if( s->slice_type_nos == FF_B_TYPE ) { - for( i = 0; i < 4; i++ ) { - m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c ); - sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count; - m->sub_mb_type[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].type; - } - if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] | - m->sub_mb_type[2] | m->sub_mb_type[3]) ) { - ec->ref_cache[0][scan8[4]] = - ec->ref_cache[1][scan8[4]] = - ec->ref_cache[0][scan8[12]] = - ec->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE; - - for( i = 0; i < 4; i++ ) - fill_rectangle( &ec->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 ); - } - } else { - for( i = 0; i < 4; i++ ) { - m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c ); - sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] 
].partition_count; - m->sub_mb_type[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].type; - } - } - - for( list = 0; list < s->list_count; list++ ) { - for( i = 0; i < 4; i++ ) { - if(IS_DIRECT(m->sub_mb_type[i])) continue; - if(IS_DIR(m->sub_mb_type[i], 0, list)){ - if( s->ref_count[list] > 1 ){ - m->ref_index[list][i] = decode_cabac_mb_ref(ec, s, c, list, 4*i ); - if(m->ref_index[list][i] >= s->ref_count[list]){ - av_log(AV_LOG_ERROR, "Reference %d >= %d\n", m->ref_index[list][i], s->ref_count[list]); - return -1; - } - }else - m->ref_index[list][i] = 0; - } else { - m->ref_index[list][i] = -1; - } - ec->ref_cache[list][ scan8[4*i] ]=ec->ref_cache[list][ scan8[4*i]+1 ]= - ec->ref_cache[list][ scan8[4*i]+8 ]=ec->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i]; - } - } - - if(dct8x8_allowed){ -// assert(0); - dct8x8_allowed = get_dct8x8_allowed(ec, s); - } - - for(list=0; listlist_count; list++){ - for(i=0; i<4; i++){ -// ec->ref_cache[list][ scan8[4*i] ]=ec->ref_cache[list][ scan8[4*i]+1 ]; - if(IS_DIRECT(m->sub_mb_type[i])){ - fill_rectangle(ec->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2); - continue; - } - - if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){ - const int sub_mb_type= m->sub_mb_type[i]; - const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 
2 : 1; - for(j=0; jmvd_cache[list][ scan8[index]]; - - DECODE_CABAC_MB_MVD( ec, c, list, index) - - if(IS_SUB_8X8(sub_mb_type)){ - mvd_cache[ 1 ][0]= - mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx; - mvd_cache[ 1 ][1]= - mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy; - }else if(IS_SUB_8X4(sub_mb_type)){ - mvd_cache[ 1 ][0]= mpx; - mvd_cache[ 1 ][1]= mpy; - }else if(IS_SUB_4X8(sub_mb_type)){ - mvd_cache[ 8 ][0]= mpx; - mvd_cache[ 8 ][1]= mpy; - } - mvd_cache[ 0 ][0]= mpx; - mvd_cache[ 0 ][1]= mpy; - } - }else{ - fill_rectangle(ec->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2); - } - } - } - } else if( IS_DIRECT(mb_type) ) { - mb_type |= MB_TYPE_16x16; - fill_rectangle(ec->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2); - fill_rectangle(ec->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2); - dct8x8_allowed &= s->direct_8x8_inference_flag; - } else { - int list, i; - if(IS_16X16(mb_type)){ - for(list=0; listlist_count; list++){ - if(IS_DIR(mb_type, 0, list)){ - int ref; - if(s->ref_count[list] > 1){ - ref= decode_cabac_mb_ref(ec, s, c, list, 0); - if(ref >= s->ref_count[list]){ - av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]); - return -1; - } - }else - ref=0; - m->ref_index[list][0]= ref; - fill_rectangle(&ec->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1); - } - } - for(list=0; listlist_count; list++){ - if(IS_DIR(mb_type, 0, list)){ - int mpx,mpy; - DECODE_CABAC_MB_MVD( ec, c, list, 0) - - fill_rectangle(ec->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2); - } - - } - } - else if(IS_16X8(mb_type)){ - for(list=0; listlist_count; list++){ - for(i=0; i<2; i++){ - if(IS_DIR(mb_type, i, list)){ - int ref; - if(s->ref_count[list] > 1){ - ref= decode_cabac_mb_ref(ec, s, c, list, 8*i ); - if(ref >= s->ref_count[list]){ - av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]); - return -1; - } - }else - ref=0; - m->ref_index[list][i]= ref; - fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1); - }else{ - m->ref_index[list][i]= 
LIST_NOT_USED; - fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1); - } - } - } - for(list=0; listlist_count; list++){ - for(i=0; i<2; i++){ - if(IS_DIR(mb_type, i, list)){ - int mpx,mpy; - DECODE_CABAC_MB_MVD( ec, c, list, 8*i) - - fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2); - }else{ - fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2); - } - } - } - }else{ - assert(IS_8X16(mb_type)); - for(list=0; listlist_count; list++){ - for(i=0; i<2; i++){ - if(IS_DIR(mb_type, i, list)){ //FIXME optimize - int ref; - if(s->ref_count[list] > 1){ - ref= decode_cabac_mb_ref(ec, s, c, list, 4*i ); - if(ref >= s->ref_count[list]){ - av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]); - return -1; - } - }else - ref=0; - m->ref_index[list][i]= ref; - fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1); - }else{ - m->ref_index[list][i]= LIST_NOT_USED; - fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1); - } - } - } - for(list=0; listlist_count; list++){ - for(i=0; i<2; i++){ - if(IS_DIR(mb_type, i, list)){ - int mpx,mpy; - DECODE_CABAC_MB_MVD( ec, c, list, 4*i) - - fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2); - }else{ - fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2); - } - } - } - } - } - - if( IS_INTER( mb_type ) ||(IS_DIRECT(mb_type))) { - ec->chroma_pred_mode[mb_x] = 0; - write_back_motion( ec, s, mb_type ); - } - - if( !IS_INTRA16x16( mb_type ) ) { - cbp = decode_cabac_mb_cbp_luma( ec, c); - cbp |= decode_cabac_mb_cbp_chroma( ec, c ) << 4; - } - - ec->cbp[mb_x] = m->cbp = cbp; - - if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) { - int t = get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] ); - mb_type |= MB_TYPE_8x8DCT * t; - } - m->mb_type = ec->mb_type[mb_x] = mb_type; - - if( cbp || IS_INTRA16x16( mb_type 
) ) { - const uint8_t *scan, *scan8x8, *dc_scan; - const uint32_t *qmul; - - - if (s->transform_bypass && ec->curr_qscale){ - scan8x8= ff_zigzag_direct; - scan= zigzag_scan; - }else{ - scan8x8= ec->zigzag_scan8x8; - scan= ec->zigzag_scan; - } - dc_scan= luma_dc_zigzag_scan; - - // decode_cabac_mb_dqp - if(get_cabac_noinline(c, &c->cabac_state[60 + (ec->last_qscale_diff != 0)])){ - int val = 1; - int ctx= 2; - - while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) { - ctx= 3; - val++; - if(val > 102){ //prevent infinite loop - av_log(AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", m->mb_x, m->mb_y); - return -1; - } - } - - if( val&0x01 ) - val= (val + 1)>>1 ; - else - val= -((val + 1)>>1); - ec->last_qscale_diff = val; - ec->curr_qscale += val; - if(((unsigned)ec->curr_qscale) > 51){ - if(ec->curr_qscale<0) ec->curr_qscale+= 52; - else ec->curr_qscale-= 52; - } - ec->chroma_qp[0] = get_chroma_qp( s, 0, ec->curr_qscale); - ec->chroma_qp[1] = get_chroma_qp( s, 1, ec->curr_qscale); - }else - ec->last_qscale_diff=0; - - memset(m->mb, 0, 16*16 * sizeof(DCTELEM)); - if( IS_INTRA16x16( mb_type ) ) { - int i; - - //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" ); - decode_cabac_residual_dc( ec, s, c, m->mb, 0, 0, dc_scan, 16); - qmul = ec->dequant4_coeff[0][ec->curr_qscale]; - if( cbp&15 ) { - for( i = 0; i < 16; i++ ) { - //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i ); - decode_cabac_residual_nondc( ec, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15); - } - } else { - fill_rectangle(&ec->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1); - } - h264_luma_dc_dequant_idct_c(m->mb, qmul[0]); - } else { - - int i8x8, i4x4; - for( i8x8 = 0; i8x8 < 4; i8x8++ ) { - if( cbp & (1<mb + 64*i8x8, 5, 4*i8x8, - scan8x8, ec->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][ec->curr_qscale], 64); - } else { - qmul = ec->dequant4_coeff[IS_INTRA( mb_type ) ? 
0:3][ec->curr_qscale]; - for( i4x4 = 0; i4x4 < 4; i4x4++ ) { - const int index = 4*i8x8 + i4x4; - //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index ); -//START_TIMER - decode_cabac_residual_nondc(ec, s, c, m->mb + 16*index, 2, index, scan, qmul, 16); -//STOP_TIMER("decode_residual") - } - } - } else { - uint8_t * const nnz= &ec->non_zero_count_cache[ scan8[4*i8x8] ]; - nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0; - } - } - } - - if( cbp&0x30 ){ - memset(m->mb + 256, 0, 2*64 * sizeof(DCTELEM)); - for( int i = 0; i < 2; i++ ) { - const uint32_t dequant4_coeff = ec->dequant4_coeff[IS_INTRA(mb_type) ? 1+i:4+i][ec->chroma_qp[i]][0]; - - //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); - decode_cabac_residual_dc(ec, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4); - chroma_dc_dequant_idct_c(m->mb + 256 + 16*4*i, dequant4_coeff); - } - } - - if( cbp&0x20 ) { - int i, j; - for( i = 0; i < 2; i++ ) { - qmul = ec->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 0:3)][ec->chroma_qp[i]]; - for( j = 0; j < 4; j++ ) { - const int index = 16 + 4 * i + j; - //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 ); - decode_cabac_residual_nondc( ec, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15); - } - } - } else { - uint8_t * const nnz= &ec->non_zero_count_cache[0]; - nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = - nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; - } - - } else { - uint8_t * const nnz= &ec->non_zero_count_cache[0]; - fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1); - nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = - nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; - ec->last_qscale_diff = 0; - } - - m->qscale_mb_xy = ec->qscale[mb_x]= ec->curr_qscale; - write_back_non_zero_count(ec, s); - - - return 0; -} - -void free_entropy_context(EntropyContext *ec){ - av_freep(&ec->non_zero_count_row[0]); - 
av_freep(&ec->non_zero_count_row[1]); - av_freep(&ec->mvd_table[0][0]); - av_freep(&ec->mvd_table[0][1]); - av_freep(&ec->mvd_table[1][0]); - av_freep(&ec->mvd_table[1][1]); - - av_freep(&ec->direct_table[0]); - av_freep(&ec->direct_table[1]); - av_freep(&ec->chroma_pred_mode_table[0]); - av_freep(&ec->chroma_pred_mode_table[1]); - av_freep(&ec->cbp_table[0]); - av_freep(&ec->cbp_table[1]); - av_freep(&ec->qscale_table[0]); - av_freep(&ec->qscale_table[1]); - - av_freep(&ec->mb_type_table[0]); - av_freep(&ec->mb_type_table[1]); - av_freep(&ec->ref_index_table[0][0]); - av_freep(&ec->ref_index_table[0][1]); - av_freep(&ec->ref_index_table[1][0]); - av_freep(&ec->ref_index_table[1][1]); - - - av_free(ec); -} - -EntropyContext *get_entropy_context(H264Context *h){ - const int mb_height = h->mb_height; - const int mb_width = h->mb_width; - const int mb_stride = h->mb_stride; - - EntropyContext *ec = av_mallocz(sizeof(EntropyContext)); - - ec->mb_width = mb_width; - ec->mb_height = mb_height; - ec->b_stride = mb_width*4; - ec->mb_stride = mb_stride; - - FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[0], mb_stride * 8 * sizeof(uint8_t), fail) - FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[1], mb_stride * 8 * sizeof(uint8_t), fail) - - FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][0], 16*mb_stride * sizeof(uint8_t), fail); - FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][1], 16*mb_stride * sizeof(uint8_t), fail); - FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][0], 16*mb_stride * sizeof(uint8_t), fail); - FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][1], 16*mb_stride * sizeof(uint8_t), fail); - - FF_ALLOCZ_OR_GOTO(ec->direct_table[0], 4*mb_stride * sizeof(uint8_t) , fail); - FF_ALLOCZ_OR_GOTO(ec->direct_table[1], 4*mb_stride * sizeof(uint8_t) , fail); - - FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[0], mb_stride * sizeof(uint8_t), fail) - FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[1], mb_stride * sizeof(uint8_t), fail) - - FF_ALLOCZ_OR_GOTO(ec->cbp_table[0], mb_stride * sizeof(uint16_t), fail) - 
FF_ALLOCZ_OR_GOTO(ec->cbp_table[1], mb_stride * sizeof(uint16_t), fail) - - FF_ALLOCZ_OR_GOTO(ec->qscale_table[0], mb_stride * sizeof(uint8_t) , fail) - FF_ALLOCZ_OR_GOTO(ec->qscale_table[1], mb_stride * sizeof(uint8_t) , fail) - - FF_ALLOCZ_OR_GOTO(ec->mb_type_table[0] , (mb_stride+1) * sizeof(uint32_t), fail) - FF_ALLOCZ_OR_GOTO(ec->mb_type_table[1] , (mb_stride+1) * sizeof(uint32_t), fail) - - FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][0], 4*mb_stride * sizeof(int8_t), fail) - FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][0], 4*mb_stride * sizeof(int8_t), fail) - FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][1], 4*mb_stride * sizeof(int8_t), fail) - FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][1], 4*mb_stride * sizeof(int8_t), fail) - - ec->zigzag_scan = h->zigzag_scan; - ec->zigzag_scan8x8 = h->zigzag_scan8x8; - - return ec; -fail: - free_entropy_context(ec); - return NULL; -} - -void init_entropy_buf(EntropyContext *ec, H264Slice *s, int line){ - int top = (line+1)%2; - int cur = line%2; - - ec->non_zero_count_top = ec->non_zero_count_row[top]; - ec->non_zero_count = ec->non_zero_count_row[cur]; - ec->mvd_top[0] = ec->mvd_table[0][top]; - ec->mvd[0] = ec->mvd_table[0][cur]; - ec->mvd_top[1] = ec->mvd_table[1][top]; - ec->mvd[1] = ec->mvd_table[1][cur]; - ec->direct_top = ec->direct_table[top]; - ec->direct = ec->direct_table[cur]; - ec->chroma_pred_mode_top = ec->chroma_pred_mode_table[top]; - ec->chroma_pred_mode = ec->chroma_pred_mode_table[cur]; - ec->cbp_top = ec->cbp_table[top]; - ec->cbp = ec->cbp_table[cur]; - ec->qscale_top = ec->qscale_table[top] +1; - ec->qscale = ec->qscale_table[cur] +1; - ec->mb_type_top = ec->mb_type_table[top]+1; - ec->mb_type = ec->mb_type_table[cur]+1; - ec->ref_index_top[0] = ec->ref_index_table[0][top]; - ec->ref_index_top[1] = ec->ref_index_table[1][top]; - ec->ref_index[0] = ec->ref_index_table[0][cur]; - ec->ref_index[1] = ec->ref_index_table[1][cur]; - -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_entropy.h 
--- a/ffmpeg_smp/h264dec/libavcodec/h264_entropy.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ -#ifndef H264_CABAC_H -#define H264_CABAC_H - -#include "h264_types.h" -#include "cabac.h" - -/** - * decodes a CABAC coded macroblock - * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed - */ - -int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c); -void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c); - -int init_entropy_buf(EntropyContext *ec, H264Slice *s, int line); -EntropyContext * get_entropy_context(H264Context *h); -void init_dequant_tables(H264Slice *s, EntropyContext *ec); -void free_entropy_context(EntropyContext *ec); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_idct.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_idct.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,270 +0,0 @@ -/* - * H.264 IDCT - * Copyright (c) 2004 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 IDCT. 
- * @author Michael Niedermayer - */ - -#include "dsputil.h" -#include "h264_data.h" - -static av_always_inline void idct_internal(uint8_t *dst, DCTELEM *block, int stride, int block_stride, int shift, int add){ - int i; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - - block[0] += 1<<(shift-1); - - for(i=0; i<4; i++){ - const int z0= block[0 + block_stride*i] + block[2 + block_stride*i]; - const int z1= block[0 + block_stride*i] - block[2 + block_stride*i]; - const int z2= (block[1 + block_stride*i]>>1) - block[3 + block_stride*i]; - const int z3= block[1 + block_stride*i] + (block[3 + block_stride*i]>>1); - - block[0 + block_stride*i]= z0 + z3; - block[1 + block_stride*i]= z1 + z2; - block[2 + block_stride*i]= z1 - z2; - block[3 + block_stride*i]= z0 - z3; - } - - for(i=0; i<4; i++){ - const int z0= block[i + block_stride*0] + block[i + block_stride*2]; - const int z1= block[i + block_stride*0] - block[i + block_stride*2]; - const int z2= (block[i + block_stride*1]>>1) - block[i + block_stride*3]; - const int z3= block[i + block_stride*1] + (block[i + block_stride*3]>>1); - - dst[i + 0*stride]= cm[ add*dst[i + 0*stride] + ((z0 + z3) >> shift) ]; - dst[i + 1*stride]= cm[ add*dst[i + 1*stride] + ((z1 + z2) >> shift) ]; - dst[i + 2*stride]= cm[ add*dst[i + 2*stride] + ((z1 - z2) >> shift) ]; - dst[i + 3*stride]= cm[ add*dst[i + 3*stride] + ((z0 - z3) >> shift) ]; - } -} - -void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride){ - idct_internal(dst, block, stride, 4, 6, 1); -} - -void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block){ - idct_internal(dst, block, stride, 8, 3, 1); -} - -void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block){ - idct_internal(dst, block, stride, 8, 3, 0); -} - -void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){ - int i; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - - block[0] += 32; - - for( i = 0; i < 8; i++ ) - { - const int a0 = block[0+i*8] + block[4+i*8]; - const int a2 = 
block[0+i*8] - block[4+i*8]; - const int a4 = (block[2+i*8]>>1) - block[6+i*8]; - const int a6 = (block[6+i*8]>>1) + block[2+i*8]; - - const int b0 = a0 + a6; - const int b2 = a2 + a4; - const int b4 = a2 - a4; - const int b6 = a0 - a6; - - const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1); - const int a3 = block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1); - const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1); - const int a7 = block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1); - - const int b1 = (a7>>2) + a1; - const int b3 = a3 + (a5>>2); - const int b5 = (a3>>2) - a5; - const int b7 = a7 - (a1>>2); - - block[0+i*8] = b0 + b7; - block[7+i*8] = b0 - b7; - block[1+i*8] = b2 + b5; - block[6+i*8] = b2 - b5; - block[2+i*8] = b4 + b3; - block[5+i*8] = b4 - b3; - block[3+i*8] = b6 + b1; - block[4+i*8] = b6 - b1; - } - for( i = 0; i < 8; i++ ) - { - const int a0 = block[i+0*8] + block[i+4*8]; - const int a2 = block[i+0*8] - block[i+4*8]; - const int a4 = (block[i+2*8]>>1) - block[i+6*8]; - const int a6 = (block[i+6*8]>>1) + block[i+2*8]; - - const int b0 = a0 + a6; - const int b2 = a2 + a4; - const int b4 = a2 - a4; - const int b6 = a0 - a6; - - const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1); - const int a3 = block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1); - const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1); - const int a7 = block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1); - - const int b1 = (a7>>2) + a1; - const int b3 = a3 + (a5>>2); - const int b5 = (a3>>2) - a5; - const int b7 = a7 - (a1>>2); - - dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ]; - dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ]; - dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ]; - dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ]; - dst[i + 4*stride] = cm[ dst[i + 4*stride] 
+ ((b6 - b1) >> 6) ]; - dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ]; - dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ]; - dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ]; - } -} - -// assumes all AC coefs are 0 -void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ - int i, j; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - int dc = (block[0] + 32) >> 6; - for( j = 0; j < 4; j++ ) - { - for( i = 0; i < 4; i++ ) - dst[i] = cm[ dst[i] + dc ]; - dst += stride; - } -} - -void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ - int i, j; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - int dc = (block[0] + 32) >> 6; - for( j = 0; j < 8; j++ ) - { - for( i = 0; i < 8; i++ ) - dst[i] = cm[ dst[i] + dc ]; - dst += stride; - } -} - -void ff_h264_idct_add16_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride); - else idct_internal (dst + block_offset[i], block + i*16, stride, 4, 6, 1); - } - } -} - -void ff_h264_idct_add16intra_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - if(nnzc[ scan8[i] ]) idct_internal (dst + block_offset[i], block + i*16, stride, 4, 6, 1); - else if(block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride); - } -} - -void ff_h264_idct8_add4_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=4){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_c(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct8_add_c (dst + block_offset[i], block + i*16, stride); - } - } -} - -void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block, 
int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=16; i<16+8; i++){ - if(nnzc[ scan8[i] ]) - ff_h264_idct_add_c (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - else if(block[i*16]) - ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - } -} - -/** -* IDCT transforms the 16 dc values and dequantizes them. -* @param qp quantization parameter -*/ -void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul){ - #define stride 16 - int i; - int temp[16]; //FIXME check if this is a good idea - static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; - static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; - - //return; - for(i=0; i<4; i++){ - const int offset= y_offset[i]; - const int z0= block[offset+stride*0] + block[offset+stride*4]; - const int z1= block[offset+stride*0] - block[offset+stride*4]; - const int z2= block[offset+stride*1] - block[offset+stride*5]; - const int z3= block[offset+stride*1] + block[offset+stride*5]; - - temp[4*i+0]= z0+z3; - temp[4*i+1]= z1+z2; - temp[4*i+2]= z1-z2; - temp[4*i+3]= z0-z3; - } - - for(i=0; i<4; i++){ - const int offset= x_offset[i]; - const int z0= temp[4*0+i] + temp[4*2+i]; - const int z1= temp[4*0+i] - temp[4*2+i]; - const int z2= temp[4*1+i] - temp[4*3+i]; - const int z3= temp[4*1+i] + temp[4*3+i]; - - block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual - block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); - block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); - block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); - } -} - -#undef xStride -#undef stride - -void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul){ - const int stride= 16*2; - const int xStride= 16; - int a,b,c,d,e; - - a= block[stride*0 + xStride*0]; - b= block[stride*0 + xStride*1]; - c= block[stride*1 + xStride*0]; - d= block[stride*1 + xStride*1]; - - e= a-b; - a= a+b; - b= c-d; - c= c+d; - - block[stride*0 + 
xStride*0]= ((a+c)*qmul) >> 7; - block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7; - block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7; - block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_idct.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_idct.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#ifndef H264_IDCT_H -#define H264_IDCT_H - -#include "avcodec.h" - -void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride); -void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride); -void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); -void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); -void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block); -void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block); -void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); -void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); -void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); -void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); -void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul); -void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_mc.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_mc.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,272 +0,0 @@ -#include "h264_types.h" -#include "h264_data.h" - -static inline void mc_dir_part(MBRecContext *d, MBRecState *mrs, H264Mb *m, DecodedPicture *pic, int n, int square, - int chroma_height, int delta, int list,uint8_t *dest_y, - uint8_t *dest_cb, uint8_t *dest_cr, int 
src_x_offset, int src_y_offset, - qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){ - const int mx= mrs->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; - const int my= mrs->mv_cache[list][ scan8[n] ][1] + src_y_offset*8; - const int luma_xy= (mx&3) + ((my&3)<<2); - const int pic_width = 16*d->mb_width; - const int pic_height = 16*d->mb_height; - - uint8_t *src_y, *src_cb, *src_cr; - int ymx= mx>>2; - int ymy= my>>2; - int cmy= my>>3; - int cmx= mx>>3; - - //truncate the motion vectors references - if(ymy>= pic_height+2){ - ymy=pic_height+1; - }else if(ymy <=-19){ - ymy=-18; - } - if(ymx>= pic_width+2){ - ymx= pic_width+1; - }else if(ymx<=-19){ - ymx=-19; - } - - src_y = pic->data[0] + ymx + ymy*d->linesize; - qpix_op[luma_xy](dest_y, src_y, d->linesize); //FIXME try variable height perhaps? - if(!square){ - qpix_op[luma_xy](dest_y + delta, src_y + delta, d->linesize); - } - - if(cmy >= pic_height>>1){ - cmy = (pic_height>>1) -1; - }else if(cmy<=-9){ - cmy=-8; - } - if(cmx >= pic_width>>1){ - cmx = (pic_width>>1) -1; - }else if(cmx<=-9){ - cmx=-8; - } - - src_cb= pic->data[1] + cmx + cmy*d->uvlinesize; - src_cr= pic->data[2] + cmx + cmy*d->uvlinesize; - - chroma_op(dest_cb, src_cb, d->uvlinesize, chroma_height, mx&7, my&7); - chroma_op(dest_cr, src_cr, d->uvlinesize, chroma_height, mx&7, my&7); -} - -static inline void mc_part_std(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, - int list0, int list1){ - qpel_mc_func *qpix_op= qpix_put; - h264_chroma_mc_func chroma_op= chroma_put; - - dest_y += 2*x_offset + 2*y_offset*d-> linesize; - dest_cb += x_offset + y_offset*d->uvlinesize; - dest_cr += x_offset + y_offset*d->uvlinesize; - x_offset += 8*m->mb_x; - y_offset += 8*m->mb_y; - - if(list0){ - 
DecodedPicture *ref= s->dp_ref_list[0][ mrs->ref_cache[0][ scan8[n] ] ]; - mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, 0, - dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op); - - qpix_op= qpix_avg; - chroma_op= chroma_avg; - } - - if(list1){ - DecodedPicture *ref= s->dp_ref_list[1][ mrs->ref_cache[1][ scan8[n] ] ]; - mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, 1, - dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op); - } -} - -static inline void mc_part_weighted(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, - h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, - int list0, int list1){ - dest_y += 2*x_offset + 2*y_offset*d-> linesize; - dest_cb += x_offset + y_offset*d->uvlinesize; - dest_cr += x_offset + y_offset*d->uvlinesize; - x_offset += 8*m->mb_x; - y_offset += 8*m->mb_y; - - if(list0 && list1){ - /* don't optimize for luma-only case, since B-frames usually - * use implicit weights => chroma too. 
*/ - uint8_t *tmp_y = d->scratchpad_y + 2*x_offset +16 ; - uint8_t *tmp_cb = d->scratchpad_cb + x_offset + 8; - uint8_t *tmp_cr = d->scratchpad_cr + x_offset + 8; - -/* - uint8_t *tmp_cb = d->scratchpad; - uint8_t *tmp_cr = d->scratchpad + 8; - uint8_t *tmp_y = d->scratchpad + 8*d->uvlinesize;*/ - int refn0 = mrs->ref_cache[0][ scan8[n] ]; - int refn1 = mrs->ref_cache[1][ scan8[n] ]; - - mc_dir_part(d, mrs, m, s->dp_ref_list[0][refn0], n, square, chroma_height, delta, 0, - dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put); - mc_dir_part(d, mrs, m, s->dp_ref_list[1][refn1], n, square, chroma_height, delta, 1, - tmp_y, tmp_cb, tmp_cr, x_offset, y_offset, qpix_put, chroma_put); - - if(s->use_weight == 2){ - int weight0 = s->implicit_weight[refn0][refn1][m->mb_y&1]; - int weight1 = 64 - weight0; - luma_weight_avg( dest_y, tmp_y, d-> linesize, 5, weight0, weight1, 0); - chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, 5, weight0, weight1, 0); - chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, 5, weight0, weight1, 0); - }else{ - luma_weight_avg(dest_y, tmp_y, d->linesize, s->luma_log2_weight_denom, - s->luma_weight[refn0][0][0] , s->luma_weight[refn1][1][0], - s->luma_weight[refn0][0][1] + s->luma_weight[refn1][1][1]); - chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, s->chroma_log2_weight_denom, - s->chroma_weight[refn0][0][0][0] , s->chroma_weight[refn1][1][0][0], - s->chroma_weight[refn0][0][0][1] + s->chroma_weight[refn1][1][0][1]); - chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, s->chroma_log2_weight_denom, - s->chroma_weight[refn0][0][1][0] , s->chroma_weight[refn1][1][1][0], - s->chroma_weight[refn0][0][1][1] + s->chroma_weight[refn1][1][1][1]); - } - }else{ - int list = list1 ? 
1 : 0; - int refn = mrs->ref_cache[list][ scan8[n] ]; - DecodedPicture *ref= s->dp_ref_list[list][refn]; - mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, list, - dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put); - - luma_weight_op(dest_y, d->linesize, s->luma_log2_weight_denom, - s->luma_weight[refn][list][0], s->luma_weight[refn][list][1]); - if(s->use_weight_chroma){ - chroma_weight_op(dest_cb, d->uvlinesize, s->chroma_log2_weight_denom, - s->chroma_weight[refn][list][0][0], s->chroma_weight[refn][list][0][1]); - chroma_weight_op(dest_cr, d->uvlinesize, s->chroma_log2_weight_denom, - s->chroma_weight[refn][list][1][0], s->chroma_weight[refn][list][1][1]); - } - } -} - -static inline void mc_part(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta, - uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - int x_offset, int y_offset, - qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, - qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, - h264_weight_func *weight_op, h264_biweight_func *weight_avg, - int list0, int list1){ - if((s->use_weight==2 && list0 && list1 - && (s->implicit_weight[ mrs->ref_cache[0][scan8[n]] ][ mrs->ref_cache[1][scan8[n]] ][m->mb_y&1] != 32)) - || s->use_weight==1) - mc_part_weighted(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, - x_offset, y_offset, qpix_put, chroma_put, - weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1); - else - mc_part_std(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, - x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1); -} - -static inline void prefetch_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int list){ - /* fetch pixels for estimated mv 4 macroblocks ahead - * optimized for 64byte cache lines */ - const int refn = mrs->ref_cache[list][scan8[0]]; - - if(refn >= 0){ - const int mx= 
(mrs->mv_cache[list][scan8[0]][0]>>2) + 16*m->mb_x + 8; - const int my= (mrs->mv_cache[list][scan8[0]][1]>>2) + 16*m->mb_y; - uint8_t **src= s->dp_ref_list[list][refn]->data; - int off= mx + (my + (m->mb_x&3)*4)*d->linesize + 64; - - d->dsp.prefetch(src[0]+off, d->linesize, 4); - off= (mx>>1) + ((my>>1) + (m->mb_x&7))*d->uvlinesize + 64; - d->dsp.prefetch(src[1]+off, src[2]-src[1], 2); - } -} - -void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), - qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), - h264_weight_func *weight_op, h264_biweight_func *weight_avg){ - const int mb_type= m->mb_type; - assert(IS_INTER(mb_type)); - - if (mb_type & MB_TYPE_L0) - prefetch_motion(d, mrs, s, m, 0); - if (mb_type & MB_TYPE_L1) - prefetch_motion(d, mrs, s, m, 1); - - if(IS_16X16(mb_type)){ - mc_part(d, mrs, s, m, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, - qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], - weight_op, weight_avg, - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); - }else if(IS_16X8(mb_type)){ - mc_part(d, mrs, s, m, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0, - qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], - &weight_op[1], &weight_avg[1], - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); - mc_part(d, mrs, s, m, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4, - qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], - &weight_op[1], &weight_avg[1], - IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); - }else if(IS_8X16(mb_type)){ - mc_part(d, mrs, s, m, 0, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 0, 0, - qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[2], &weight_avg[2], - IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); - mc_part(d, mrs, s, m, 4, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 4, 0, - qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[2], 
&weight_avg[2], - IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); - }else{ - int i; - - assert(IS_8X8(mb_type)); - - for(i=0; i<4; i++){ - const int sub_mb_type= m->sub_mb_type[i]; - const int n= 4*i; - int x_offset= (i&1)<<2; - int y_offset= (i&2)<<1; - - if(IS_SUB_8X8(sub_mb_type)){ - mc_part(d, mrs, s, m, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], - &weight_op[3], &weight_avg[3], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - }else if(IS_SUB_8X4(sub_mb_type)){ - mc_part(d, mrs, s, m, n, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], - &weight_op[4], &weight_avg[4], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - mc_part(d, mrs, s, m, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, - qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], - &weight_op[4], &weight_avg[4], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - }else if(IS_SUB_4X8(sub_mb_type)){ - mc_part(d, mrs, s, m, n, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, - qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[5], &weight_avg[5], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - mc_part(d, mrs, s, m, n+1, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, - qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[5], &weight_avg[5], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - }else{ - int j; - assert(IS_SUB_4X4(sub_mb_type)); - for(j=0; j<4; j++){ - int sub_x_offset= x_offset + 2*(j&1); - int sub_y_offset= y_offset + (j&2); - mc_part(d, mrs, s, m, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, - qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], - &weight_op[6], &weight_avg[6], - IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); - } - } - } - } -} diff -r 11d15c47beaf -r 897f711a7157 
ffmpeg_smp/h264dec/libavcodec/h264_mc.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_mc.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ -#ifndef H264_MC_H -#define H264_MC_H - -#include "dsputil.h" -#include "h264_types.h" - -void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, - qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), - qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), - h264_weight_func *weight_op, h264_biweight_func *weight_avg); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_misc.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_misc.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,944 +0,0 @@ -#include "config.h" - -#include "h264_types.h" - -#include -#include -#include -#include -#include -#undef NDEBUG -#include - -#if HAVE_LIBSDL2 -#include -#if HAVE_LIBSDL_TTF -#include -#endif -#endif - -void start_timer(H264Context *h, int stage){ - clock_gettime(CLOCK_REALTIME, &h->start_time[stage]); -} - -void stop_timer(H264Context *h, int stage){ - clock_gettime(CLOCK_REALTIME, &h->end_time[stage]); - double time = (double) 1.e3*(h->end_time[stage].tv_sec - h->start_time[stage].tv_sec) + 1.e-6*(h->end_time[stage].tv_nsec - h->start_time[stage].tv_nsec); - h->last_time [stage] = time; - h->total_time[stage] += time; -} - -void init_sb_entry(H264Context *h, SliceBufferEntry *sbe){ - sbe->mbs = av_malloc(h->mb_width*h->mb_height* sizeof(H264Mb)); - sbe->initialized = 1; -} - -void free_sb_entry(SliceBufferEntry *sbe){ - av_free(sbe->mbs); - av_freep(&sbe->gb.raw); - if (sbe->gb.rbsp) - av_freep(&sbe->gb.rbsp); - sbe->initialized = 0; -} - -SliceBufferEntry *get_sb_entry(H264Context *h){ - SliceBufferEntry *sb = NULL; - - pthread_mutex_lock(&h->lock[PARSE]); - while (h->free_sb_cnt<=0) - pthread_cond_wait(&h->cond[PARSE], &h->lock[PARSE]); - /* use 
first free picture */ - for(int i=0; isb_size; i++){ - if(h->sb[i].state==0){ - sb= &h->sb[i]; - sb->state=1; - sb->lines_taken=0; - sb->lines_total=h->mb_height; - break; - } - } - h->free_sb_cnt--; - - pthread_mutex_unlock(&h->lock[PARSE]); - - memset (&sb->slice, 0, sizeof(H264Slice)); - - return sb; -} - -void release_sb_entry(H264Context *h, SliceBufferEntry *sb){ - pthread_mutex_lock(&h->lock[PARSE]); - - sb->state = 0; - h->free_sb_cnt++; - pthread_cond_signal(&h->cond[PARSE]); - - pthread_mutex_unlock(&h->lock[PARSE]); -} - -int init_dpb_entry(H264Context *h, DecodedPicture *pic, H264Slice *s, int width, int height){ - int i; - - s->curr_pic=pic; - pic->poc = s->poc; - pic->key_frame = s->key_frame; - pic->mmco_reset = s->mmco_reset; - pic->reference = s->nal_ref_idc? 3:1; - pic->cpn = s->coded_pic_num; - - if(pic->data[0]==NULL) { - int size[3] = {0}; - - width+= EDGE_WIDTH*2; - height+= EDGE_WIDTH*2; - - pic->linesize[0]= width; - pic->linesize[1]= pic->linesize[2] = width>>1; - - size[0] = width*height; - size[1] = size[2] = width*height>>2; - - for(i=0; i<3; i++){ - pic->base[i]= av_malloc(size[i]); - } - - pic->data[0] = pic->base[0] + (pic->linesize[0]*EDGE_WIDTH) + EDGE_WIDTH; - pic->data[1] = pic->base[1] + (pic->linesize[1]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1); - pic->data[2] = pic->base[2] + (pic->linesize[2]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1); - } - - const int big_mb_num= h->mb_stride*(h->mb_height+1) + 1; //the +1 is needed so memset(,,stride*height) does not sig11 - const int mb_array_size= h->mb_stride*h->mb_height; - const int b4_array_size= h->b4_stride*h->mb_height*4; - - if(pic->mb_type_base==NULL){ - FF_ALLOCZ_OR_GOTO(pic->mb_type_base , big_mb_num * sizeof(uint32_t), fail) - pic->mb_type= pic->mb_type_base + h->mb_stride+1; - - for(int i=0; i<2; i++){ - FF_ALLOCZ_OR_GOTO(pic->motion_val_base[i], 2 * (b4_array_size+4) * sizeof(int16_t), fail) - pic->motion_val[i]= pic->motion_val_base[i]+4; - FF_ALLOCZ_OR_GOTO(pic->ref_index[i], 
4*mb_array_size * sizeof(uint8_t), fail) - } - FF_ALLOCZ_OR_GOTO(pic->intra4x4_pred_mode, h->mb_width*h->mb_height * 4* sizeof(int8_t), fail) - } - - return 0; - fail: - return -1; -} - -void free_dp(DecodedPicture *pic){ - if(pic->base[0]){ - for (int i=0; i<3; i++){ - av_free(pic->base[i]); - pic->data[i]= NULL; - } - } - if (pic->mb_type_base){ - av_free(pic->mb_type_base); - pic->mb_type= NULL; - for(int i=0; i<2; i++){ - av_free(pic->motion_val_base[i]); - av_free(pic->ref_index[i]); - } - av_free(pic->intra4x4_pred_mode); - } -} - -DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s){ - DecodedPicture *dp = NULL; - - pthread_mutex_lock(&h->lock[REORDER2]); - while (h->free_dpb_cnt<=0){ - #if OMPSS - assert(0); - #endif - pthread_cond_wait(&h->cond[REORDER2], &h->lock[REORDER2]); - } - /* use first free picture */ - for(int i=0; imax_dpb_cnt; i++){ - if(h->dpb[i].reference==0){ - dp= &h->dpb[i]; - break; - } - } - assert(dp); - init_dpb_entry(h, dp, s, h->width, h->height); - h->free_dpb_cnt--; - h->acdpb_cnt++; //debug - pthread_mutex_unlock(&h->lock[REORDER2]); - - return dp; -} - -void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode){ - pthread_mutex_lock(&h->lock[REORDER2]); - pic->reference &= ~mode; - if (pic->reference == 0){ - h->free_dpb_cnt++; - h->reldpb_cnt++; //debug - pthread_cond_signal(&h->cond[REORDER2]); - } - pthread_mutex_unlock(&h->lock[REORDER2]); -} - - -/** -* Extends the edges of a macroblock line. 
-*/ -void draw_edges(MBRecContext *d, H264Slice *s, int line){ - int i; - int mb_width=d->mb_width; - int mb_height=d->mb_height; - int last = (line+1 == mb_height); - int lines = last?16:12; - int linesize = d->linesize; - int uvlinesize = d->uvlinesize; - uint8_t *y = s->curr_pic->data[0] + 16*line*linesize; - uint8_t *cb = s->curr_pic->data[1] + 8*line*uvlinesize; - uint8_t *cr = s->curr_pic->data[2] + 8*line*uvlinesize; - - for (i=-4; idelayed_pic[0]; - - if (!out) - return NULL; - - for(i=1; w->delayed_pic[i] && !w->delayed_pic[i]->key_frame && !w->delayed_pic[i]->mmco_reset; i++){ - if(w->delayed_pic[i]->poc < out->poc){ - out = w->delayed_pic[i]; - out_idx = i; - } - } - - if(w->dp_cnt > MAX_DELAYED_PIC_COUNT || flush) { - for(i=out_idx; w->delayed_pic[i]; i++) - w->delayed_pic[i] = w->delayed_pic[i+1]; - w->dp_cnt--; - return out; - } - return NULL; -} - -/** -* Remove the extra borders, and places the three parts of the image after each other. -*/ -static int raw_encode(const DecodedPicture* src, int width, int height, unsigned char *dest) { - int i, j; -/** To write entire image including extra borders*/ -// int w = src->linesize[0]; -// int h = height+64; -// int w2 = w>>1; -// int h2 = h>>1; -// int data_planes=3; -// int size = w * h + 2 *w2*h2; -// const unsigned char* s; -// for (i=0; ibase[i]; -// for(j=0; jlinesize[i]); -// dest += w; -// s += src->linesize[i]; -// } -// } - - int w = (width*8 + 7)/8; - int h = height; - int w2 =((width >>1) * 8 + 7) / 8; - int h2 = ((height+1) >>1); //not sure about +1 - int data_planes=3; - int size = w * h + 2 *w2*h2; - const unsigned char* s; - - - for (i=0; idata[i]; - for(j=0; jlinesize[i]; - } - } - return size; -} - -#ifdef HAVE_LIBSDL2 -static SDL_Texture *get_next_texture(H264Context *h, int side){ - SDLTextureQueue *sdlq = &h->sdlq; - SDL_Texture *texture; - pthread_mutex_lock (&sdlq->sdl_lock); - if (side ){ //send - while (sdlq->ready >= sdlq->size) - pthread_cond_wait(&sdlq->sdl_cond, 
&sdlq->sdl_lock); - texture = sdlq->queue[sdlq->fi]; - sdlq->fi++; sdlq->fi %= sdlq->size; - } else { //recv - while (sdlq->ready <= 0 && !sdlq->exit) - pthread_cond_wait(&sdlq->sdl_cond, &sdlq->sdl_lock); - - if (sdlq->ready == 0 && sdlq->exit){ - texture = NULL; - }else{ - texture = sdlq->queue[sdlq->fo]; - sdlq->fo++; sdlq->fo %= sdlq->size; - } - } - pthread_mutex_unlock(&sdlq->sdl_lock); - - return texture; -} - -static void signal_texture(H264Context *h, int side){ - SDLTextureQueue *sdlq = &h->sdlq; - pthread_mutex_lock (&sdlq->sdl_lock); - if (side) - sdlq->ready++; - else - sdlq->ready--; - pthread_cond_signal(&sdlq->sdl_cond); - pthread_mutex_unlock(&sdlq->sdl_lock); -} - -void signal_sdl_exit(H264Context *h){ - SDLTextureQueue *sdlq = &h->sdlq; - pthread_mutex_lock (&sdlq->sdl_lock); - sdlq->exit=1; - pthread_cond_signal(&sdlq->sdl_cond); - pthread_mutex_unlock(&sdlq->sdl_lock); -} - -static void display_frame(H264Context *h, OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height, int dropable){ - static int64_t last_time = -1; - int64_t cur_time; -// SDLContext *sdlc = h->sdlc; - uint8_t *iyuv_pixels; - int pitch; - - - if (last_time == -1){ - last_time = av_gettime(); - } - - - /* do not display frames that are less than 8.125 ms apart (120fps)*/ - if (dropable){ - cur_time = av_gettime(); - - if ((cur_time - last_time) < 8125) - return; - - last_time =cur_time; - } - - if(in_picture){ - - SDL_Texture *texture= get_next_texture(h, 1); - - SDL_LockTexture( texture, NULL, (void **)&iyuv_pixels, &pitch ); - - raw_encode(in_picture, frame_width, frame_height, iyuv_pixels); - - signal_texture(h, 1); - } -} -#endif - -// TODO: Parallelize the raw_encode (either split frame or over frames) -static void do_video_out(OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height) { - int size=0; - //remove extra borders - - if(in_picture) - size= raw_encode(in_picture, frame_width, frame_height, 
w->bit_buffer); - - if (size < 0) { - fprintf(stderr, "Video encoding failed\n"); - }else { - if (write(fd, w->bit_buffer, size)<0) - fprintf(stderr, "Write frame failed\n"); - } - - w->video_size += size; -} - -DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height) { - DecodedPicture *out; - - if (pic){ - oc->delayed_pic[oc->dp_cnt++]=pic; - out = get_reordered_picture(oc, 0); - }else{ - out = get_reordered_picture(oc, 1); - } - - if (out){ - if (fd){ - do_video_out(oc, fd, out, frame_width, frame_height); - }else{ -#ifdef HAVE_LIBSDL2 - if (h->display){ - display_frame(h, oc, fd, out, frame_width, frame_height, !(pic==NULL)); - } -#endif - } - oc->frame_number++; - } - - return out; -} - -OutputContext *get_output_context(H264Context *h){ - const int frame_width=h->frame_width; - const int frame_height=h->frame_height; - const int frame_size = frame_width*frame_height; - - OutputContext *oc = av_mallocz(sizeof(OutputContext)); - oc->bit_buffer_size= FFMAX(1024*256, frame_size*2); // oversize a little bit to allow extra border write - oc->bit_buffer= av_mallocz(oc->bit_buffer_size); - - return oc; -} - -void free_output_context(OutputContext *oc){ - - av_free(oc->bit_buffer); - av_free(oc); -} - -SuperMBContext *getSuperMBContext(H264Context *h, int smb_width, int smb_height){ - SuperMBContext *smbc = av_mallocz(sizeof(SuperMBContext)); - - smbc->smb_width = smb_width; - smbc->smb_height = smb_height; - - smbc->nsmb_height = h->mb_height / smbc->smb_height + (h->mb_height%smbc->smb_height ? 
1:0); //only need one extra if mb_height was not dividable - smbc->nsmb_width = h->mb_width / smbc->smb_width; - while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width ) - smbc->nsmb_width++; - - smbc->nsmb_3dheight= smbc->nsmb_height - ((h->mb_height/2)/smbc->smb_height +1); //assuming max motion vector of half the height - - smbc->smbs[0] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask)); - smbc->smbs[1] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask)); - for (int y=0, i=0; insmb_height; i++, y+=smbc->smb_height){ - for (int x=0, j=0; jnsmb_width; j++, x+=smbc->smb_width){ - smbc->smbs[0][i*smbc->nsmb_width +j].smb_y = y; - smbc->smbs[0][i*smbc->nsmb_width +j].smb_x = x; - smbc->smbs[1][i*smbc->nsmb_width +j].smb_y = y; - smbc->smbs[1][i*smbc->nsmb_width +j].smb_x = x; - } - } - - smbc->refcount = 1; - - return smbc; -} - -void freeSuperMBContext(SuperMBContext *smbc){ - av_free(smbc->smbs[0]); - av_free(smbc->smbs[1]); - av_free(smbc); -} - -SuperMBContext * acquire_smbc(H264Context *h ){ - SuperMBContext *smbc; - - pthread_mutex_lock (&h->smb_lock); - smbc = h->smbc; - smbc->refcount++; - pthread_mutex_unlock(&h->smb_lock); - return smbc; -} - -void release_smbc(H264Context *h, SuperMBContext *smbc){ - pthread_mutex_lock (&h->smb_lock); - smbc->refcount--; - if (smbc->refcount==0){ - freeSuperMBContext(smbc); - } - pthread_mutex_unlock(&h->smb_lock); - -} - - -#ifdef HAVE_LIBSDL2 - -// #if OMPSS -static void draw_sb_border(H264Context *h, uint32_t *rgba_pixels, int smb_x, int smb_y){ - int mb_width = h->mb_width; - int mb_height = h->mb_height; - int width = h->frame_width; - int height = h->frame_height; - - int mb_x = smb_x * h->smb_width; - int mb_y = smb_y * h->smb_height; - - uint32_t pix= 0x0000FFC0; - - for (int k=0, i=mb_y; i< mb_y + h->smb_height; i++, k++){ - for (int l=0, j=mb_x -k ; j< mb_x - k + h->smb_width; j++, l++){ - //outside frame - if (i<0 || i>=mb_height || j<0 || 
j>=mb_width) { - continue; - } - - //draw top - if (i==0 || k==0 || l==0){ - int mx = j*16; - int my = i*16; - uint32_t *top = rgba_pixels + my*width + mx; - int endx = mx+16 < width? 16: width-mx; - - for (int x = 0; xsmb_height-1 || l==h->smb_width-1){ - int mx = j*16; - int my = i*16 + 15; my = my < height ? my: height-1; - uint32_t *bottom = rgba_pixels + my*width + mx; - int endx = mx+16 < width? 16: width-mx; - - for (int x = 0; xsmb_width-1 ){ - int mx = j*16 + 15; mx = mx < width ? mx: width-1; - int my = i*16; - uint32_t *right = rgba_pixels + my*width + mx; - int endy = my +16 < height ? 16: height - my; - - for (int y = 0; ysbmap_texture; - - SDL_LockTexture( sbmap, NULL, (void **)&rgba_pixels, &pitch ); - - memset (rgba_pixels, 0, pitch * h->height); - for (int i=0; i< smbc->nsmb_height; i++){ - for (int j=0; j< smbc->nsmb_width; j++){ - draw_sb_border(h, rgba_pixels, j, i); - } - } - - SDL_UnlockTexture( sbmap ); -} -// #endif - -// static void calc_sb_sizes (H264Context *h, SuperMBContext *smbc){ -// smbc->smb_height = h->smb_height; -// smbc->smb_width = h->smb_width; -// -// smbc->nsmb_height = h->mb_height / smbc->smb_height + (h->mb_height%smbc->smb_height ? 
1:0); //only need one extra if mb_height was not dividable -// smbc->nsmb_width = h->mb_width / smbc->smb_width; -// while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width ) -// smbc->nsmb_width++; -// } - - -static void handle_key_event(H264Context *h, SDLContext *sdlc, SDL_Keysym keysym){ - int arrow=0; - - switch (keysym.sym){ - case SDLK_ESCAPE: - if (sdlc->fullscreen){ - SDL_SetWindowFullscreen(sdlc->window, SDL_FALSE); - sdlc->fullscreen = 0; - } - break; - case SDLK_SPACE: - pthread_mutex_lock(&h->sdl_lock); - sdlc->pause = !sdlc->pause; - pthread_cond_signal(&h->sdl_cond); - pthread_mutex_unlock(&h->sdl_lock); - break; - case SDLK_f: - if (!sdlc->fullscreen){ - if (keysym.mod == KMOD_LCTRL){ -// SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full); - SDL_SetWindowFullscreen(sdlc->window, SDL_TRUE); - - sdlc->fullscreen = 1; - } - } - break; - case SDLK_m: - sdlc->showmap = !sdlc->showmap; - break; - case SDLK_UP: - if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height < h->mb_height && h->smb_height < h->smb_width){ - h->smb_height++; - arrow =1; - } - break; - case SDLK_DOWN: - if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height > 1 ){ - h->smb_height--; - arrow =1; - } - break; - case SDLK_LEFT: - if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width > 1 && h->smb_width > h->smb_height){ - h->smb_width--; - arrow =1; - } - break; - case SDLK_RIGHT: - if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width < h->mb_width){ - h->smb_width++; - arrow =1; - } - break; - } - - if (arrow){ - SuperMBContext *smbc = getSuperMBContext(h, h->smb_width, h->smb_height); - pthread_mutex_lock(&h->smb_lock); - h->smbc->refcount--; - if (h->smbc->refcount == 0) - freeSuperMBContext(h->smbc); - h->smbc = smbc; - sdlc->updatemap =1; - pthread_mutex_unlock(&h->smb_lock); - } -} - -void handle_window_event(H264Context *h, SDLContext *sdlc, SDL_WindowEvent winevent){ - SDL_Rect nrect; - switch (winevent.event){ - case 
SDL_WINDOWEVENT_RESIZED: - - sdlc->win_w = winevent.data1; - sdlc->win_h = winevent.data2; - - double aspect = (double) sdlc->win_w/ sdlc->win_h; - if ( aspect < sdlc->aspect){ - double r = (double) sdlc->win_w / sdlc->rect.w; - double h = (double) sdlc->rect.h * r; - - nrect.y = lrint(( (double) sdlc->win_h - h)/2); - nrect.h = lrint(h); - - nrect.x=0; - nrect.w= sdlc->win_w; - - }else { - double r = (double) sdlc->win_h / sdlc->rect.h; - double w = (double) sdlc->rect.w * r; - - nrect.x = lrint(( (double) sdlc->win_w - w)/2); - nrect.w = lrint(w); - - nrect.y=0; - nrect.h= sdlc->win_h; - } - //prob better to lock - sdlc->win_rect = nrect; - sdlc->resized=1; - break; - } -} - -void *sdl_event_listen_thread(void *arg){ - H264Context *h = (H264Context *) arg; - SDLContext *sdlc = h->sdlc; - SDL_Event event; - - while ( SDL_WaitEvent(&event) ) { - switch (event.type) { - case SDL_KEYDOWN: - handle_key_event(h, sdlc, event.key.keysym); - break; - case SDL_WINDOWEVENT: - handle_window_event(h, sdlc, event.window); - break; - case SDL_QUIT: - h->quit=1; - goto finish; - } - } -finish: - pthread_exit(NULL); - return NULL; -} - -//XInitThreads not called in SDL2 library, causes crash -//remove in future when fixed ... 
-#include - -SDLContext *get_SDL_context(H264Context *h){ - const int frame_width=h->frame_width; - const int frame_height=h->frame_height; - - SDLContext *sdlc = av_mallocz(sizeof(SDLContext)); - sdlc->display = h->display; - sdlc->fullscreen = h->fullscreen; - - sdlc->aspect = (double) frame_width / (double) frame_height; - sdlc->rect.x =0; - sdlc->rect.y =0; - sdlc->rect.w =frame_width; - sdlc->rect.h =frame_height; - - XInitThreads(); //workaround - - // Initializes the video subsystem - if (SDL_Init(SDL_INIT_VIDEO) < 0) { - fprintf(stderr, "Unable to init SDL: %s\n", SDL_GetError()); - #undef exit - exit(-1); - } - SDL_SetHint("SDL_HINT_RENDER_SCALE_QUALITY", "best"); - SDL_SetHint("SDL_HINT_RENDER_OPENGL_SHADERS", "1"); - - SDL_GetDesktopDisplayMode(0, &sdlc->full); - sdlc->full.format = SDL_PIXELFORMAT_IYUV; - - sdlc->wind = sdlc->full; - if (sdlc->wind.w > frame_width) sdlc->wind.w = frame_width; - if (sdlc->wind.h > frame_height) sdlc->wind.h = frame_height; - - sdlc->win_rect.x =0; - sdlc->win_rect.y =0; - sdlc->win_rect.w =sdlc->wind.w; - sdlc->win_rect.h =sdlc->wind.h; - - if (sdlc->fullscreen){ - sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, sdlc->full.w, sdlc->full.h, SDL_WINDOW_FULLSCREEN|SDL_WINDOW_SHOWN|SDL_WINDOW_RESIZABLE); - SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full); - } else { - sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, sdlc->wind.w, sdlc->wind.h, SDL_WINDOW_RESIZABLE|SDL_WINDOW_SHOWN); - SDL_SetWindowDisplayMode (sdlc->window, &sdlc->wind); - } - - sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_ACCELERATED); -// sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_SOFTWARE); - - h->sdlq.queue[0] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height); - h->sdlq.queue[1] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, 
SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height); - - sdlc->sbmap_texture = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_RGBA8888, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height); - SDL_SetTextureBlendMode(sdlc->sbmap_texture, SDL_BLENDMODE_BLEND); - sdlc->updatemap = 1; - -#if HAVE_LIBSDL_TTF - //not working with SDL 2.0, try again in future when supported - if(TTF_Init()==-1) { - printf("TTF_Init: %s\n", TTF_GetError()); - exit(2); - } - - // Load a font - TTF_Font *font; - font = TTF_OpenFont("/usr/share/fonts/truetype/freefont/FreeSans.ttf", 24); - if (font == NULL) - { - printf("TTF_OpenFont() Failed: %s\n", TTF_GetError()); - TTF_Quit(); - exit(1); - } -#endif - - pthread_create(&sdlc->listen_thread, NULL, sdl_event_listen_thread, h); - - return sdlc; - -} - -void free_SDL_context(H264Context *h){ - SDLContext *sdlc = h->sdlc; - pthread_join(sdlc->listen_thread, NULL); - -#if HAVE_LIBSDL_TTF - TTF_Quit(); -#endif - SDL_DestroyTexture(h->sdlq.queue[0]); - SDL_DestroyTexture(h->sdlq.queue[1]); - SDL_DestroyTexture(sdlc->sbmap_texture); - SDL_DestroyRenderer(sdlc->renderer); - SDL_DestroyWindow(sdlc->window); - SDL_Quit(); - -} - -void *sdl_thread(void *arg){ - H264Context *h = (H264Context *) arg; - - SDLContext *sdlc = get_SDL_context(h); - h->sdlc = sdlc; - - signal_texture(h, 0); - signal_texture(h, 0); - - SDL_Texture *texture; - for (;;){ - pthread_mutex_lock(&h->sdl_lock); - while (sdlc->pause){ - pthread_cond_wait(&h->sdl_cond, &h->sdl_lock); - } - pthread_mutex_unlock(&h->sdl_lock); - - texture = get_next_texture(h, 0); - if (texture == NULL) - break; - - SDL_UnlockTexture(texture); - - //clear if resized - if (sdlc->resized){ - // KDE bug prob, reset viewport change after resize from max - SDL_RenderSetViewport(sdlc->renderer, NULL); - SDL_SetRenderDrawColor(sdlc->renderer, 0, 0, 0, 255); - SDL_RenderClear(sdlc->renderer); - sdlc->resized = 0; - } - - SDL_RenderCopy(sdlc->renderer, texture, &sdlc->rect, &sdlc->win_rect); - - if 
(sdlc->showmap){ - if (sdlc->updatemap){ - SuperMBContext *smbc; - pthread_mutex_lock (&h->smb_lock); - smbc = h->smbc; - smbc->refcount++; - sdlc->updatemap=0; - pthread_mutex_unlock(&h->smb_lock); - - draw_sbmap(h, smbc, sdlc); - - release_smbc(h, smbc); - } - SDL_RenderCopy(sdlc->renderer, sdlc->sbmap_texture, &sdlc->rect, &sdlc->win_rect); - } - - SDL_RenderPresent(sdlc->renderer); - signal_texture(h, 0); - } - - free_SDL_context(h); - - pthread_exit(NULL); - return NULL; -} -#endif - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_misc.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_misc.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ -#ifndef H264_MISC_H -#define H264_MISC_H - -#include "avcodec.h" -#include "h264_types.h" - -void start_timer(H264Context *h, int stage); -void stop_timer(H264Context *h, int stage); - -void init_sb_entry(H264Context *h, SliceBufferEntry *sbe); -void free_sb_entry(SliceBufferEntry *sb); -SliceBufferEntry *get_sb_entry(H264Context *h); -void release_sb_entry(H264Context *h, SliceBufferEntry *sb); - -DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s); -void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode); - -void draw_edges(MBRecContext *d, H264Slice *s, int line); - -int ff_init_slice(NalContext *n, H264Slice *s); -void free_picture(PictureInfo *pic); -void free_dp(DecodedPicture *pic); - -void av_start_timer(); -int copyEDtoH264Slice(H264Slice *ms, H264Slice *es); -void print_report(int frame_number, uint64_t video_size, int is_last_report, int verbose); - -int ff_alloc_picture_info(NalContext *n, H264Slice *s, PictureInfo *pic); -DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height); -OutputContext *get_output_context(H264Context *h); -void free_output_context(OutputContext *oc); - -void freeSuperMBContext(SuperMBContext *smbc); -SuperMBContext 
*getSuperMBContext(H264Context *h, int smb_width, int smb_height); -void release_smbc(H264Context *h, SuperMBContext *smbc); -SuperMBContext * acquire_smbc(H264Context *h ); - -#if HAVE_LIBSDL2 -void signal_sdl_exit(H264Context *h); -void *sdl_thread(void *arg); -SDLContext *get_SDL_context(H264Context *h); -void free_SDL_context(SDLContext *sdlc); -#endif - -/** -* gets the chroma qp. -*/ -static inline int get_chroma_qp(H264Slice *s, int t, int qscale){ - return s->pps.chroma_qp_table[t][qscale]; -} - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_nal.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_nal.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,628 +0,0 @@ -#include "h264_types.h" -#include "h264_data.h" - -#include "golomb.h" -#include "h264_sei.h" -#include "h264_refs.h" -#include "h264_ps.h" -#include "h264_pred_mode.h" -#include "h264_misc.h" - -static int ff_h264_decode_rbsp_trailing(const uint8_t *src){ - int v= *src; - int r; - - for(r=1; r<9; r++){ - if(v&1) return r; - v>>=1; - } - return 0; -} - -static int pred_weight_table(H264Slice *s, GetBitContext *gb){ - int luma_def, chroma_def; - - s->use_weight= 0; - s->use_weight_chroma= 0; - s->luma_log2_weight_denom= get_ue_golomb(gb); - s->chroma_log2_weight_denom= get_ue_golomb(gb); - luma_def = 1<luma_log2_weight_denom; - chroma_def = 1<chroma_log2_weight_denom; - - for(int list=0; list<2; list++){ - for(int i=0; iref_count[list]; i++){ - int luma_weight_flag, chroma_weight_flag; - - luma_weight_flag= get_bits1(gb); - if(luma_weight_flag){ - s->luma_weight[i][list][0]= get_se_golomb(gb); - s->luma_weight[i][list][1]= get_se_golomb(gb); - if( s->luma_weight[i][list][0] != luma_def - || s->luma_weight[i][list][1] != 0) { - s->use_weight= 1; - } - }else{ - s->luma_weight[i][list][0]= luma_def; - s->luma_weight[i][list][1]= 0; - } - - chroma_weight_flag= get_bits1(gb); - if(chroma_weight_flag){ - int j; - for(j=0; j<2; j++){ - 
s->chroma_weight[i][list][j][0]= get_se_golomb(gb); - s->chroma_weight[i][list][j][1]= get_se_golomb(gb); - if( s->chroma_weight[i][list][j][0] != chroma_def - || s->chroma_weight[i][list][j][1] != 0) { - s->use_weight_chroma= 1; - } - } - }else{ - int j; - for(j=0; j<2; j++){ - s->chroma_weight[i][list][j][0]= chroma_def; - s->chroma_weight[i][list][j][1]= 0; - } - } - } - if(s->slice_type_nos != FF_B_TYPE) break; - } - s->use_weight= s->use_weight || s->use_weight_chroma; - return 0; -} - -/** -* Initialize implicit_weight table. -*/ -static void implicit_weight_table(H264Slice *s){ - int ref0, ref1, cur_poc, ref_start, ref_count0, ref_count1; - - cur_poc = s->poc; - if( s->ref_count[0] == 1 && s->ref_count[1] == 1 && s->ref_list[0][0]->poc + s->ref_list[1][0]->poc == 2*cur_poc){ - s->use_weight= 0; - s->use_weight_chroma= 0; - return; - } - ref_start= 0; - ref_count0= s->ref_count[0]; - ref_count1= s->ref_count[1]; - - s->use_weight= 2; - s->use_weight_chroma= 2; - s->luma_log2_weight_denom= 5; - s->chroma_log2_weight_denom= 5; - - for(ref0=ref_start; ref0 < ref_count0; ref0++){ - int poc0 = s->ref_list[0][ref0]->poc; - for(ref1=ref_start; ref1 < ref_count1; ref1++){ - int poc1 = s->ref_list[1][ref1]->poc; - int td = av_clip(poc1 - poc0, -128, 127); - int w= 32; - if(td){ - int tb = av_clip(cur_poc - poc0, -128, 127); - int tx = (16384 + (FFABS(td) >> 1)) / td; - int dist_scale_factor = (tb*tx + 32) >> 8; - if(dist_scale_factor >= -64 && dist_scale_factor <= 128) - w = 64 - dist_scale_factor; - } - s->implicit_weight[ref0][ref1][0]= - s->implicit_weight[ref0][ref1][1]= w; - } - } -} - -/** -* instantaneous decoder refresh. 
-*/ -static void idr(NalContext *n, H264Slice *s){ - ff_h264_remove_all_refs(n, s); - n->prev_frame_num= 0; - n->prev_frame_num_offset= 0; - n->poc_offset += (n->prev_poc_msb<<16) + n->prev_poc_lsb; - n->prev_poc_msb= - n->prev_poc_lsb= 0; -} - -static int init_poc(NalContext *n, H264Slice *s, GetBitContext *gb){ - const int max_frame_num= 1<sps.log2_max_frame_num; - int frame_poc; - - if(n->sps.poc_type==0){ - n->poc_lsb= get_bits(gb, n->sps.log2_max_poc_lsb); - } - - if(n->sps.poc_type==1 && !n->sps.delta_pic_order_always_zero_flag){ - n->delta_poc= get_se_golomb(gb); - } - - n->frame_num_offset= n->prev_frame_num_offset; - if(n->frame_num < n->prev_frame_num) - n->frame_num_offset += max_frame_num; - - if(n->sps.poc_type==0){ - const int max_poc_lsb= 1<sps.log2_max_poc_lsb; - - if(n->poc_lsb < n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb >= max_poc_lsb/2) - n->poc_msb = n->prev_poc_msb + max_poc_lsb; - else if(n->poc_lsb > n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb < -max_poc_lsb/2) - n->poc_msb = n->prev_poc_msb - max_poc_lsb; - else - n->poc_msb = n->prev_poc_msb; - - frame_poc = n->poc_msb + n->poc_lsb; - }else if(n->sps.poc_type==1){ - int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc; - int i; - - if(n->sps.poc_cycle_length != 0) - abs_frame_num = n->frame_num_offset + n->frame_num; - else - abs_frame_num = 0; - - if(s->nal_ref_idc==0 && abs_frame_num > 0) - abs_frame_num--; - - expected_delta_per_poc_cycle = 0; - for(i=0; i < n->sps.poc_cycle_length; i++) - expected_delta_per_poc_cycle += n->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse - - if(abs_frame_num > 0){ - int poc_cycle_cnt = (abs_frame_num - 1) / n->sps.poc_cycle_length; - int frame_num_in_poc_cycle = (abs_frame_num - 1) % n->sps.poc_cycle_length; - - expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle; - for(i = 0; i <= frame_num_in_poc_cycle; i++) - expectedpoc = expectedpoc + n->sps.offset_for_ref_frame[ i ]; - } else - expectedpoc = 0; - 
if(s->nal_ref_idc == 0) - expectedpoc = expectedpoc + n->sps.offset_for_non_ref_pic; - frame_poc = expectedpoc + n->delta_poc; - }else{ - int poc= 2*(n->frame_num_offset + n->frame_num); - if(!s->nal_ref_idc) - poc--; - frame_poc= poc; - } - s->current_picture_info->poc= s->poc = frame_poc + n->poc_offset; - s->coded_pic_num = n->coded_pic_num++; - - return 0; -} - -static void ref2frame(NalContext *n, H264Slice *s){ - for(int j=0; jlist_count; j++){ - int *ref2frm= s->ref2frm[j]; - - ref2frm[0]= - ref2frm[1]= -1; - - for(int i=0; iref_count[j]; i++){ - ref2frm[i+2]= 15; - if(s->ref_list[j][i]->cpn >=0){ - int k; - for(k=0; kshort_ref_count; k++){ - if(n->short_ref[k]->cpn == s->ref_list[j][i]->cpn){ - ref2frm[i+2]= k; - break; - } - } - } - } - } -} - -/** -* decodes a slice header. -* This will also call MPV_common_init() and frame_start() as needed. -* -* @param h h264context -* @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding) -* -* @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded -*/ -static int decode_slice_header(NalContext *n, H264Slice *s, GetBitContext *gb){ - unsigned int first_mb_in_slice; - unsigned int pps_id; - int num_ref_idx_active_override_flag; - unsigned int slice_type, tmp; - - first_mb_in_slice= get_ue_golomb(gb); - (void) first_mb_in_slice; - - slice_type= get_ue_golomb_31(gb); - if(slice_type > 9){ - av_log(AV_LOG_ERROR, "slice type too large (%d)\n", s->slice_type); - return -1; - } - if(slice_type > 4) - slice_type -= 5; - - slice_type= golomb_to_pict_type[ slice_type ]; - - s->slice_type= slice_type; - s->slice_type_nos= slice_type & 3; - s->current_picture_info->slice_type_nos = s->slice_type_nos; - s->current_picture_info->reference= s->nal_ref_idc? 
2:0; - s->key_frame = s->slice_type == FF_I_TYPE; - - pps_id= get_ue_golomb(gb); - - if(pps_id>=MAX_PPS_COUNT){ - av_log(AV_LOG_ERROR, "pps_id out of range\n"); - return -1; - } - if(!n->pps_buffers[pps_id]) { - av_log(AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id); - return -1; - } - s->pps= *n->pps_buffers[pps_id]; - - if(!n->sps_buffers[s->pps.sps_id]) { - av_log(AV_LOG_ERROR, "non-existing SPS %u referenced\n", s->pps.sps_id); - return -1; - } - n->sps = *n->sps_buffers[s->pps.sps_id]; - - n->mb_width= n->sps.mb_width; - n->mb_height= n->sps.mb_height; - - int chroma444 = (n->sps.chroma_format_idc == 3); - n->width = 16*n->mb_width - (2>>chroma444)*FFMIN(n->sps.crop_right, (8<sps.frame_mbs_only_flag) - n->height= 16*n->mb_height - (2>>chroma444)*FFMIN(n->sps.crop_bottom, (8<height= 16*n->mb_height - (4>>chroma444)*FFMIN(n->sps.crop_bottom, (8<direct_8x8_inference_flag = n->sps.direct_8x8_inference_flag; - s->transform_bypass = n->sps.transform_bypass; - - n->frame_num= get_bits(gb, n->sps.log2_max_frame_num); - if(n->frame_num != n->prev_frame_num && n->frame_num != (n->prev_frame_num+1)%(1<sps.log2_max_frame_num)){ - av_log(AV_LOG_ERROR, "unexpected frame_num \n"); - } - - s->current_picture_info->frame_num= n->frame_num; //FIXME frame_num cleanup - n->max_pic_num= 1<< n->sps.log2_max_frame_num; - - if(s->nal_unit_type == NAL_IDR_SLICE){ - get_ue_golomb(gb); /* idr_pic_id */ - } - - init_poc(n, s, gb); - - if(s->pps.redundant_pic_cnt_present){ - n->redundant_pic_count= get_ue_golomb(gb); - } - - //set defaults, might be overridden a few lines later - s->ref_count[0]= s->pps.ref_count[0]; - s->ref_count[1]= s->pps.ref_count[1]; - - if(s->slice_type_nos != FF_I_TYPE){ - if(s->slice_type_nos == FF_B_TYPE){ - s->direct_spatial_mv_pred= get_bits1(gb); - } - num_ref_idx_active_override_flag= get_bits1(gb); - - if(num_ref_idx_active_override_flag){ - s->ref_count[0]= get_ue_golomb(gb) + 1; - if(s->slice_type_nos==FF_B_TYPE) - s->ref_count[1]= 
get_ue_golomb(gb) + 1; - - if(s->ref_count[0]-1 > 32-1 || s->ref_count[1]-1 > 32-1){ - av_log(AV_LOG_ERROR, "reference overflow\n"); - s->ref_count[0]= s->ref_count[1]= 1; - return -1; - } - } - if(s->slice_type_nos == FF_B_TYPE) - s->list_count= 2; - else - s->list_count= 1; - }else - s->list_count= 0; - - - if(s->slice_type_nos!=FF_I_TYPE){ - ff_h264_fill_default_ref_list(n, s); - ff_h264_decode_ref_pic_list_reordering(n, s, gb); - ref2frame(n, s); - - for(int i=0; i<2; i++){ - for(int j=0; jref_count[i]; j++){ - if (s->ref_list[i][j]==NULL || s->ref_list[i][j]->reference < 2) // Don't know why sometimes the ref_count=1 while there are no references - s->ref_list_cpn[i][j] = -1; - else - s->ref_list_cpn[i][j] = s->ref_list[i][j]->cpn; - } - } - } - - if( (s->pps.weighted_pred && s->slice_type_nos == FF_P_TYPE ) - || (s->pps.weighted_bipred_idc==1 && s->slice_type_nos== FF_B_TYPE ) ){ - pred_weight_table(s, gb); - } - else if(s->pps.weighted_bipred_idc==2 && s->slice_type_nos== FF_B_TYPE){ - implicit_weight_table( s); - }else { - s->use_weight = 0; - } - - if(s->nal_ref_idc){ - ff_h264_ref_pic_marking(n, s, gb); - n->prev_poc_msb= n->poc_msb; - n->prev_poc_lsb= n->poc_lsb; - } - - n->prev_frame_num_offset= n->frame_num_offset; - n->prev_frame_num= n->frame_num; - - if(s->slice_type_nos != FF_B_TYPE){ - s->ip_id= n->ip_id++; - } - - if(s->slice_type_nos==FF_B_TYPE && !s->direct_spatial_mv_pred){ - ff_h264_direct_dist_scale_factor(s); - } - ff_h264_direct_ref_list_init(s); - - - if( s->slice_type_nos != FF_I_TYPE && s->pps.cabac ){ - tmp = get_ue_golomb_31(gb); - if(tmp > 2){ - av_log(AV_LOG_ERROR, "cabac_init_idc overflow\n"); - return -1; - } - s->cabac_init_idc= tmp; - } - - tmp = s->pps.init_qp + get_se_golomb(gb); - if(tmp>51){ - av_log(AV_LOG_ERROR, "QP %u out of range\n", tmp); - return -1; - } - s->qscale= tmp; - - //FIXME qscale / qp ... 
stuff - if(s->slice_type == FF_SP_TYPE){ - get_bits1(gb); /* sp_for_switch_flag */ - } - if(s->slice_type==FF_SP_TYPE || s->slice_type == FF_SI_TYPE){ - get_se_golomb(gb); /* slice_qs_delta */ - } - - s->slice_alpha_c0_offset = 52; - s->slice_beta_offset = 52; - if( s->pps.deblocking_filter_parameters_present ) { - tmp= get_ue_golomb_31(gb); - if(tmp > 1){ - av_log(AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp); - return -1; - } - - if(tmp < 2) - tmp^= 1; // 1<->0 - - if( tmp ) { - s->slice_alpha_c0_offset += get_se_golomb(gb) << 1; - s->slice_beta_offset += get_se_golomb(gb) << 1; - if( (unsigned) s->slice_alpha_c0_offset > 104U - ||(unsigned) s->slice_beta_offset > 104U){ - av_log(AV_LOG_ERROR, "deblocking filter parameters %d %d out of range\n", s->slice_alpha_c0_offset, s->slice_beta_offset); - return -1; - } - } - } - - s->qp_thresh= 15 + 52 - FFMIN(s->slice_alpha_c0_offset, s->slice_beta_offset) - FFMAX3(0, s->pps.chroma_qp_index_offset[0], s->pps.chroma_qp_index_offset[1]); - - return 0; -} - -PictureInfo *get_pib_entry(NalContext *nc, int coded_pic_num){ - PictureInfo *pic = NULL; - - for(int i=0; ipicture[i].reference==0){ - pic= &nc->picture[i]; - break; - } - } - pic->cpn = coded_pic_num; - - return pic; -} - -int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb1){ - GetBitContext *gb = gb1; - uint8_t *buf = gb1->raw; - int buf_size = gb1->buf_size; - int next_avc = buf_size; - int buf_index=0; - uint8_t *dst=NULL; -// gb->raw = gb1->raw; -// gb->rbsp = NULL; - s->release_cnt=0; - ff_h264_reset_sei(n); - - s->current_picture_info = get_pib_entry(n, n->coded_pic_num); - - for(;;){ - int consumed; - int dst_length; - int bit_length; - const uint8_t *ptr; - int err; - - if (buf_index >= buf_size){ - break; - } else { - // start code prefix search - for(; buf_index + 3 < buf_size; buf_index++){ - // This should always succeed in the first iteration. 
- if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1) - break; - } - if(buf_index+3 >= buf_size) break; - buf_index+=3; - } - - { - int length = next_avc - buf_index; - int i, si, di; - uint8_t *src= buf+buf_index; - // src[0]&0x80; //forbidden bit - s->nal_ref_idc= src[0]>>5; - s->nal_unit_type= src[0]&0x1F; - - src++; length--; - - for(i=0; i+10 && src[i-1]==0) i--; - if(i+2=length-1){ //no escaped 0 - dst_length= length; - consumed= length+1; //+1 for the header - ptr=src; - }else{ - av_fast_malloc(&gb->rbsp, &gb->rbsp_size, length+FF_INPUT_BUFFER_PADDING_SIZE); - dst = gb->rbsp; -// if (dst){ -// av_free(dst); -// } -// dst = av_malloc(length+FF_INPUT_BUFFER_PADDING_SIZE); - - if (dst == NULL){ - return -1; - } - - //printf("decoding esc\n"); - memcpy(dst, src, i); - si=di=i; - while(si+23){ - dst[di++]= src[si++]; - dst[di++]= src[si++]; - }else if(src[si]==0 && src[si+1]==0){ - if(src[si+2]==3){ //escape - dst[di++]= 0; - dst[di++]= 0; - si+=3; - continue; - }else //next start code - goto nsc; - } - - dst[di++]= src[si++]; - } - while(sirbsp=ptr; - } - } - if (ptr==NULL || dst_length < 0){ - return -1; - } - - //error prevention, should not touch dst_length - while(ptr[dst_length - 1] == 0 && dst_length > 0) - dst_length--; - - bit_length= !dst_length ? 
0 : (8*dst_length - ff_h264_decode_rbsp_trailing(ptr + dst_length - 1)); - buf_index += consumed; - - err = 0; - init_get_bits(gb, ptr, bit_length); - switch(s->nal_unit_type){ - case NAL_IDR_SLICE: - idr(n, s); //FIXME ensure we don't loose some frames if there is reordering - case NAL_SLICE: - if((err = decode_slice_header(n, s, gb))) - break; - s->key_frame |= (s->nal_unit_type == NAL_IDR_SLICE) || (n->sei_recovery_frame_cnt >= 0); - break; - case NAL_DPA: - case NAL_DPB: - case NAL_DPC: - av_log(AV_LOG_ERROR,"no slices/data partitioning support\n"); - break; - case NAL_SEI: - ff_h264_decode_sei(n, gb); - break; - case NAL_SPS: - ff_h264_decode_seq_parameter_set(n, gb); - break; - case NAL_PPS: - ff_h264_decode_picture_parameter_set(n, gb, bit_length); - break; - case NAL_AUD: - case NAL_END_SEQUENCE: - case NAL_END_STREAM: - case NAL_FILLER_DATA: - case NAL_SPS_EXT: - case NAL_AUXILIARY_SLICE: - break; - default: - av_log(AV_LOG_ERROR, "Unknown NAL code: %d (%d bits)\n", s->nal_unit_type, bit_length); - } - if (err < 0) - av_log(AV_LOG_ERROR, "decode_slice_header error\n"); - - } - - return buf_index; -} - -NalContext *get_nal_context(int width, int height){ - const int mb_height = (height + 15) / 16; - const int mb_width = (width + 15) / 16; - const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16 - - NalContext *nc = av_mallocz(sizeof(NalContext)); - nc->width = width; - nc->height = height; - nc->mb_height = mb_height; - nc->mb_width = mb_width; - nc->b4_stride = mb_width*4 + 1; - nc->mb_stride = mb_stride; - nc->outputed_poc = INT_MIN; - - for(int i=0; i<16; i++){ - nc->picture[i].cpn =-1; - } - - return nc; -} - -void free_nal_context(NalContext *nc){ - for(int i = 0; i < MAX_SPS_COUNT; i++){ - if (nc->sps_buffers[i]){ - av_free( nc->sps_buffers[i]); - } - } - for(int i = 0; i < MAX_PPS_COUNT; i++){ - if (nc->pps_buffers[i]){ - av_free( nc->pps_buffers[i]); - } - } - av_free(nc); -} diff -r 11d15c47beaf -r 897f711a7157 
ffmpeg_smp/h264dec/libavcodec/h264_nal.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_nal.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -#ifndef H264_NAL_H -#define H264_NAL_H - -#include "avcodec.h" -#include "h264_types.h" - -int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb); -NalContext *get_nal_context(int width, int height); -void free_nal_context(NalContext *nc); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_numa.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_numa.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ - -#include -#include "h264.h" -#include "malloc.h" - -/* -* Pthread version with affinity lock for ED and MBD threads. Deprecated -*/ -int av_transcode_pthread_affinity(int ifile, int ofile, int frame_width, int frame_height, h264_options *opts) { - H264Context *h; - pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr; - - h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height, opts); - timer_start = av_gettime(); - - pthread_create(&read_thr, NULL, read_thread, h); - pthread_create(&parsenal_thr, NULL, parsenal_thread, h); - pthread_create(&entropy_thr, NULL, entropy_IPB_thread, h); - pthread_create(&mbdec_thr, NULL, mbdec_thread, h); - pthread_create(&write_thr, NULL, write_thread, h); - - - pthread_join(read_thr, NULL); - pthread_join(parsenal_thr, NULL); - pthread_join(entropy_thr, NULL); - pthread_join(mbdec_thr, NULL); - pthread_join(write_thr, NULL); - - /* finished ! */ - ff_h264_decode_end(h); - - return 0; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_ompss.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_ompss.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,400 +0,0 @@ -/* -* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder -* Copyright (c) 2003 Michael Niedermayer -* -* This file is part of FFmpeg. 
-* -* FFmpeg is free software; you can redistribute it and/or -* modify it under the terms of the GNU Lesser General Public -* License as published by the Free Software Foundation; either -* version 2.1 of the License, or (at your option) any later version. -* -* FFmpeg is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -* Lesser General Public License for more details. -* -* You should have received a copy of the GNU Lesser General Public -* License along with FFmpeg; if not, write to the Free Software -* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -*/ -#include "h264_types.h" -#include "h264_parser.h" -#include "h264_nal.h" -#include "h264_entropy.h" -#include "h264_rec.h" -#include "h264_pred_mode.h" -#include "h264_misc.h" -// #undef NDEBUG -#include - -#pragma omp task inout(*pc, *nc) output(*sbe) -static void parse_task(H264Context *h, ParserContext *pc, NalContext *nc, SliceBufferEntry *sbe){ - H264Slice *s; - - if (!sbe->initialized){ - init_sb_entry(h, sbe); - sbe->lines_total=h->mb_height; - } - - av_read_frame_internal(pc, &sbe->gb); - s = &sbe->slice; - - decode_nal_units(nc, s, &sbe->gb); -} - -#pragma omp task inout(*ec) inout(*sbe) -static void decode_slice_entropy_task(H264Context *h, EntropyContext *ec, SliceBufferEntry *sbe){ - int i,j; - H264Slice *s = &sbe->slice; - GetBitContext *gb = &sbe->gb; - H264Mb *mbs = sbe->mbs; -// GetBitContext *gb = s->gb; - CABACContext *c = &ec->c; - - if( !s->pps.cabac ){ - av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); - return ; - } - - init_dequant_tables(s, ec); - ec->curr_qscale = s->qscale; - ec->last_qscale_diff = 0; - ec->chroma_qp[0] = get_chroma_qp((H264Slice *) s, 0, s->qscale); - ec->chroma_qp[1] = get_chroma_qp((H264Slice *) s, 1, s->qscale); - - /* realign */ - align_get_bits( gb ); - /* init cabac */ - 
ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8); - - ff_h264_init_cabac_states(ec, s, c); - - for(j=0; jmb_height; j++){ - init_entropy_buf(ec, s, j); - for(i=0; imb_width; i++){ - int eos,ret; - H264Mb *m = &mbs[i + j*ec->mb_width]; - m->mb_x=i; - m->mb_y=j; - ec->m = m; - - ret = ff_h264_decode_mb_cabac(ec, s, c); - eos = get_cabac_terminate( c); - (void) eos; - if( ret < 0 || c->bytestream > c->bytestream_end + 2) { - av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream); - return ; - } - } - } -} - -static void decode_super_mb_block(MBRecContext *d, H264Slice *s, SuperMBContext *smbc, H264Mb *mbs, int smb_x, int smb_y){ - MBRecState mrs; -// memset(&mrs, 0, sizeof(MBRecState)); - - for (int k=0, i= smb_y; i< smb_y + smbc->smb_height; i++, k++){ - init_mbrec_context(d, &mrs, s, i); - for (int j= smb_x -k ; j< smb_x - k + smbc->smb_width; j++){ - if (i< d->mb_height && j >= 0 && j < d->mb_width){ - h264_decode_mb_internal (d, &mrs, s, &mbs[i*d->mb_width+j]); - } - } - } -} - -#pragma omp task input(*d, *sbe, *ml, *mur) inout(*m) -static void decode_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *ml, -SuperMBTask *mur, SuperMBTask *m){ - H264Slice *s = &sbe->slice; - H264Mb *mbs = sbe->mbs; - decode_super_mb_block(d, s, smbc, mbs, m->smb_x, m->smb_y); -} - -#pragma omp task input(*d, *sbe) inout(*sm) -static void draw_edges_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *sm, int line){ - H264Slice *s = &sbe->slice; - for (int i=line*smbc->smb_height; i< (line+1)*smbc->smb_height && i< d->mb_height; i++) - draw_edges(d, s, i); -} - -static void decode_mb_in_slice(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){ - int i,j; - - SuperMBContext *smbc = acquire_smbc(h); - int smb_height =smbc->nsmb_height, smb_width= smbc->nsmb_width; - SuperMBTask *smbs = smbc->smbs[0]; - - 
SuperMBTask *sm=NULL, *sml, *smur; - for(j=0; j< smb_height; j++){ - for(i=0; i< smb_width; i++){ - sm = smbs + j*smb_width + i; - sml = sm - ((i > 0) ? 1: 0); - smur = sm + (((i < smb_width-1) && (j >0)) ? -smb_width+1: 0); - decode_super_mb_task(d, sbe, smbc, sml, smur, sm); - } - draw_edges_task(d, sbe, smbc, sm, j); - } - #pragma omp taskwait on(*sm) - - release_smbc(h, smbc); -} - -#pragma omp task inout(*d) inout(*sbe) -static void decode_slice_mb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){ - H264Slice *s = &sbe->slice; - - for (int i=0; i<2; i++){ - for(int j=0; j< s->ref_count[i]; j++){ - if (s->ref_list_cpn[i][j] ==-1) - continue; - int k; - for (k=0; k< h->max_dpb_cnt; k++){ - if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ - s->dp_ref_list[i][j] = &h->dpb[k]; - break; - } - } - } - } - - #pragma omp critical (dpb) - get_dpb_entry(h, s); - - if (!h->no_mbd){ - decode_mb_in_slice (h, d, sbe); - } - - for (int i=0; irelease_cnt; i++){ - for(int j=0; jmax_dpb_cnt; j++){ - if(h->dpb[j].cpn== s->release_ref_cpn[i]){ - #pragma omp critical (dpb) - release_dpb_entry(h, &h->dpb[j], 2); - break; - } - } - } - s->release_cnt=0; -} - -// for static 3d wave -/*-------------------------------------------------------------------------------*/ -#pragma omp task input(*d, *sbe, *ml, *mur, *mprev) inout(*m) -static void decode_3dwave_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *ml, -SuperMBTask *mur, SuperMBTask *mprev, SuperMBTask *m){ - H264Slice *s = &sbe->slice; - H264Mb *mbs = sbe->mbs; - - decode_super_mb_block(d, s, smbc, mbs, m->smb_x, m->smb_y); -} - -// int init_ref_count=0; -#pragma omp task inout(*d, *sbe, *init) -static void init_ref_list_and_get_dpb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe, int *init){ - H264Slice *s = &sbe->slice; - for (int i=0; i<2; i++){ - for(int j=0; j< s->ref_count[i]; j++){ - if (s->ref_list_cpn[i][j] ==-1) - continue; - int k; - 
for (k=0; kmax_dpb_cnt; k++){ - if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ - s->dp_ref_list[i][j] = &h->dpb[k]; - break; - } - } - } - } - - #pragma omp critical (dpb) - get_dpb_entry(h, s); - -} - -static SuperMBTask* add_decode_slice_3dwave_tasks(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc){ - int i,j; - - int smb_3d_height =smbc->nsmb_3dheight; - int smb_height =smbc->nsmb_height, smb_width= smbc->nsmb_width; - int smb_diff_prev = smb_height - smb_3d_height; - SuperMBTask *sm=NULL, *sml, *smur, *smprev; - - SuperMBTask *smbs = smbc->smbs[smbc->index++]; smbc->index%=2; - SuperMBTask *smbs_prev = smbc->smbs[smbc->index]; // index rotates -> next == prev - - for(j=0; j 0) ? 1: 0); - smur = sm + (((i < smb_width-1) && (j >0)) ? -smb_width+1: 0); - smprev = smbs_prev + (j + smb_diff_prev+1)*smb_width -1; - decode_3dwave_super_mb_task(d, sbe, smbc, sml, smur, smprev, sm); - } - draw_edges_task(d, sbe, smbc, sm, j); - } - - for(; j< smb_height; j++){ - for(i=0; i< smb_width; i++){ - sm = smbs + j*smb_width + i; - sml = sm - ((i > 0) ? 1: 0); - smur = sm + (((i < smb_width-1) && (j >0)) ? 
-smb_width+1: 0); - decode_super_mb_task(d, sbe, smbc, sml, smur, sm); - } - draw_edges_task(d, sbe, smbc, sm, j); - } - return sm; -} - -#pragma omp task inout(*d, *sbe, *release) input (*lastsmb) -static void release_ref_list_task(H264Context *h, SuperMBContext *smbc, MBRecContext *d, SliceBufferEntry *sbe, SuperMBTask *lastsmb, int *release){ - H264Slice *s = &sbe->slice; - for (int i=0; irelease_cnt; i++){ - for(int j=0; jmax_dpb_cnt; j++){ - if(h->dpb[j].cpn== s->release_ref_cpn[i]){ - #pragma omp critical (dpb) - release_dpb_entry(h, &h->dpb[j], 2); - break; - } - } - } - s->release_cnt=0; - - release_smbc(h, smbc); - -} - -// static void decode_mb_static_3dwave(H264Context *h, int mb_height, int mb_width, MBRecContext *d, H264Slice *s, H264Mb *mbs, SuperMBTask *smbs, SuperMBTask *smbs_prev){ -// -// } -/*-------------------------------------------------------------------------------*/ -//end for static 3d wave - -#pragma omp task inout (*oc) input(*sbe) -static void output_task(H264Context *h, OutputContext *oc, SliceBufferEntry *sbe){ - DecodedPicture* out =output_frame(h, oc, sbe->slice.curr_pic, h->ofile, h->frame_width, h->frame_height); - if (out){ - #pragma omp critical (dpb) - release_dpb_entry(h, out, 1); - } - print_report(oc->frame_number, oc->video_size, 0, h->verbose); -} - -/* -* The following code is the main loop of the file converter -*/ -int h264_decode_ompss( H264Context *h) { - const int bufs = h->pipe_bufs; - - ParserContext *pc; - NalContext *nc; - EntropyContext *ec[bufs]; - MBRecContext *rc[2]; - OutputContext *oc; - SliceBufferEntry *sbe; - SuperMBContext *smbc; - - DecodedPicture *out; - int frames=0; - -#if HAVE_LIBSDL2 - pthread_t sdl_thr; - if (h->display){ - pthread_create(&sdl_thr, NULL, sdl_thread, h); - } -#endif - sbe= av_mallocz(sizeof(SliceBufferEntry) * bufs); - - - pc = get_parse_context(h->ifile); - nc = get_nal_context(h->width, h->height); - - for(int i=0; istatic_3d && bufs < h->num_frames ){ - int num_pre_ed =0; - 
for (num_pre_ed=0; num_pre_ed< bufs -1 && !pc->final_frame; num_pre_ed++){ - parse_task( h, pc, nc, &sbe[k%bufs] ); - decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]); - #pragma omp taskwait on(*pc) - k++; - } - - while(!pc->final_frame && frames++ < h->num_frames && !h->quit){ - parse_task( h, pc, nc, &sbe[k%bufs] ); - decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]); - - k++; - - init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init); - smbc = acquire_smbc(h); - SuperMBTask *lastsmb= add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc); - release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release); - - output_task (h, oc, &sbe[k%bufs]); - #pragma omp taskwait on(*pc) - } - - for (int i=0; i< num_pre_ed; i++){ - k++; - init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init); - smbc = acquire_smbc(h); - SuperMBTask *lastsmb= add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc); - release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release); - - output_task (h, oc, &sbe[k%bufs]); - } - - } else { - while(!pc->final_frame && frames++ < h->num_frames && !h->quit){ - parse_task( h, pc, nc, &sbe[k%bufs] ); - - decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]); - - decode_slice_mb_task(h, rc[0], &sbe[k%bufs]); - - output_task (h, oc, &sbe[k%bufs]); - #pragma omp taskwait on(*pc) - k++; - } - } - #pragma omp taskwait - - while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; - - print_report(oc->frame_number, oc->video_size, 1, h->verbose); - h->num_frames = oc->frame_number; - /* finished ! 
*/ - - free_parse_context(pc); - free_nal_context (nc); - free_output_context(oc); - for (int i=0; idisplay){ - signal_sdl_exit(h); - pthread_join(sdl_thr, NULL); - } -#endif - - return 0; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_parser.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_parser.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,224 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... parser - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 parser. 
- * @author Michael Niedermayer - */ - -#include - -#include "golomb.h" -#include "libavutil/error.h" -#include "h264_types.h" - -#undef NDEBUG -#include - -#define END_NOT_FOUND (-100) - -static int ff_h264_find_frame_end(ParserContext *s, const uint8_t *buf, int buf_size) -{ - int i; - uint32_t state; - - state= s->state; - if(state>13) - state= 7; - - for(i=0; i7, 1->4, 0->5 - else if(buf[i]) state = 7; - else state>>=1; //2->1, 1->0, 0->0 - }else if(state<=5){ - int v= buf[i] & 0x1F; - if(v==6 || v==7 || v==8 || v==9){ - if(s->frame_start_found){ - i++; - goto found; - } - }else if(v==1 || v==2 || v==5){ - if(s->frame_start_found){ - state+=8; - continue; - }else - s->frame_start_found = 1; - } - state= 7; - }else{ - if(buf[i] & 0x80) - goto found; - state= 7; - } - } - s->state= state; - return END_NOT_FOUND; - -found: - s->state=7; - s->frame_start_found= 0; - return i-(state&5); -} - -static int ff_combine_frame(ParserContext *s, GetBitContext *gb, int next, uint8_t **buf, int *buf_size) -{ - int i; - /* Copy overread bytes from last frame into buffer. */ - for(i =0; s->overread_cnt>0; s->overread_cnt--, i++){ - gb->raw[s->index++]= s->overread[i]; - } - - /* EOF - END_NOT_FOUND means no next frame start is found in current partial read. 
If buf_size of the partial read is 0 we are at EOF */ - if(!*buf_size && next == END_NOT_FOUND){ - next= 0; - } - s->last_index= s->index; - - /* copy into buffer end return */ - if(next == END_NOT_FOUND){ - gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, (*buf_size) + s->index + FF_INPUT_BUFFER_PADDING_SIZE); - memcpy(&gb->raw[s->index], *buf, *buf_size); - s->index += *buf_size; - return -1; - } - - ///end found - *buf_size= s->index + next; - /* append to buffer */ - - gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, next + s->index + FF_INPUT_BUFFER_PADDING_SIZE); - memcpy(&gb->raw[s->index], *buf, next + FF_INPUT_BUFFER_PADDING_SIZE ); - s->index = 0; - - /* store overread bytes */ - for(i=0; next < 0; next++, i++){ - s->state = (s->state<<8) | gb->raw[s->last_index + next]; - s->overread[i] = gb->raw[s->last_index + next]; - s->overread_cnt++; - } - - return 0; -} - -static int h264_parse(ParserContext *s, GetBitContext *gb, - uint8_t *buf, int buf_size) -{ - int next; - - next= ff_h264_find_frame_end(s, buf, buf_size); - - if (ff_combine_frame(s, gb, next, &buf, &buf_size) < 0) { - gb->buf_size = 0; - return buf_size; - } - - if(next<0 && next != END_NOT_FOUND){ - assert(s->last_index + next >= 0 ); - ff_h264_find_frame_end(s, &gb->raw[s->last_index + next], -next); //update state - } - - gb->buf_size = buf_size; - return next; -} - -static int ff_raw_read_partial_packet(ParserContext *pc) -{ - int len= -1; - - if (!pc->eof_reached){ - len = read( pc->ifile, pc->data, pc->buffer_size); -// printf("read task %d\t%d\n", pc->ifile, len); fflush(NULL); - if (len < pc->buffer_size) { - pc->eof_reached = 1; - } - } - - return len; -} - -void av_read_frame_internal(ParserContext *pc, GetBitContext *gb){ - int len; - uint8_t dummy_buf[FF_INPUT_BUFFER_PADDING_SIZE]={0}; - av_fast_malloc(&gb->raw, &gb->alloc_size, 2048+FF_INPUT_BUFFER_PADDING_SIZE); - - //Parsing is performed before read, since there are ussually leftovers from parsing the previous frame. 
- for(;;) { - if (pc->cur_len>0){ - len = h264_parse(pc, gb, pc->cur_ptr, pc->cur_len); - if (len<0) - len =0; - //* increment read pointer */ - pc->cur_ptr += len; - pc->cur_len -= len; - - if (gb->buf_size) { - break; - } - } - - //check for ret and not parser->eof_reached as one "read" can contain more than 1 frame - pc->size= ff_raw_read_partial_packet(pc); - if (pc->size < 0) { - pc->final_frame =1; - /* return the last frames, if any */ - h264_parse(pc, gb, dummy_buf, 0); - break; - } - pc->cur_ptr = pc->data; - pc->cur_len = pc->size; - } - - assert(gb->raw!=NULL); - -} - -ParserContext *get_parse_context(int ifile){ - ParserContext *pc = av_mallocz(sizeof(ParserContext)); - pc->buffer_size = 2048; - pc->final_frame = 0; - pc->cur_len= 0; - pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE); - pc->size = 2048; - pc->eof_reached =0; - pc->ifile = ifile; - - return pc; -} - -void free_parse_context(ParserContext *pc){ - av_free(pc->data); - av_free(pc); -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_parser.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_parser.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -#ifndef H264_PARSER_H -#define H264_PARSER_H - -#include "h264_types.h" - -void av_read_frame_internal(ParserContext *pc, GetBitContext *gb); -ParserContext *get_parse_context(int ifile); -void free_parse_context(ParserContext *pc); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pred.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_pred.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,945 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 prediction functions. - * @author Michael Niedermayer - */ - -#include "avcodec.h" -#include "h264_pred.h" -//#include "dsputil.h" - -static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const uint32_t a= ((uint32_t*)(src-stride))[0]; - ((uint32_t*)(src+0*stride))[0]= a; - ((uint32_t*)(src+1*stride))[0]= a; - ((uint32_t*)(src+2*stride))[0]= a; - ((uint32_t*)(src+3*stride))[0]= a; -} - -static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101; - ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101; - ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101; - ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101; -} - -static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] - + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; - - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; -} - 
-static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; - - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; -} - -static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; - - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; -} - -static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - ((uint32_t*)(src+0*stride))[0]= - ((uint32_t*)(src+1*stride))[0]= - ((uint32_t*)(src+2*stride))[0]= - ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U; -} - - -#define LOAD_TOP_RIGHT_EDGE\ - const int av_unused t4= topright[0];\ - const int av_unused t5= topright[1];\ - const int av_unused t6= topright[2];\ - const int av_unused t7= topright[3];\ - -#define LOAD_DOWN_LEFT_EDGE\ - const int av_unused l4= src[-1+4*stride];\ - const int av_unused l5= src[-1+5*stride];\ - const int av_unused l6= src[-1+6*stride];\ - const int av_unused l7= src[-1+7*stride];\ - -#define LOAD_LEFT_EDGE\ - const int av_unused l0= src[-1+0*stride];\ - const int av_unused l1= src[-1+1*stride];\ - const int av_unused l2= src[-1+2*stride];\ - const int av_unused l3= src[-1+3*stride];\ - -#define LOAD_TOP_EDGE\ - const int av_unused t0= src[ 0-1*stride];\ - const int av_unused t1= src[ 1-1*stride];\ - const int av_unused t2= src[ 2-1*stride];\ - const int av_unused t3= src[ 3-1*stride];\ - -static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const int lt= src[-1-1*stride]; - LOAD_TOP_EDGE - LOAD_LEFT_EDGE - - src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2; - 
src[0+2*stride]= - src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2; - src[0+1*stride]= - src[1+2*stride]= - src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2; - src[0+0*stride]= - src[1+1*stride]= - src[2+2*stride]= - src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2; - src[1+0*stride]= - src[2+1*stride]= - src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2; - src[2+0*stride]= - src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2; -} - -static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){ - LOAD_TOP_EDGE - LOAD_TOP_RIGHT_EDGE -// LOAD_LEFT_EDGE - - src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2; - src[1+0*stride]= - src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2; - src[2+0*stride]= - src[1+1*stride]= - src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2; - src[3+0*stride]= - src[2+1*stride]= - src[1+2*stride]= - src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2; - src[3+1*stride]= - src[2+2*stride]= - src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2; - src[3+2*stride]= - src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2; - src[3+3*stride]=(t6 + 3*t7 + 2)>>2; -} - -static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const int lt= src[-1-1*stride]; - LOAD_TOP_EDGE - LOAD_LEFT_EDGE - - src[0+0*stride]= - src[1+2*stride]=(lt + t0 + 1)>>1; - src[1+0*stride]= - src[2+2*stride]=(t0 + t1 + 1)>>1; - src[2+0*stride]= - src[3+2*stride]=(t1 + t2 + 1)>>1; - src[3+0*stride]=(t2 + t3 + 1)>>1; - src[0+1*stride]= - src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2; - src[1+1*stride]= - src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2; - src[2+1*stride]= - src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2; - src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2; - src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; -} - -static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){ - LOAD_TOP_EDGE - LOAD_TOP_RIGHT_EDGE - - src[0+0*stride]=(t0 + t1 + 1)>>1; - src[1+0*stride]= - src[0+2*stride]=(t1 + t2 + 1)>>1; - src[2+0*stride]= - src[1+2*stride]=(t2 + t3 + 
1)>>1; - src[3+0*stride]= - src[2+2*stride]=(t3 + t4+ 1)>>1; - src[3+2*stride]=(t4 + t5+ 1)>>1; - src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[1+1*stride]= - src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; - src[2+1*stride]= - src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; - src[3+1*stride]= - src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; - src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; -} - -static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - LOAD_LEFT_EDGE - - src[0+0*stride]=(l0 + l1 + 1)>>1; - src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2; - src[2+0*stride]= - src[0+1*stride]=(l1 + l2 + 1)>>1; - src[3+0*stride]= - src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2; - src[2+1*stride]= - src[0+2*stride]=(l2 + l3 + 1)>>1; - src[3+1*stride]= - src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; - src[3+2*stride]= - src[1+3*stride]= - src[0+3*stride]= - src[2+2*stride]= - src[2+3*stride]= - src[3+3*stride]=l3; -} - - -static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){ - (void) topright; - const int lt= src[-1-1*stride]; - LOAD_TOP_EDGE - LOAD_LEFT_EDGE - - src[0+0*stride]= - src[2+1*stride]=(lt + l0 + 1)>>1; - src[1+0*stride]= - src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2; - src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2; - src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2; - src[0+1*stride]= - src[2+2*stride]=(l0 + l1 + 1)>>1; - src[1+1*stride]= - src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2; - src[0+2*stride]= - src[2+3*stride]=(l1 + l2+ 1)>>1; - src[1+2*stride]= - src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; - src[0+3*stride]=(l2 + l3 + 1)>>1; - src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2; -} - -static void pred16x16_vertical_c(uint8_t *src, int stride){ - int i; - const uint32_t a= ((uint32_t*)(src-stride))[0]; - const uint32_t b= ((uint32_t*)(src-stride))[1]; - const uint32_t c= ((uint32_t*)(src-stride))[2]; - const uint32_t d= ((uint32_t*)(src-stride))[3]; - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= a; - ((uint32_t*)(src+i*stride))[1]= b; - 
((uint32_t*)(src+i*stride))[2]= c; - ((uint32_t*)(src+i*stride))[3]= d; - } -} - -static void pred16x16_horizontal_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101; - } -} - -static void pred16x16_dc_c(uint8_t *src, int stride){ - int i, dc=0; - - for(i=0;i<16; i++){ - dc+= src[-1+i*stride]; - } - - for(i=0;i<16; i++){ - dc+= src[i-stride]; - } - - dc= 0x01010101*((dc + 16)>>5); - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc; - } -} - -static void pred16x16_left_dc_c(uint8_t *src, int stride){ - int i, dc=0; - - for(i=0;i<16; i++){ - dc+= src[-1+i*stride]; - } - - dc= 0x01010101*((dc + 8)>>4); - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc; - } -} - -static void pred16x16_top_dc_c(uint8_t *src, int stride){ - int i, dc=0; - - for(i=0;i<16; i++){ - dc+= src[i-stride]; - } - dc= 0x01010101*((dc + 8)>>4); - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= dc; - } -} - -static void pred16x16_128_dc_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<16; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= - ((uint32_t*)(src+i*stride))[2]= - ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U; - } -} - -static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3, const int rv40){ - int i, j, k; - int a; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - const uint8_t * const src0 = src+7-stride; - const uint8_t *src1 = src+8*stride-1; - const uint8_t *src2 = src1-2*stride; // == src+6*stride-1; - int H = src0[1] - 
src0[-1]; - int V = src1[0] - src2[ 0]; - for(k=2; k<=8; ++k) { - src1 += stride; src2 -= stride; - H += k*(src0[k] - src0[-k]); - V += k*(src1[0] - src2[ 0]); - } - if(svq3){ - H = ( 5*(H/4) ) / 16; - V = ( 5*(V/4) ) / 16; - - /* required for 100% accuracy */ - i = H; H = V; V = i; - }else if(rv40){ - H = ( H + (H>>2) ) >> 4; - V = ( V + (V>>2) ) >> 4; - }else{ - H = ( 5*H+32 ) >> 6; - V = ( 5*V+32 ) >> 6; - } - - a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); - for(j=16; j>0; --j) { - int b = a; - a += V; - for(i=-16; i<0; i+=4) { - src[16+i] = cm[ (b ) >> 5 ]; - src[17+i] = cm[ (b+ H) >> 5 ]; - src[18+i] = cm[ (b+2*H) >> 5 ]; - src[19+i] = cm[ (b+3*H) >> 5 ]; - b += 4*H; - } - src += stride; - } -} - -static void pred16x16_plane_c(uint8_t *src, int stride){ - pred16x16_plane_compat_c(src, stride, 0, 0); -} - - -static void pred8x8_vertical_c(uint8_t *src, int stride){ - int i; - const uint32_t a= ((uint32_t*)(src-stride))[0]; - const uint32_t b= ((uint32_t*)(src-stride))[1]; - - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= a; - ((uint32_t*)(src+i*stride))[1]= b; - } -} - -static void pred8x8_horizontal_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101; - } -} - -static void pred8x8_128_dc_c(uint8_t *src, int stride){ - int i; - - for(i=0; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U; - } -} - -static void pred8x8_left_dc_c(uint8_t *src, int stride){ - int i; - int dc0, dc2; - - dc0=dc2=0; - for(i=0;i<4; i++){ - dc0+= src[-1+i*stride]; - dc2+= src[-1+(i+4)*stride]; - } - dc0= 0x01010101*((dc0 + 2)>>2); - dc2= 0x01010101*((dc2 + 2)>>2); - - for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= dc0; - } - for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= - ((uint32_t*)(src+i*stride))[1]= dc2; - } -} - - -static void pred8x8_top_dc_c(uint8_t *src, int stride){ - int 
i; - int dc0, dc1; - - dc0=dc1=0; - for(i=0;i<4; i++){ - dc0+= src[i-stride]; - dc1+= src[4+i-stride]; - } - dc0= 0x01010101*((dc0 + 2)>>2); - dc1= 0x01010101*((dc1 + 2)>>2); - - for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; - } - for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; - } -} - -static void pred8x8_dc_c(uint8_t *src, int stride){ - int i; - int dc0, dc1, dc2, dc3; - - dc0=dc1=dc2=0; - for(i=0;i<4; i++){ - dc0+= src[-1+i*stride] + src[i-stride]; - dc1+= src[4+i-stride]; - dc2+= src[-1+(i+4)*stride]; - } - dc3= 0x01010101*((dc1 + dc2 + 4)>>3); - dc0= 0x01010101*((dc0 + 4)>>3); - dc1= 0x01010101*((dc1 + 2)>>2); - dc2= 0x01010101*((dc2 + 2)>>2); - - for(i=0; i<4; i++){ - ((uint32_t*)(src+i*stride))[0]= dc0; - ((uint32_t*)(src+i*stride))[1]= dc1; - } - for(i=4; i<8; i++){ - ((uint32_t*)(src+i*stride))[0]= dc2; - ((uint32_t*)(src+i*stride))[1]= dc3; - } -} - -//the following 4 function should not be optimized! 
-static void pred8x8_mad_cow_dc_l0t(uint8_t *src, int stride){ - pred8x8_top_dc_c(src, stride); - pred4x4_dc_c(src, NULL, stride); -} - -static void pred8x8_mad_cow_dc_0lt(uint8_t *src, int stride){ - pred8x8_dc_c(src, stride); - pred4x4_top_dc_c(src, NULL, stride); -} - -static void pred8x8_mad_cow_dc_l00(uint8_t *src, int stride){ - pred8x8_left_dc_c(src, stride); - pred4x4_128_dc_c(src + 4*stride , NULL, stride); - pred4x4_128_dc_c(src + 4*stride + 4, NULL, stride); -} - -static void pred8x8_mad_cow_dc_0l0(uint8_t *src, int stride){ - pred8x8_left_dc_c(src, stride); - pred4x4_128_dc_c(src , NULL, stride); - pred4x4_128_dc_c(src + 4, NULL, stride); -} - -static void pred8x8_plane_c(uint8_t *src, int stride){ - int j, k; - int a; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - const uint8_t * const src0 = src+3-stride; - const uint8_t *src1 = src+4*stride-1; - const uint8_t *src2 = src1-2*stride; // == src+2*stride-1; - int H = src0[1] - src0[-1]; - int V = src1[0] - src2[ 0]; - for(k=2; k<=4; ++k) { - src1 += stride; src2 -= stride; - H += k*(src0[k] - src0[-k]); - V += k*(src1[0] - src2[ 0]); - } - H = ( 17*H+16 ) >> 5; - V = ( 17*V+16 ) >> 5; - - a = 16*(src1[0] + src2[8]+1) - 3*(V+H); - for(j=8; j>0; --j) { - int b = a; - a += V; - src[0] = cm[ (b ) >> 5 ]; - src[1] = cm[ (b+ H) >> 5 ]; - src[2] = cm[ (b+2*H) >> 5 ]; - src[3] = cm[ (b+3*H) >> 5 ]; - src[4] = cm[ (b+4*H) >> 5 ]; - src[5] = cm[ (b+5*H) >> 5 ]; - src[6] = cm[ (b+6*H) >> 5 ]; - src[7] = cm[ (b+7*H) >> 5 ]; - src += stride; - } -} - -#define SRC(x,y) src[(x)+(y)*stride] -#define PL(y) \ - const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; -#define PREDICT_8x8_LOAD_LEFT \ - const int l0 = ((has_topleft ? 
SRC(-1,-1) : SRC(-1,0)) \ - + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \ - PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \ - const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 - -#define PT(x) \ - const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; -#define PREDICT_8x8_LOAD_TOP \ - const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \ - + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \ - PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \ - const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \ - + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2 - -#define PTR(x) \ - t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; -#define PREDICT_8x8_LOAD_TOPRIGHT \ - int t8, t9, t10, t11, t12, t13, t14, t15; \ - if(has_topright) { \ - PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \ - t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \ - } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); - -#define PREDICT_8x8_LOAD_TOPLEFT \ - const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2 - -#define PREDICT_8x8_DC(v) \ - int y; \ - for( y = 0; y < 8; y++ ) { \ - ((uint32_t*)src)[0] = \ - ((uint32_t*)src)[1] = v; \ - src += stride; \ - } - -static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ - (void) has_topleft; (void) has_topright; - PREDICT_8x8_DC(0x80808080); -} - -static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ - (void) has_topleft; (void) has_topright; - PREDICT_8x8_LOAD_LEFT; - const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; - PREDICT_8x8_DC(dc); -} - -static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ - PREDICT_8x8_LOAD_TOP; - const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; - PREDICT_8x8_DC(dc); -} - -static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOP; - const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7 - +t0+t1+t2+t3+t4+t5+t6+t7+8) 
>> 4) * 0x01010101; - PREDICT_8x8_DC(dc); -} - -static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride){ - (void) has_topleft; (void) has_topright; - PREDICT_8x8_LOAD_LEFT; -#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\ - ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y - ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); -#undef ROW -} - -static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride){ - int y; - PREDICT_8x8_LOAD_TOP; - src[0] = t0; - src[1] = t1; - src[2] = t2; - src[3] = t3; - src[4] = t4; - src[5] = t5; - src[6] = t6; - src[7] = t7; - for( y = 1; y < 8; y++ ) - *(uint64_t*)(src+y*stride) = *(uint64_t*)src; -} - -static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_TOPRIGHT; - SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2; - SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2; - SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2; - SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2; - SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2; - SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2; - SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2; - SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2; - SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2; - SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; - SRC(7,7)= (t14 + 3*t15 + 2) >> 2; -} - -static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int 
has_topright, int stride){ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOPLEFT; - SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; - SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2; - SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2; - SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2; - SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2; - SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2; - SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2; - SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; - SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2; - SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; - SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; -} - -static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride){ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOPLEFT; - SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; - SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; - SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; - SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2; - SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2; - SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2; - SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2; - SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; - SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2; - SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1; - SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1; - 
SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1; - SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1; - SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1; - SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2; - SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1; - SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; - SRC(7,0)= (t6 + t7 + 1) >> 1; -} - -static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride){ - PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_LEFT; - PREDICT_8x8_LOAD_TOPLEFT; - SRC(0,7)= (l6 + l7 + 1) >> 1; - SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; - SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; - SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2; - SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1; - SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2; - SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1; - SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2; - SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1; - SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2; - SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1; - SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2; - SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1; - SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2; - SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1; - SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2; - SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; - SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2; - SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2; - SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2; - SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; - SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; -} - -static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){ - 
PREDICT_8x8_LOAD_TOP; - PREDICT_8x8_LOAD_TOPRIGHT; - SRC(0,0)= (t0 + t1 + 1) >> 1; - SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; - SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; - SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2; - SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1; - SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2; - SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1; - SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2; - SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1; - SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2; - SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1; - SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2; - SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1; - SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2; - SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1; - SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2; - SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1; - SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2; - SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1; - SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2; - SRC(7,6)= (t10 + t11 + 1) >> 1; - SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; -} - -static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride){ - (void) has_topleft; (void) has_topright; - PREDICT_8x8_LOAD_LEFT; - SRC(0,0)= (l0 + l1 + 1) >> 1; - SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; - SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; - SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2; - SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1; - SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2; - SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1; - SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2; - SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1; - SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2; - SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1; - 
SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2; - SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1; - SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; - SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= - SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)= - SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)= - SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; -} -#undef PREDICT_8x8_LOAD_LEFT -#undef PREDICT_8x8_LOAD_TOP -#undef PREDICT_8x8_LOAD_TOPLEFT -#undef PREDICT_8x8_LOAD_TOPRIGHT -#undef PREDICT_8x8_DC -#undef PTR -#undef PT -#undef PL -#undef SRC - -static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ - int i; - pix -= stride; - for(i=0; i<4; i++){ - uint8_t v = pix[0]; - pix[1*stride]= v += block[0]; - pix[2*stride]= v += block[4]; - pix[3*stride]= v += block[8]; - pix[4*stride]= v + block[12]; - pix++; - block++; - } -} - -static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ - int i; - for(i=0; i<4; i++){ - uint8_t v = pix[-1]; - pix[0]= v += block[0]; - pix[1]= v += block[1]; - pix[2]= v += block[2]; - pix[3]= v + block[3]; - pix+= stride; - block+= 4; - } -} - -static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ - int i; - pix -= stride; - for(i=0; i<8; i++){ - uint8_t v = pix[0]; - pix[1*stride]= v += block[0]; - pix[2*stride]= v += block[8]; - pix[3*stride]= v += block[16]; - pix[4*stride]= v += block[24]; - pix[5*stride]= v += block[32]; - pix[6*stride]= v += block[40]; - pix[7*stride]= v += block[48]; - pix[8*stride]= v + block[56]; - pix++; - block++; - } -} - -static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ - int i; - for(i=0; i<8; i++){ - uint8_t v = pix[-1]; - pix[0]= v += block[0]; - pix[1]= v += block[1]; - pix[2]= v += block[2]; - pix[3]= v += block[3]; - pix[4]= v += block[4]; - pix[5]= v += block[5]; - pix[6]= v += block[6]; - pix[7]= v + block[7]; - pix+= stride; - block+= 8; - } -} - 
-static void pred16x16_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ - int i; - for(i=0; i<16; i++) - pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); -} - -static void pred16x16_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ - int i; - for(i=0; i<16; i++) - pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); -} - -static void pred8x8_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ - int i; - for(i=0; i<4; i++) - pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); -} - -static void pred8x8_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ - int i; - for(i=0; i<4; i++) - pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); -} - - -/** - * Sets the intra prediction function pointers. - */ -void ff_h264_pred_init(H264PredContext *h){ - - h->pred4x4[VERT_PRED ]= pred4x4_vertical_c; - h->pred4x4[HOR_PRED ]= pred4x4_horizontal_c; - h->pred4x4[DC_PRED ]= pred4x4_dc_c; - h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c; - h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c; - h->pred4x4[VERT_RIGHT_PRED ]= pred4x4_vertical_right_c; - h->pred4x4[HOR_DOWN_PRED ]= pred4x4_horizontal_down_c; - h->pred4x4[VERT_LEFT_PRED ]= pred4x4_vertical_left_c; - h->pred4x4[HOR_UP_PRED ]= pred4x4_horizontal_up_c; - h->pred4x4[LEFT_DC_PRED ]= pred4x4_left_dc_c; - h->pred4x4[TOP_DC_PRED ]= pred4x4_top_dc_c; - h->pred4x4[DC_128_PRED ]= pred4x4_128_dc_c; - - h->pred8x8l[VERT_PRED ]= pred8x8l_vertical_c; - h->pred8x8l[HOR_PRED ]= pred8x8l_horizontal_c; - h->pred8x8l[DC_PRED ]= pred8x8l_dc_c; - h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c; - h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c; - h->pred8x8l[VERT_RIGHT_PRED ]= pred8x8l_vertical_right_c; - h->pred8x8l[HOR_DOWN_PRED ]= pred8x8l_horizontal_down_c; - 
h->pred8x8l[VERT_LEFT_PRED ]= pred8x8l_vertical_left_c; - h->pred8x8l[HOR_UP_PRED ]= pred8x8l_horizontal_up_c; - h->pred8x8l[LEFT_DC_PRED ]= pred8x8l_left_dc_c; - h->pred8x8l[TOP_DC_PRED ]= pred8x8l_top_dc_c; - h->pred8x8l[DC_128_PRED ]= pred8x8l_128_dc_c; - - h->pred8x8[VERT_PRED8x8 ]= pred8x8_vertical_c; - h->pred8x8[HOR_PRED8x8 ]= pred8x8_horizontal_c; - h->pred8x8[PLANE_PRED8x8 ]= pred8x8_plane_c; - - h->pred8x8[DC_PRED8x8 ]= pred8x8_dc_c; - h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c; - h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c; - h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= pred8x8_mad_cow_dc_l0t; - h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= pred8x8_mad_cow_dc_0lt; - h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= pred8x8_mad_cow_dc_l00; - h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= pred8x8_mad_cow_dc_0l0; - - h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c; - - h->pred16x16[DC_PRED8x8 ]= pred16x16_dc_c; - h->pred16x16[VERT_PRED8x8 ]= pred16x16_vertical_c; - h->pred16x16[HOR_PRED8x8 ]= pred16x16_horizontal_c; - h->pred16x16[PLANE_PRED8x8 ]= pred16x16_plane_c; - - h->pred16x16[PLANE_PRED8x8 ]= pred16x16_plane_c; - - h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c; - h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c; - h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c; - - //special lossless h/v prediction for h264 - h->pred4x4_add [VERT_PRED ]= pred4x4_vertical_add_c; - h->pred4x4_add [ HOR_PRED ]= pred4x4_horizontal_add_c; - h->pred8x8l_add [VERT_PRED ]= pred8x8l_vertical_add_c; - h->pred8x8l_add [ HOR_PRED ]= pred8x8l_horizontal_add_c; - h->pred8x8_add [VERT_PRED8x8]= pred8x8_vertical_add_c; - h->pred8x8_add [ HOR_PRED8x8]= pred8x8_horizontal_add_c; - h->pred16x16_add[VERT_PRED8x8]= pred16x16_vertical_add_c; - h->pred16x16_add[ HOR_PRED8x8]= pred16x16_horizontal_add_c; - - if (HAVE_NEON) ff_h264_pred_init_arm(h); -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pred.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_pred.h Mon Aug 27 12:09:56 2012 +0200 
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 prediction functions. 
- * @author Michael Niedermayer - */ - -#ifndef AVCODEC_H264PRED_H -#define AVCODEC_H264PRED_H - -#include "libavutil/common.h" -#include "dsputil.h" - -/** - * Prediction types - */ -//@{ -#define VERT_PRED 0 -#define HOR_PRED 1 -#define DC_PRED 2 -#define DIAG_DOWN_LEFT_PRED 3 -#define DIAG_DOWN_RIGHT_PRED 4 -#define VERT_RIGHT_PRED 5 -#define HOR_DOWN_PRED 6 -#define VERT_LEFT_PRED 7 -#define HOR_UP_PRED 8 - -#define LEFT_DC_PRED 9 -#define TOP_DC_PRED 10 -#define DC_128_PRED 11 - -#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN 12 -#define HOR_UP_PRED_RV40_NODOWN 13 -#define VERT_LEFT_PRED_RV40_NODOWN 14 - -#define DC_PRED8x8 0 -#define HOR_PRED8x8 1 -#define VERT_PRED8x8 2 -#define PLANE_PRED8x8 3 - -#define LEFT_DC_PRED8x8 4 -#define TOP_DC_PRED8x8 5 -#define DC_128_PRED8x8 6 - -#define ALZHEIMER_DC_L0T_PRED8x8 7 -#define ALZHEIMER_DC_0LT_PRED8x8 8 -#define ALZHEIMER_DC_L00_PRED8x8 9 -#define ALZHEIMER_DC_0L0_PRED8x8 10 -//@} - -/** - * Context for storing H.264 prediction functions - */ -typedef struct H264PredContext{ - void (*pred4x4 [9+3+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp? 
- void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride); - void (*pred8x8 [4+3+4])(uint8_t *src, int stride); - void (*pred16x16[4+3])(uint8_t *src, int stride); - - void (*pred4x4_add [2])(uint8_t *pix/*align 4*/, const DCTELEM *block/*align 16*/, int stride); - void (*pred8x8l_add [2])(uint8_t *pix/*align 8*/, const DCTELEM *block/*align 16*/, int stride); - void (*pred8x8_add [3])(uint8_t *pix/*align 8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); - void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); -}H264PredContext; - -void ff_h264_pred_init(H264PredContext *h); -void ff_h264_pred_init_arm(H264PredContext *h); - - -#endif /* AVCODEC_H264PRED_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1013 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 direct mb/block decoding. 
- * @author Michael Niedermayer - */ - -#include "dsputil.h" -#include "avcodec.h" -#include "h264_data.h" -#include "h264.h" -#include "rectangle.h" - -//#undef NDEBUG -#include - -static const uint8_t left_block_options[4][16]={ - {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, - {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, - {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}, - {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8} -}; - - -// static void check_cache_copy(MBRecContext *mrc, H264Slice *s, H264Mb *m){ -// for (int list=0; list<2; list++){ -// for (int i=0; i<40; i++){ -// assert (m->ref_cache[list][i] == m->ref_cache_copy[list][i]); -// assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy[list][i][0]); -// assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy[list][i][1]); -// } -// } -// } - -// static void check_cache_copy2(MBRecContext *mrc, H264Slice *s, H264Mb *m){ -// for (int list=0; list<2; list++){ -// for (int i=0; i<40; i++){ -// assert (m->ref_cache[list][i] == m->ref_cache_copy2[list][i]); -// assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy2[list][i][0]); -// assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy2[list][i][1]); -// } -// } -// } - -static void fill_decode_caches_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){ - int topleft_type, top_type, topright_type, left_type; - const uint8_t * left_block= left_block_options[0]; - const int mb_x = m->mb_x; - int i; - - mrs->top_type = mrs->mb_type_top[mb_x ]; - mrs->left_type = mrs->mb_type [mb_x-1]; - - topleft_type = mrs->mb_type_top[mb_x-1]; - top_type = mrs->mb_type_top[mb_x ]; - topright_type= mrs->mb_type_top[mb_x+1]; - left_type = mrs->mb_type [mb_x-1]; - - int type_mask= s->pps.constrained_intra_pred ? 
1 : -1; - - if(!IS_SKIP(mb_type)){ -// memset(mrc->non_zero_count_cache, 0, sizeof(mrc->non_zero_count_cache)); - AV_COPY32(&mrs->non_zero_count_cache[4+8*1], &m->non_zero_count[ 0]); - AV_COPY32(&mrs->non_zero_count_cache[4+8*2], &m->non_zero_count[ 4]); - AV_COPY32(&mrs->non_zero_count_cache[4+8*3], &m->non_zero_count[ 8]); - AV_COPY32(&mrs->non_zero_count_cache[4+8*4], &m->non_zero_count[12]); - - for (int i=0; i<2; i++) { - mrs->non_zero_count_cache[8*1 + 8*i + 1] = m->non_zero_count[16 + i*2 ]; - mrs->non_zero_count_cache[8*1 + 8*i + 2] = m->non_zero_count[16 + i*2 +1]; - mrs->non_zero_count_cache[8*4 + 8*i + 1] = m->non_zero_count[20 + i*2 ]; - mrs->non_zero_count_cache[8*4 + 8*i + 2] = m->non_zero_count[20 + i*2 +1]; - } - - if(IS_INTRA(mb_type)){ -// memset(mrc->intra4x4_pred_mode_cache, 0, sizeof(mrc->intra4x4_pred_mode_cache)); - - mrs->topleft_samples_available= - mrs->top_samples_available= - mrs->left_samples_available= 0xFFFF; - mrs->topright_samples_available= 0xEEEA; - - if(!(top_type & type_mask)){ - mrs->topleft_samples_available= 0xB3FF; - mrs->top_samples_available= 0x33FF; - mrs->topright_samples_available= 0x26EA; - } - - if(!(left_type & type_mask)){ - mrs->topleft_samples_available&= 0xDF5F; - mrs->left_samples_available&= 0x5F5F; - } - - if(!(topleft_type & type_mask)) - mrs->topleft_samples_available&= 0x7FFF; - - if(!(topright_type & type_mask)) - mrs->topright_samples_available&= 0xFBFF; - - if(IS_INTRA4x4(mb_type)){ - if(IS_INTRA4x4(top_type)){ - AV_COPY32(mrs->intra4x4_pred_mode_cache+4+8*0, &mrs->intra4x4_pred_mode_top[4*mb_x]); - }else{ - mrs->intra4x4_pred_mode_cache[4+8*0]= - mrs->intra4x4_pred_mode_cache[5+8*0]= - mrs->intra4x4_pred_mode_cache[6+8*0]= - mrs->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask); - } - - if(IS_INTRA4x4(left_type)){ -#if OMPSS - mrs->intra4x4_pred_mode_cache[3+8*1]= m->intra4x4_pred_mode_left[0]; - mrs->intra4x4_pred_mode_cache[3+8*2]= m->intra4x4_pred_mode_left[1]; - 
mrs->intra4x4_pred_mode_cache[3+8*3]= m->intra4x4_pred_mode_left[2]; - mrs->intra4x4_pred_mode_cache[3+8*4]= m->intra4x4_pred_mode_left[3]; -#else - mrs->intra4x4_pred_mode_cache[3+8*1]= mrs->intra4x4_pred_mode_left[0]; - mrs->intra4x4_pred_mode_cache[3+8*2]= mrs->intra4x4_pred_mode_left[1]; - mrs->intra4x4_pred_mode_cache[3+8*3]= mrs->intra4x4_pred_mode_left[2]; - mrs->intra4x4_pred_mode_cache[3+8*4]= mrs->intra4x4_pred_mode_left[3]; -#endif - }else{ - mrs->intra4x4_pred_mode_cache[3+8*1]= - mrs->intra4x4_pred_mode_cache[3+8*2]= - mrs->intra4x4_pred_mode_cache[3+8*3]= - mrs->intra4x4_pred_mode_cache[3+8*4]= 2 - 3*!(left_type & type_mask); - } - } - } - } - - if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){ - int list; - -// memset(mrs->mv_cache, 0, sizeof(mrs->mv_cache)); -// memset(mrs->ref_cache, 0, sizeof(mrs->ref_cache)); - - mrs->ref_cache[0][scan8[5 ]+1] = mrs->ref_cache[0][scan8[7 ]+1] = mrs->ref_cache[0][scan8[13]+1] = - mrs->ref_cache[1][scan8[5 ]+1] = mrs->ref_cache[1][scan8[7 ]+1] = mrs->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE; - - for(list=0; listlist_count; list++){ - if(!USES_LIST(mb_type, list)){ - continue; - } - assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred)); - - if(USES_LIST(top_type, list)){ - const int b_xy= 4*mb_x + 3*mrc->b_stride; - AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]); - mrs->ref_cache[list][scan8[0] + 0 - 1*8]= - mrs->ref_cache[list][scan8[0] + 1 - 1*8]= mrs->ref_index_top[list][4*mb_x + 2]; - mrs->ref_cache[list][scan8[0] + 2 - 1*8]= - mrs->ref_cache[list][scan8[0] + 3 - 1*8]= mrs->ref_index_top[list][4*mb_x + 3]; - }else{ - AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]); - AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101); - } - - if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){ - for(i=0; i<2; i++){ - int cache_idx = scan8[0] - 1 + i*2*8; - if(USES_LIST(left_type, list)){ - const int b_xy= 4*(mb_x-1) + 3; - const int b8_x= 4*(mb_x-1) + 1; - AV_COPY32(mrs->mv_cache[list][cache_idx ], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[0+i*2]]); - AV_COPY32(mrs->mv_cache[list][cache_idx+8], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[1+i*2]]); - mrs->ref_cache[list][cache_idx ]= mrs->ref_index[list][b8_x + (left_block[0+i*2]&~1)]; - mrs->ref_cache[list][cache_idx+8]= mrs->ref_index[list][b8_x + (left_block[1+i*2]&~1)]; - }else{ - AV_ZERO32(mrs->mv_cache [list][cache_idx ]); - AV_ZERO32(mrs->mv_cache [list][cache_idx+8]); - mrs->ref_cache[list][cache_idx ]= - mrs->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE); - } - } - }else{ - if(USES_LIST(left_type, list)){ - const int b_x = 4*(mb_x-1) + 3; - const int b8_x= 4*(mb_x-1) + 1; - AV_COPY32(mrs->mv_cache[list][scan8[0] - 1], mrs->motion_val[list][b_x + mrc->b_stride*left_block[0]]); - mrs->ref_cache[list][scan8[0] - 1]= mrs->ref_index[list][b8_x + (left_block[0]&~1)]; - }else{ - AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1]); - mrs->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; - } - } - - if(USES_LIST(topright_type, list)){ - const int b_xy= 4*(mb_x+1) + 3*mrc->b_stride; - AV_COPY32(mrs->mv_cache[list][scan8[0] + 4 - 1*8], mrs->motion_val_top[list][b_xy]); - mrs->ref_cache[list][scan8[0] + 4 - 1*8]= mrs->ref_index_top[list][4*(mb_x+1) + 2]; - }else{ - AV_ZERO32(mrs->mv_cache [list][scan8[0] + 4 - 1*8]); - mrs->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE; - } - if(mrs->ref_cache[list][scan8[0] + 4 - 1*8] < 0){ - int topleft_partition= -1; - if(USES_LIST(topleft_type, list)){ - const int b_xy = 4*(mb_x-1) + 3 + mrc->b_stride + (topleft_partition & 2*mrc->b_stride); - const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2); - AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 - 1*8], mrs->motion_val_top[list][b_xy]); - mrs->ref_cache[list][scan8[0] - 1 - 1*8]= mrs->ref_index_top[list][b8_x]; - }else{ - AV_ZERO32(mrs->mv_cache[list][scan8[0] - 1 - 1*8]); - mrs->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; - } - } - - if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) - continue; - - if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) { - mrs->ref_cache[list][scan8[4 ]] = - mrs->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE; - AV_ZERO32(mrs->mv_cache [list][scan8[4 ]]); - AV_ZERO32(mrs->mv_cache [list][scan8[12]]); - } - } - } -} - -static inline void write_back_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){ - const int b_stride = mrc->b_stride; - const int b_x = 4*m->mb_x; //try mb2b(8)_xy - const int b8_x= 4*m->mb_x; - int list; - - if(!USES_LIST(mb_type, 0)) - fill_rectangle(&mrs->ref_index[0][b8_x], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1); - - for(list=0; listlist_count; list++){ - int y; - int16_t (*mv_dst)[2]; - int16_t (*mv_src)[2]; - - if(!USES_LIST(mb_type, list)) - continue; - - mv_dst = &mrs->motion_val[list][b_x]; - mv_src = &mrs->mv_cache[list][scan8[0]]; - for(y=0; y<4; y++){ - AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); - } - - { - int8_t *ref_index = &mrs->ref_index[list][b8_x]; - ref_index[0+0*2]= mrs->ref_cache[list][scan8[0]]; - ref_index[1+0*2]= mrs->ref_cache[list][scan8[4]]; - ref_index[0+1*2]= mrs->ref_cache[list][scan8[8]]; - ref_index[1+1*2]= mrs->ref_cache[list][scan8[12]]; - } - } -} - - -/** -* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the 
available blocks. -*/ -static int check_intra4x4_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){ - static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0}; - static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED}; - int i; - - if(!(mrs->top_samples_available&0x8000)){ - for(i=0; i<4; i++){ - int status= top[ mrs->intra4x4_pred_mode_cache[scan8[0] + i] ]; - if(status<0){ - av_log(AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y); - return -1; - } else if(status){ - mrs->intra4x4_pred_mode_cache[scan8[0] + i]= status; - } - } - } - - if((mrs->left_samples_available&0x8888)!=0x8888){ - static const int mask[4]={0x8000,0x2000,0x80,0x20}; - for(i=0; i<4; i++){ - if(!(mrs->left_samples_available&mask[i])){ - int status= left[ mrs->intra4x4_pred_mode_cache[scan8[0] + 8*i] ]; - if(status<0){ - av_log(AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y); - return -1; - } else if(status){ - mrs->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status; - } - } - } - } - return 0; -} - -/** -* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks. 
-*/ -static int check_intra_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mode){ - static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1}; - static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8}; - - if(mode > 6) { - av_log(AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y); - return -1; - } - - if(!(mrs->top_samples_available&0x8000)){ - mode= top[ mode ]; - if(mode<0){ - av_log(AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y); - return -1; - } - } - - if((mrs->left_samples_available&0x8080) != 0x8080){ - mode= left[ mode ]; - if(mrs->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred - mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(mrs->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8); - } - if(mode<0){ - av_log(AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y); - return -1; - } - } - return mode; -} - -/** - * gets the predicted intra4x4 prediction mode. 
- */ -static inline int pred_intra_mode(MBRecContext *mrc, MBRecState *mrs, int n){ - const int index8= scan8[n]; - const int left= mrs->intra4x4_pred_mode_cache[index8 - 1]; - const int top = mrs->intra4x4_pred_mode_cache[index8 - 8]; - const int min= FFMIN(left, top); - - if(min<0) return DC_PRED; - else return min; -} - -static void write_back_intra_pred_mode_rec(MBRecContext *mrc, MBRecState *mrs, H264Mb *m, int mb_x){ - int8_t *mode= &mrs->intra4x4_pred_mode[4*mb_x]; - - AV_COPY32(mode, mrs->intra4x4_pred_mode_cache + 4 + 8*4); -#if OMPSS - if (m->mb_x < mrc->mb_width-1){ - H264Mb *mr= m+1; - mode = mr->intra4x4_pred_mode_left; - mode[0]= mrs->intra4x4_pred_mode_cache[7+8*1]; - mode[1]= mrs->intra4x4_pred_mode_cache[7+8*2]; - mode[2]= mrs->intra4x4_pred_mode_cache[7+8*3]; - mode[3]= mrs->intra4x4_pred_mode_cache[7+8*4]; - } -#else - mode = mrs->intra4x4_pred_mode_left; - mode[0]= mrs->intra4x4_pred_mode_cache[7+8*1]; - mode[1]= mrs->intra4x4_pred_mode_cache[7+8*2]; - mode[2]= mrs->intra4x4_pred_mode_cache[7+8*3]; - mode[3]= mrs->intra4x4_pred_mode_cache[7+8*4]; -#endif -} - -static void pred_spatial_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){ - int b4_stride = mrc->b_stride; - const int mb_x = m->mb_x; - int mb_type_col[2]; - const int16_t (*l1mv0)[2], (*l1mv1)[2]; - const int8_t *l1ref0, *l1ref1; - const int is_b8x8 = IS_8X8(*mb_type); - unsigned int sub_mb_type= MB_TYPE_L0L1; - int i8, i4; - int ref[2]; - int mv[2]; - int list; - - //assert(h->ref_list[1][0].reference&3); - -#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM) - - /* ref = min(neighbors) */ - for(list=0; list<2; list++){ - int left_ref = mrs->ref_cache[list][scan8[0] - 1]; - int top_ref = mrs->ref_cache[list][scan8[0] - 8]; - int refc = mrs->ref_cache[list][scan8[0] - 8 + 4]; - const int16_t *C= mrs->mv_cache[list][ scan8[0] - 8 + 4]; - if(refc == PART_NOT_AVAILABLE){ - refc = 
mrs->ref_cache[list][scan8[0] - 8 - 1]; - C = mrs->mv_cache[list][scan8[0] - 8 - 1]; - } - ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); - if(ref[list] >= 0){ - //this is just pred_motion() but with the cases removed that cannot happen for direct blocks - const int16_t * const A= mrs->mv_cache[list][ scan8[0] - 1 ]; - const int16_t * const B= mrs->mv_cache[list][ scan8[0] - 8 ]; - - int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]); - if(match_count > 1){ //most common - mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]), - mid_pred(A[1], B[1], C[1]) ); - }else { - assert(match_count==1); - if(left_ref==ref[list]){ - mv[list]= AV_RN32A(A); - }else if(top_ref==ref[list]){ - mv[list]= AV_RN32A(B); - }else{ - mv[list]= AV_RN32A(C); - } - } - }else{ - int mask= ~(MB_TYPE_L0 << (2*list)); - mv[list] = 0; - ref[list] = -1; - if(!is_b8x8) - *mb_type &= mask; - sub_mb_type &= mask; - } - } - - if(ref[0] < 0 && ref[1] < 0){ - ref[0] = ref[1] = 0; - if(!is_b8x8) - *mb_type |= MB_TYPE_L0L1; - sub_mb_type |= MB_TYPE_L0L1; - } - - if(!(is_b8x8|mv[0]|mv[1])){ - fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); - fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); - fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4); - fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4); - *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; - return; - } - - mb_type_col[0] = - mb_type_col[1] = mrs->list1_mb_type[mb_x]; - - sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ - if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ - *mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */ - }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ - *mb_type |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); - }else{ - if(!s->direct_8x8_inference_flag){ - /* FIXME 
save sub mb types from previous frames (or derive from MVs) - * so we know exactly what block size to use */ - sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */ - } - *mb_type |= MB_TYPE_8x8; - } - - l1mv0 = (void *) &mrs->list1_motion_val[0][4*mb_x]; - l1mv1 = (void *) &mrs->list1_motion_val[1][4*mb_x]; - l1ref0 = &mrs->list1_ref_index [0][4*mb_x]; - l1ref1 = &mrs->list1_ref_index [1][4*mb_x]; -// if(!b8_stride){ -// if(m->mb_y&1){ -// l1ref0 += 2; -// l1ref1 += 2; -// l1mv0 += 2*b4_stride; -// l1mv1 += 2*b4_stride; -// } -// } - - if(IS_16X16(*mb_type)){ - int a,b; - - fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); - fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); - if(!IS_INTRA(mb_type_col[0]) && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1) - || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1 - ))){ - a=b=0; - if(ref[0] > 0) - a= mv[0]; - if(ref[1] > 0) - b= mv[1]; - }else{ - a= mv[0]; - b= mv[1]; - } - fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, a, 4); - fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, b, 4); - }else{ - int n=0; - for(i8=0; i8<4; i8++){ - const int x8 = i8&1; - const int y8 = i8>>1; - - if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) - continue; - m->sub_mb_type[i8] = sub_mb_type; - - fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4); - fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4); - fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1); - fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1); - - /* col_zero_flag */ - if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 )) - ){ - const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? 
l1mv0 : l1mv1; - if(IS_SUB_8X8(sub_mb_type)){ - const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; - if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ - if(ref[0] == 0) - fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); - if(ref[1] == 0) - fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); - n+=4; - } - }else{ - int k=0; - for(i4=0; i4<4; i4++){ - const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; - if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ - if(ref[0] == 0) - AV_ZERO32(mrs->mv_cache[0][scan8[i8*4+i4]]); - if(ref[1] == 0) - AV_ZERO32(mrs->mv_cache[1][scan8[i8*4+i4]]); - k++; - } - } - if(!(k&3)) - m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8; - n+=k; - } - } - } - if(!is_b8x8 && !(n&15)){ - *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; - } - } -} - -static void pred_temp_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){ - const int mb_x = m->mb_x; - int b4_stride = mrc->b_stride; - int mb_type_col[2]; - const int16_t (*l1mv0)[2], (*l1mv1)[2]; - const int8_t *l1ref0, *l1ref1; - const int is_b8x8 = IS_8X8(*mb_type); - unsigned int sub_mb_type; - int i8, i4; - const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]}; - const int *dist_scale_factor = s->dist_scale_factor; - - mb_type_col[0] = - mb_type_col[1] = mrs->list1_mb_type[mb_x]; - - sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ - if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ - *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */ - }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ - *mb_type |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); - }else{ - if(!s->direct_8x8_inference_flag){ - /* FIXME save sub mb types from previous frames (or derive from MVs) - * so we know 
exactly what block size to use */ - sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */ - } - *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1; - } - - l1mv0 = (void *) &mrs->list1_motion_val[0][4*mb_x]; - l1mv1 = (void *) &mrs->list1_motion_val[1][4*mb_x]; - l1ref0 = &mrs->list1_ref_index [0][4*mb_x]; - l1ref1 = &mrs->list1_ref_index [1][4*mb_x]; - - /* one-to-one mv scaling */ - if(IS_16X16(*mb_type)){ - int ref, mv0, mv1; - - fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1); - if(IS_INTRA(mb_type_col[0])){ - ref=mv0=mv1=0; - }else{ - const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]] - : map_col_to_list0[1][l1ref1[0]]; - const int scale = dist_scale_factor[ref0]; - const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0]; - int mv_l0[2]; - mv_l0[0] = (scale * mv_col[0] + 128) >> 8; - mv_l0[1] = (scale * mv_col[1] + 128) >> 8; - ref= ref0; - mv0= pack16to32(mv_l0[0],mv_l0[1]); - mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]); - } - fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1); - fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4); - fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4); - }else{ - for(i8=0; i8<4; i8++){ - const int x8 = i8&1; - const int y8 = i8>>1; - int ref0, scale; - const int16_t (*l1mv)[2]= l1mv0; - - if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) - continue; - m->sub_mb_type[i8] = sub_mb_type; - fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1); - if(IS_INTRA(mb_type_col[0])){ - fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1); - fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); - fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); - continue; - } - - ref0 = l1ref0[i8]; - if(ref0 >= 0) - ref0 = map_col_to_list0[0][ref0 ]; - else{ - ref0 = map_col_to_list0[1][l1ref1[i8]]; - l1mv= l1mv1; - } - scale = dist_scale_factor[ref0]; - - fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1); - 
if(IS_SUB_8X8(sub_mb_type)){ - const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; - int mx = (scale * mv_col[0] + 128) >> 8; - int my = (scale * mv_col[1] + 128) >> 8; - fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4); - fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4); - }else - for(i4=0; i4<4; i4++){ - const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; - int16_t *mv_l0 = mrs->mv_cache[0][scan8[i8*4+i4]]; - mv_l0[0] = (scale * mv_col[0] + 128) >> 8; - mv_l0[1] = (scale * mv_col[1] + 128) >> 8; - AV_WN32A(mrs->mv_cache[1][scan8[i8*4+i4]], - pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1])); - } - } - } -} - -void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){ - if(s->direct_spatial_mv_pred){ - pred_spatial_direct_motion_rec(mrc, mrs, s, m, mb_type); - }else{ - pred_temp_direct_motion_rec(mrc, mrs, s, m, mb_type); - } -} - -static inline int fetch_diagonal_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, const int16_t **C, int i, int list, int part_width){ - const int topright_ref= mrs->ref_cache[list][ i - 8 + part_width ]; - - if(topright_ref != PART_NOT_AVAILABLE){ - *C= mrs->mv_cache[list][ i - 8 + part_width ]; - return topright_ref; - }else{ - *C= mrs->mv_cache[list][ i - 8 - 1 ]; - return mrs->ref_cache[list][ i - 8 - 1 ]; - } -} - -/** - * gets the predicted MV. 
- * @param n the block index - * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4) - * @param mx the x component of the predicted motion vector - * @param my the y component of the predicted motion vector - */ -static inline void pred_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int part_width, int list, int ref, int * const mx, int * const my){ - const int index8= scan8[n]; - const int top_ref= mrs->ref_cache[list][ index8 - 8 ]; - const int left_ref= mrs->ref_cache[list][ index8 - 1 ]; - const int16_t * const A= mrs->mv_cache[list][ index8 - 1 ]; - const int16_t * const B= mrs->mv_cache[list][ index8 - 8 ]; - const int16_t * C; - int diagonal_ref, match_count; - - assert(part_width==1 || part_width==2 || part_width==4); - -/* mv_cache - B . . A T T T T - U . . L . . , . - U . . L . . . . - U . . L . . , . - . . . L . . . . -*/ - - diagonal_ref= fetch_diagonal_mv(mrc, mrs, s, &C, index8, list, part_width); - match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref); - - if(match_count > 1){ //most common - *mx= mid_pred(A[0], B[0], C[0]); - *my= mid_pred(A[1], B[1], C[1]); - }else if(match_count==1){ - if(left_ref==ref){ - *mx= A[0]; - *my= A[1]; - }else if(top_ref==ref){ - *mx= B[0]; - *my= B[1]; - }else{ - *mx= C[0]; - *my= C[1]; - } - }else{ - if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){ - *mx= A[0]; - *my= A[1]; - }else{ - *mx= mid_pred(A[0], B[0], C[0]); - *my= mid_pred(A[1], B[1], C[1]); - } - } - -} - -/** - * gets the directionally predicted 16x8 MV. 
- * @param n the block index - * @param mx the x component of the predicted motion vector - * @param my the y component of the predicted motion vector - */ -static inline void pred_16x8_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){ - if(n==0){ - const int top_ref= mrs->ref_cache[list][ scan8[0] - 8 ]; - const int16_t * const B= mrs->mv_cache[list][ scan8[0] - 8 ]; - - if(top_ref == ref){ - *mx= B[0]; - *my= B[1]; - return; - } - }else{ - const int left_ref= mrs->ref_cache[list][ scan8[8] - 1 ]; - const int16_t * const A= mrs->mv_cache[list][ scan8[8] - 1 ]; - - if(left_ref == ref){ - *mx= A[0]; - *my= A[1]; - return; - } - } - - //RARE - pred_motion(mrc, mrs, s, n, 4, list, ref, mx, my); -} - -/** - * gets the directionally predicted 8x16 MV. - * @param n the block index - * @param mx the x component of the predicted motion vector - * @param my the y component of the predicted motion vector - */ -static inline void pred_8x16_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){ - if(n==0){ - const int left_ref= mrs->ref_cache[list][ scan8[0] - 1 ]; - const int16_t * const A= mrs->mv_cache[list][ scan8[0] - 1 ]; - - if(left_ref == ref){ - *mx= A[0]; - *my= A[1]; - return; - } - }else{ - const int16_t * C; - int diagonal_ref; - - diagonal_ref= fetch_diagonal_mv(mrc, mrs, s, &C, scan8[4], list, 2); - if(diagonal_ref == ref){ - *mx= C[0]; - *my= C[1]; - return; - } - } - - //RARE - pred_motion(mrc, mrs, s, n, 2, list, ref, mx, my); -} - -static inline void pred_pskip_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb * m, int * const mx, int * const my){ - const int top_ref = mrs->ref_cache[0][ scan8[0] - 8 ]; - const int left_ref= mrs->ref_cache[0][ scan8[0] - 1 ]; - - if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE - || !( top_ref | AV_RN32A(mrs->mv_cache[0][ scan8[0] - 8 ])) - || !(left_ref | 
AV_RN32A(mrs->mv_cache[0][ scan8[0] - 1 ]))){ - - *mx = *my = 0; - return; - } - - pred_motion(mrc, mrs, s, 0, 4, 0, 0, mx, my); - - return; -} - -#define ADD_MVD(list) \ -{ \ - mx += m->mvd[list][mp][0]; \ - my += m->mvd[list][mp][1]; \ - mp++; \ -} - -int pred_motion_mb_rec (MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){ - int mp=0; - int mb_type = m->mb_type; - const int mb_x = m->mb_x; - -// mrc->m =m; - - fill_decode_caches_rec(mrc, mrs, s, m, mb_type); - if (IS_SKIP(mb_type)){ - mb_type=0; - - if( s->slice_type_nos == FF_B_TYPE ) - { - mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP; - ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type); - } - else - { - int mx, my; - - mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; //FIXME check required - pred_pskip_motion(mrc, mrs, s, m, &mx, &my); - fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); - fill_rectangle(mrs->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4); - } - - write_back_motion_rec(mrc, mrs, s, m, mb_type); - m->mb_type = mrs->mb_type[mb_x]= mb_type; - return 0; - } - - - if (IS_INTRA_PCM(mb_type)){ - mrs->mb_type[mb_x] = mb_type; - return 0; - } - else if (IS_INTRA(mb_type)){ - int i, pred_mode; - - if( IS_INTRA4x4( mb_type ) ) { - if ( IS_8x8DCT(mb_type) ) { - for( i = 0; i < 16; i+=4 ) { - int pred = pred_intra_mode(mrc, mrs, i ); - int mode = m->intra4x4_pred_mode[i]; - - mode = mode < 0 ? pred : mode + ( mode >= pred ); - fill_rectangle( &mrs->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 ); - } - } else { - for( i = 0; i < 16; i++ ) { - int pred = pred_intra_mode(mrc, mrs, i ); - int mode = m->intra4x4_pred_mode[i]; - mode = mode < 0 ? 
pred : mode + ( mode >= pred ); - mrs->intra4x4_pred_mode_cache[ scan8[i] ] = mode; - } - } - write_back_intra_pred_mode_rec(mrc, mrs, m, mb_x); - if( check_intra4x4_pred_mode(mrc, mrs, s, m) < 0 ) return -1; - } else { - m->intra16x16_pred_mode= check_intra_pred_mode(mrc, mrs, s, m, m->intra16x16_pred_mode ); - if( m->intra16x16_pred_mode < 0 ) return -1; - } - - pred_mode = m->chroma_pred_mode; - pred_mode= check_intra_pred_mode( mrc, mrs, s, m, pred_mode ); - if( pred_mode < 0 ) return -1; - m->chroma_pred_mode= pred_mode; - - } - else if (IS_8X8(mb_type)){ - int i, j, list; - - if( s->slice_type_nos == FF_B_TYPE ) { - if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] | - m->sub_mb_type[2] | m->sub_mb_type[3]) ) { - ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type); - mrs->ref_cache[0][scan8[4]] = - mrs->ref_cache[1][scan8[4]] = - mrs->ref_cache[0][scan8[12]] = - mrs->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE; - } - } - - for(list=0; listlist_count; list++){ - for(i=0; i<4; i++){ - if(IS_DIRECT(m->sub_mb_type[i])){ - mrs->ref_cache[list][ scan8[4*i] ]=mrs->ref_cache[list][ scan8[4*i]+1 ]; - continue; - } else { - mrs->ref_cache[list][ scan8[4*i] ]=mrs->ref_cache[list][ scan8[4*i]+1 ]= - mrs->ref_cache[list][ scan8[4*i]+8 ]=mrs->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i]; - - if(IS_DIR(m->sub_mb_type[i], 0, list) ){ - const int sub_mb_type= m->sub_mb_type[i]; - const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1; - - int sub_partition_count = IS_SUB_8X8(sub_mb_type) ? 1 : (IS_SUB_4X4(sub_mb_type)? 
4 :2); - for(j=0; jmv_cache[list][ scan8[index]]; - pred_motion(mrc, mrs, s, index, block_width, list, mrs->ref_cache[list][ scan8[index] ], &mx, &my); - - ADD_MVD(list) - - if(IS_SUB_8X8(sub_mb_type)){ - mv_cache[ 1 ][0]= - mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx; - mv_cache[ 1 ][1]= - mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my; - }else if(IS_SUB_8X4(sub_mb_type)){ - mv_cache[ 1 ][0]= mx; - mv_cache[ 1 ][1]= my; - }else if(IS_SUB_4X8(sub_mb_type)){ - mv_cache[ 8 ][0]= mx; - mv_cache[ 8 ][1]= my; - } - mv_cache[ 0 ][0]= mx; - mv_cache[ 0 ][1]= my; - } - }else{ - fill_rectangle(mrs->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4); - } - } - } - } - } else if( IS_DIRECT(mb_type) ) { - mb_type &= ~MB_TYPE_16x16; //FIXME not nice - ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type); - } - else { - int list, i; - if(IS_16X16(mb_type)){ - for(list=0; listlist_count; list++){ - if(IS_DIR(mb_type, 0, list)){ - int ref; - int mx,my; - - ref = m->ref_index[list][0]; - fill_rectangle(&mrs->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1); - pred_motion(mrc, mrs, s, 0, 4, list, mrs->ref_cache[list][ scan8[0] ], &mx, &my); - ADD_MVD(list) - fill_rectangle(mrs->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4); - } - } - } - else if(IS_16X8(mb_type)){ - for(list=0; listlist_count; list++){ - for(i=0; i<2; i++){ - if(IS_DIR(mb_type, i, list)){ - int ref; - int mx,my; - ref = m->ref_index[list][i]; - fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1); - - pred_16x8_motion(mrc, mrs, s, 8*i, list, mrs->ref_cache[list][scan8[0] + 16*i], &mx, &my); - ADD_MVD(list) - - fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4); - }else{ - fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1); - fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4); - } - } - } - - }else{ - assert(IS_8X16(mb_type)); - - for(list=0; listlist_count; list++){ - for(i=0; i<2; i++){ - 
if(IS_DIR(mb_type, i, list)){ //FIXME optimize - int ref; - int mx,my; - ref = m->ref_index[list][i]; - fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1); - pred_8x16_motion(mrc, mrs, s, i*4, list, mrs->ref_cache[list][ scan8[0] + 2*i ], &mx, &my); - ADD_MVD(list) - fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4); - }else{ - fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1); - fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4); - } - } - } - } - } - - if (IS_INTER(mb_type)||(IS_DIRECT(mb_type))) - write_back_motion_rec(mrc, mrs, s, m, mb_type); - m->mb_type = mrs->mb_type[mb_x]= mb_type; - - return 0; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -#ifndef H264_DIRECT_H -#define H264_DIRECT_H - -#include "h264_types.h" - -void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int *mb_type); -int pred_motion_mb_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m); - - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_ps.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_ps.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,462 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... parameter set decoding - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 parameter set decoding. - * @author Michael Niedermayer - */ - -#include "dsputil.h" -#include "avcodec.h" -#include "h264_types.h" -#include "h264_data.h" -#include "golomb.h" - - -//#undef NDEBUG -#include - -static const int pixel_aspect[17][2]={ - {0, 1}, - {1, 1}, - {12, 11}, - {10, 11}, - {16, 11}, - {40, 33}, - {24, 11}, - {20, 11}, - {32, 11}, - {80, 33}, - {18, 11}, - {15, 11}, - {64, 33}, - {160,99}, - {4, 3}, - {3, 2}, - {2, 1}, -}; - -const uint8_t ff_h264_chroma_qp[52]={ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, - 12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27, - 28,29,29,30,31,32,32,33,34,34,35,35,36,36,37,37, - 37,38,38,38,39,39,39,39 -}; - -static const uint8_t default_scaling4[2][16]={ -{ 6,13,20,28, - 13,20,28,32, - 20,28,32,37, - 28,32,37,42 -},{ - 10,14,20,24, - 14,20,24,27, - 20,24,27,30, - 24,27,30,34 -}}; - -static const uint8_t default_scaling8[2][64]={ -{ 6,10,13,16,18,23,25,27, - 10,11,16,18,23,25,27,29, - 13,16,18,23,25,27,29,31, - 16,18,23,25,27,29,31,33, - 18,23,25,27,29,31,33,36, - 23,25,27,29,31,33,36,38, - 25,27,29,31,33,36,38,40, - 27,29,31,33,36,38,40,42 -},{ - 9,13,15,17,19,21,22,24, - 13,13,17,19,21,22,24,25, - 15,17,19,21,22,24,25,27, - 17,19,21,22,24,25,27,28, - 19,21,22,24,25,27,28,30, - 21,22,24,25,27,28,30,32, - 22,24,25,27,28,30,32,33, - 24,25,27,28,30,32,33,35 -}}; - -static inline int decode_hrd_parameters(GetBitContext *gb, SPS *sps){ - int cpb_count, i; - cpb_count = get_ue_golomb_31(gb) + 1; - - 
if(cpb_count > 32){ - av_log(AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count); - return -1; - } - - get_bits(gb, 4); /* bit_rate_scale */ - get_bits(gb, 4); /* cpb_size_scale */ - for(i=0; iinitial_cpb_removal_delay_length = get_bits(gb, 5) + 1; - sps->cpb_removal_delay_length = get_bits(gb, 5) + 1; - sps->dpb_output_delay_length = get_bits(gb, 5) + 1; - sps->time_offset_length = get_bits(gb, 5); - sps->cpb_cnt = cpb_count; - return 0; -} - -static inline int decode_vui_parameters(GetBitContext *gb, SPS *sps){ - int aspect_ratio_info_present_flag; - unsigned int aspect_ratio_idc; - - aspect_ratio_info_present_flag= get_bits1(gb); - - if( aspect_ratio_info_present_flag ) { - aspect_ratio_idc= get_bits(gb, 8); - if( aspect_ratio_idc == EXTENDED_SAR ) { - sps->num= get_bits(gb, 16); - sps->den= get_bits(gb, 16); - }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(int[2])){ - //sps->sar= pixel_aspect[aspect_ratio_idc]; - }else{ - av_log( AV_LOG_ERROR, "illegal aspect ratio idc %d\n", aspect_ratio_idc); - // return -1; - } - }else{ - sps->num= - sps->den= 0; - } - - if(get_bits1(gb)){ /* overscan_info_present_flag */ - get_bits1(gb); /* overscan_appropriate_flag */ - } - - sps->video_signal_type_present_flag = get_bits1(gb); - if(sps->video_signal_type_present_flag){ - get_bits(gb, 3); /* video_format */ - sps->full_range = get_bits1(gb); /* video_full_range_flag */ - - sps->colour_description_present_flag = get_bits1(gb); - if(sps->colour_description_present_flag){ - sps->color_primaries = get_bits(gb, 8); /* colour_primaries */ - sps->color_trc = get_bits(gb, 8); /* transfer_characteristics */ - sps->colorspace = get_bits(gb, 8); /* matrix_coefficients */ - if (sps->color_primaries >= AVCOL_PRI_NB) - sps->color_primaries = AVCOL_PRI_UNSPECIFIED; - if (sps->color_trc >= AVCOL_TRC_NB) - sps->color_trc = AVCOL_TRC_UNSPECIFIED; - if (sps->colorspace >= AVCOL_SPC_NB) - sps->colorspace = AVCOL_SPC_UNSPECIFIED; - } - } - - if(get_bits1(gb)){ /* 
chroma_location_info_present_flag */ - av_log(AV_LOG_ERROR, "chroma_location_info_present_flag found, but not supported\n"); - (void) (get_ue_golomb(gb)+1); /* chroma_sample_location_type_top_field */ - (void) get_ue_golomb(gb); /* chroma_sample_location_type_bottom_field */ - } - - sps->timing_info_present_flag = get_bits1(gb); - if(sps->timing_info_present_flag){ - sps->num_units_in_tick = get_bits_long(gb, 32); - sps->time_scale = get_bits_long(gb, 32); - if(!sps->num_units_in_tick || !sps->time_scale){ - av_log(AV_LOG_ERROR, "time_scale/num_units_in_tick invalid or unsupported (%d/%d)\n", sps->time_scale, sps->num_units_in_tick); - return -1; - } - sps->fixed_frame_rate_flag = get_bits1(gb); - } - - sps->nal_hrd_parameters_present_flag = get_bits1(gb); - if(sps->nal_hrd_parameters_present_flag) - if(decode_hrd_parameters(gb, sps) < 0) - return -1; - sps->vcl_hrd_parameters_present_flag = get_bits1(gb); - if(sps->vcl_hrd_parameters_present_flag) - if(decode_hrd_parameters(gb, sps) < 0) - return -1; - if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag) - get_bits1(gb); /* low_delay_hrd_flag */ - sps->pic_struct_present_flag = get_bits1(gb); - - sps->bitstream_restriction_flag = get_bits1(gb); - if(sps->bitstream_restriction_flag){ - get_bits1(gb); /* motion_vectors_over_pic_boundaries_flag */ - get_ue_golomb(gb); /* max_bytes_per_pic_denom */ - get_ue_golomb(gb); /* max_bits_per_mb_denom */ - get_ue_golomb(gb); /* log2_max_mv_length_horizontal */ - get_ue_golomb(gb); /* log2_max_mv_length_vertical */ - sps->num_reorder_frames= get_ue_golomb(gb); - get_ue_golomb(gb); /*max_dec_frame_buffering*/ - - if(sps->num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){ - av_log(AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames); - return -1; - } - } - - return 0; -} - -static void decode_scaling_list(GetBitContext *gb, uint8_t *factors, int size, const uint8_t *jvt_list, const uint8_t 
*fallback_list){ - int i, last = 8, next = 8; - const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct; - if(!get_bits1(gb)) /* matrix not written, we use the predicted one */ - memcpy(factors, fallback_list, size*sizeof(uint8_t)); - else - for(i=0;iscaling_matrix_present; - const uint8_t *fallback[4] = { - fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0], - fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1], - fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0], - fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1] - }; - if(get_bits1(gb)){ - sps->scaling_matrix_present |= is_sps; - decode_scaling_list(gb, scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y - decode_scaling_list(gb, scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr - decode_scaling_list(gb, scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb - decode_scaling_list(gb, scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y - decode_scaling_list(gb, scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr - decode_scaling_list(gb, scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb - if(is_sps || pps->transform_8x8_mode){ - decode_scaling_list(gb, scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y - decode_scaling_list(gb, scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y - } - } -} - -int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb){ - int profile_idc, level_idc; - unsigned int sps_id; - int i; - SPS *sps; - - profile_idc= get_bits(gb, 8); - get_bits1(gb); //constraint_set0_flag - get_bits1(gb); //constraint_set1_flag - get_bits1(gb); //constraint_set2_flag - get_bits1(gb); //constraint_set3_flag - get_bits(gb, 4); // reserved - level_idc= get_bits(gb, 8); - sps_id= get_ue_golomb_31(gb); - - if(sps_id >= MAX_SPS_COUNT) { - av_log(AV_LOG_ERROR, "sps_id (%d) out of range\n", 
sps_id); - return -1; - } - if (!n->sps_buffers[sps_id]) - n->sps_buffers[sps_id]= av_mallocz(sizeof(SPS)); - - sps = n->sps_buffers[sps_id]; - if(sps == NULL) - return -1; - - sps->profile_idc= profile_idc; - sps->level_idc= level_idc; - - memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4)); - memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8)); - sps->scaling_matrix_present = 0; - - if(sps->profile_idc >= 100){ //high profile - sps->chroma_format_idc= get_ue_golomb_31(gb); - if(sps->chroma_format_idc == 3) - sps->residual_color_transform_flag = get_bits1(gb); - sps->bit_depth_luma = get_ue_golomb(gb) + 8; - sps->bit_depth_chroma = get_ue_golomb(gb) + 8; - sps->transform_bypass = get_bits1(gb); - decode_scaling_matrices(gb, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8); - }else{ - sps->chroma_format_idc= 1; - sps->bit_depth_luma = 8; - sps->bit_depth_chroma = 8; - } - - sps->log2_max_frame_num= get_ue_golomb(gb) + 4; - sps->poc_type= get_ue_golomb_31(gb); - - if(sps->poc_type == 0){ //FIXME #define - sps->log2_max_poc_lsb= get_ue_golomb(gb) + 4; - } else if(sps->poc_type == 1){//FIXME #define - sps->delta_pic_order_always_zero_flag= get_bits1(gb); - sps->offset_for_non_ref_pic= get_se_golomb(gb); - sps->offset_for_top_to_bottom_field= get_se_golomb(gb); - sps->poc_cycle_length = get_ue_golomb(gb); - - if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){ - av_log(AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length); - goto fail; - } - - for(i=0; ipoc_cycle_length; i++) - sps->offset_for_ref_frame[i]= get_se_golomb(gb); - }else if(sps->poc_type != 2){ - av_log(AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type); - goto fail; - } - - sps->ref_frame_count= get_ue_golomb_31(gb); - if(sps->ref_frame_count >= 32){ - av_log(AV_LOG_ERROR, "too many reference frames\n"); - goto fail; - } - sps->gaps_in_frame_num_allowed_flag= get_bits1(gb); - sps->mb_width = get_ue_golomb(gb) + 1; - 
sps->mb_height= get_ue_golomb(gb) + 1; - - - sps->frame_mbs_only_flag= get_bits1(gb); - if(!sps->frame_mbs_only_flag){ - av_log(AV_LOG_ERROR, "MBAFF support not included\n"); - get_bits1(gb); - }else - sps->mb_aff= 0; - - sps->direct_8x8_inference_flag= get_bits1(gb); - if(!sps->frame_mbs_only_flag && !sps->direct_8x8_inference_flag){ - av_log(AV_LOG_ERROR, "This stream was generated by a broken encoder, invalid 8x8 inference\n"); - goto fail; - } - - sps->crop= get_bits1(gb); - if(sps->crop){ - sps->crop_left = get_ue_golomb(gb); - sps->crop_right = get_ue_golomb(gb); - sps->crop_top = get_ue_golomb(gb); - sps->crop_bottom= get_ue_golomb(gb); - if(sps->crop_left || sps->crop_top){ - av_log( AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n"); - } - if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){ - av_log( AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n"); - } - }else { - - sps->crop_left = - sps->crop_right = - sps->crop_top = - sps->crop_bottom= 0; - } - - sps->vui_parameters_present_flag= get_bits1(gb); - if( sps->vui_parameters_present_flag ) - if (decode_vui_parameters(gb, sps) < 0) - goto fail; - - - n->sps = *sps; - - if( sps->bitstream_restriction_flag){ - n->has_b_frames = sps->num_reorder_frames; - } - else - n->has_b_frames= MAX_DELAYED_PIC_COUNT; - - return 0; -fail: - av_free(sps); - return -1; -} - -static void -build_qp_table(PPS *pps, int t, int index) -{ - int i; - for(i = 0; i < 52; i++) - pps->chroma_qp_table[t][i] = ff_h264_chroma_qp[av_clip(i + index, 0, 51)]; -} - -int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length){ - unsigned int pps_id= get_ue_golomb(gb); - PPS *pps; - - if(pps_id >= MAX_PPS_COUNT) { - av_log(AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id); - return -1; - } - if (!n->pps_buffers[pps_id]) - n->pps_buffers[pps_id]= av_mallocz(sizeof(PPS)); - pps = n->pps_buffers[pps_id]; 
- if(pps == NULL) - return -1; - pps->sps_id= get_ue_golomb_31(gb); - if((unsigned)pps->sps_id>=MAX_SPS_COUNT || n->sps_buffers[pps->sps_id] == NULL){ - av_log(AV_LOG_ERROR, "sps_id out of range\n"); - goto fail; - } - - pps->cabac= get_bits1(gb); - pps->pic_order_present= get_bits1(gb); - if(pps->pic_order_present){ - av_log(AV_LOG_ERROR, "no interlaces support\n"); - } - pps->slice_group_count= get_ue_golomb(gb) + 1; - if(pps->slice_group_count > 1 ){ - pps->mb_slice_group_map_type= get_ue_golomb(gb); - av_log(AV_LOG_ERROR, "multiple slices not supported\n"); - } - pps->ref_count[0]= get_ue_golomb(gb) + 1; - pps->ref_count[1]= get_ue_golomb(gb) + 1; - if(pps->ref_count[0]> 32 || pps->ref_count[1]> 32){ - av_log(AV_LOG_ERROR, "reference overflow (pps)\n"); - goto fail; - } - - pps->weighted_pred= get_bits1(gb); - pps->weighted_bipred_idc= get_bits(gb, 2); - pps->init_qp= get_se_golomb(gb) + 26; - pps->init_qs= get_se_golomb(gb) + 26; - pps->chroma_qp_index_offset[0]= get_se_golomb(gb); - pps->deblocking_filter_parameters_present= get_bits1(gb); - pps->constrained_intra_pred= get_bits1(gb); - pps->redundant_pic_cnt_present = get_bits1(gb); - - pps->transform_8x8_mode= 0; - memcpy(pps->scaling_matrix4, n->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4)); - memcpy(pps->scaling_matrix8, n->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8)); - - if(get_bits_count(gb) < bit_length){ - pps->transform_8x8_mode= get_bits1(gb); - decode_scaling_matrices(gb, n->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8); - pps->chroma_qp_index_offset[1]= get_se_golomb(gb); //second_chroma_qp_index_offset - } else { - pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0]; - } - - build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]); - build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]); - if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) - pps->chroma_qp_diff= 1; - - return 0; 
-fail: - av_free(pps); - return -1; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_ps.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_ps.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -#ifndef H264_PS_H -#define H264_PS_H - -#include "h264_types.h" - -int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb); -int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pthread.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_pthread.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,604 +0,0 @@ -#include "config.h" - -#include "h264_types.h" -#include "h264_parser.h" -#include "h264_nal.h" -#include "h264_entropy.h" -#include "h264_rec.h" -#include "h264_misc.h" -// #undef NDEBUG -#include -#include - -#define XOANON 1 - -#ifdef XOANON -static int ed_rec_affinity[40] = { 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, - 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, - 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, - 3, 7, 11, 15, 19, 23, 27, 31, 35, 39 }; -static int ed_rec_smt_aff[80] = { 0, 40, 4, 44, 8, 48, 12, 52, 16, 56, 20, 60, 24, 64, 28, 68, 32, 72, 36, 76, - 1, 41, 5, 45, 9, 49, 13, 53, 17, 57, 21, 61, 25, 65, 29, 69, 33, 73, 37, 77, - 2, 42, 6, 46, 10, 50, 14, 54, 18, 58, 22, 62, 26, 66, 30, 70, 34, 74, 38, 78, - 3, 43, 7, 47, 11, 51, 15, 55, 19, 59, 23, 63, 27, 67, 31, 71, 35, 75, 39, 79 }; -#else -static int ed_rec_affinity[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; -static int ed_rec_smt_aff[20] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, }; -#endif - -static int frames=0; - -static void notify_one_worker(H264Context *h){ - pthread_mutex_lock(&h->task_lock); - pthread_cond_signal(&h->task_cond); - pthread_mutex_unlock(&h->task_lock); -} - -static void notify_all_workers(H264Context *h){ - pthread_mutex_lock(&h->task_lock); - pthread_cond_broadcast(&h->task_cond); - 
pthread_mutex_unlock(&h->task_lock); -} - -static void push_sbe (SliceBufferQueue *sbq, SliceBufferEntry *sbe, int notify ){ - pthread_mutex_lock(&sbq->lock); - while (sbq->cnt >= sbq->size) - pthread_cond_wait(&sbq->cond, &sbq->lock); - sbq->queue[sbq->fi] = sbe; - sbq->cnt++; - sbq->fi++; sbq->fi %= sbq->size; - if (notify) - pthread_cond_signal(&sbq->cond); - pthread_mutex_unlock(&sbq->lock); -} - -static SliceBufferEntry* pop_sbe (SliceBufferQueue *sbq, int block){ - SliceBufferEntry *sbe=NULL; - - pthread_mutex_lock(&sbq->lock); - if (block){ - while (sbq->cnt <= 0) - pthread_cond_wait(&sbq->cond, &sbq->lock); - }else { - if (sbq->cnt <= 0) - goto nonblock; - } - sbe = sbq->queue[sbq->fo]; - sbq->cnt--; - sbq->fo++; sbq->fo %= sbq->size; - pthread_cond_signal(&sbq->cond); -nonblock: - pthread_mutex_unlock(&sbq->lock); - - return sbe; -} - -// static void push_rle (RingLineQueue *rlq, SliceBufferEntry *sbe, int line, int notify){ -// -// //check for free slots -// pthread_mutex_lock(&rlq->wslock); -// while (rlq->free <= 0){ -// pthread_cond_wait(&rlq->wscond, &rlq->wslock); -// } -// //free slot is available, decrement one in this lock -// rlq->free--; -// pthread_mutex_unlock(&rlq->wslock); -// -// pthread_mutex_lock(&rlq->swlock); -// rlq->queue[rlq->fi]->sbe=sbe; -// rlq->queue[rlq->fi]->line=line; -// rlq->queue[rlq->fi]->mb_cnt=0; -// rlq->fi++; rlq->fi %= rlq->size; -// rlq->ready++; -// if(notify) -// pthread_cond_signal(&rlq->swcond); -// pthread_mutex_unlock(&rlq->swlock); -// } - -// static RingLineEntry* pop_rle (RingLineQueue *rlq, int block){ -// RingLineEntry *rle=NULL; -// -// pthread_mutex_lock(&rlq->swlock); -// if (block){ -// while (rlq->ready <= 0) -// pthread_cond_wait(&rlq->swcond, &rlq->swlock); -// }else { -// if (rlq->ready <= 0) -// goto nonblock; -// } -// rle = rlq->queue[rlq->fo]; -// rlq->fo++; rlq->fo %= rlq->size; -// rlq->ready--; -// nonblock: -// pthread_mutex_unlock(&rlq->swlock); -// -// return rle; -// } -// -// static 
void rel_rle (RingLineQueue *rlq){ -// pthread_mutex_lock(&rlq->wslock); -// rlq->free++; -// pthread_cond_signal(&rlq->wscond); -// pthread_mutex_unlock(&rlq->wslock); -// } - -static RingLineEntry* pop_rle (SliceBufferQueue *sbq, RingLineQueue *rlq, int *has_token){ - RingLineEntry *rle=NULL; - SliceBufferEntry *sbe=NULL; - int line=-1; - - pthread_mutex_lock(&sbq->lock); - if (sbq->cnt <= 0) - goto unlock; - sbe = sbq->queue[sbq->fo]; - line = sbe->lines_taken; - - - pthread_mutex_lock(&rlq->swlock); - if (!*has_token){ - if (rlq->free <= 0) - goto unlock2; - rlq->free--; - *has_token=1; - } - rle = rlq->queue[rlq->fo]; - rlq->fo++; rlq->fo %= rlq->size; - rle->sbe=sbe; - rle->line = line; - rle->mb_cnt =0; - if (++sbe->lines_taken >= sbe->lines_total){ - sbq->cnt--; - sbq->fo++; sbq->fo %= sbq->size; - pthread_cond_signal(&sbq->cond); - } -unlock2: - pthread_mutex_unlock(&rlq->swlock); -unlock: - pthread_mutex_unlock(&sbq->lock); - - - return rle; -} - -static void rel_rle (RingLineQueue *rlq, int *rec_token){ - pthread_mutex_lock(&rlq->swlock); - rlq->free++; - *rec_token=0; -// pthread_cond_signal(&rlq->swcond); - pthread_mutex_unlock(&rlq->swlock); - -} - -//get either a entropy or a line reconstruct task -static void pop_next_task(H264Context *h, SliceBufferEntry **psbe, RingLineEntry **prle, int *rec_token){ - - pthread_mutex_lock(&h->task_lock); - - for(;;){ - if ( (*psbe = pop_sbe(&h->sb_q[ENTROPY], 0)) ){ - if (*rec_token){ - rel_rle(&h->rl_q, rec_token); - pthread_cond_signal(&h->task_cond); - } - break; - } - else if ( (*prle = pop_rle(&h->sb_q[MBDEC], &h->rl_q, rec_token)) ) - break; - pthread_cond_wait(&h->task_cond, &h->task_lock); - } - - pthread_mutex_unlock(&h->task_lock); -} - -void *parse_thread(void *arg){ - H264Context *h = (H264Context *) arg; - ParserContext *pc = get_parse_context(h->ifile); - NalContext *nc = get_nal_context(h->width, h->height); - H264Slice *s; - SliceBufferEntry *sbe = NULL; - - while(!pc->final_frame && frames++ 
num_frames && !h->quit){ - sbe = get_sb_entry(h); - - av_read_frame_internal(pc, &sbe->gb); - s = &sbe->slice; - - decode_nal_units(nc, s, &sbe->gb); - - push_sbe(&h->sb_q[ENTROPY], sbe, 0); - notify_one_worker(h); - } - - if (!h->no_mbd){ - sbe = get_sb_entry(h); - sbe->state=-1; - sbe->slice.coded_pic_num=nc->coded_pic_num; - sbe->lines_total=h->threads; - - push_sbe(&h->sb_q[REORDER], sbe, 1); - }else{ - for (int i=0; ithreads; i++){ - sbe = get_sb_entry(h); - sbe->state=-1; - push_sbe(&h->sb_q[ENTROPY], sbe, 1); - notify_one_worker(h); - } - } - free_nal_context(nc); - free_parse_context(pc); - - pthread_exit(NULL); - return NULL; -} - -int decode_slice_entropy(EntropyContext *ec, SliceBufferEntry *sbe){ - int i,j; - H264Slice *s = &sbe->slice; - GetBitContext *gb = &sbe->gb; - CABACContext *c = &ec->c; - H264Mb *mbs = sbe->mbs; - - if( !s->pps.cabac ){ - av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); - return -1; - } - - init_dequant_tables(s, ec); - ec->curr_qscale = s->qscale; - ec->last_qscale_diff = 0; - ec->chroma_qp[0] = get_chroma_qp( s, 0, s->qscale); - ec->chroma_qp[1] = get_chroma_qp( s, 1, s->qscale); - - /* realign */ - align_get_bits( gb ); - /* init cabac */ - ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8); - - ff_h264_init_cabac_states(ec, s, c); - - for(j=0; jmb_height; j++){ - init_entropy_buf(ec, s, j); - for(i=0; imb_width; i++){ - int eos,ret; - H264Mb *m = &mbs[i + j*ec->mb_width]; - //memset(m, 0, sizeof(H264Mb)); - m->mb_x=i; - m->mb_y=j; - ec->m = m; - - ret = ff_h264_decode_mb_cabac(ec, s, c); - eos = get_cabac_terminate( c); (void) eos; - - if( ret < 0 || c->bytestream > c->bytestream_end + 2) { - av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream); - return -1; - } - } - } - - return 0; -} - -static int decode_slice_mb(MBRecContext *d, RingLineEntry *rle, int frames){ - SliceBufferEntry *sbe= 
rle->sbe; - H264Slice *s = &sbe->slice; - H264Mb *mbs = sbe->mbs; - - int mb_width= d->mb_width; - int i; - const int line = rle->line; - - init_mbrec_context(d, d->mrs, s, line); - - H264Mb *m = &mbs[line*mb_width]; - d->top=rle->prev_line->top; - d->top_next=rle->top; - -// assert(rle->mb_cnt ==0); - for(i=0; i< mb_width; i++){ - if (frames || line>0){ - while (rle->mb_cnt >= rle->prev_line->mb_cnt -1); - } - h264_decode_mb_internal( d, d->mrs, s, &m[i]); - rle->mb_cnt++; - } - draw_edges(d, s, line); - - return 0; -} - -// static int decode_slice_mb_static(MBRecContext *d, H264Slice *s, RLThreadContext *r, RLThreadContext *rp, int frames){ -// int mb_height= d->mb_height; -// int mb_width= d->mb_width; -// int thread_num = r->thread_num; -// int thread_total = r->thread_total; -// int i; -// int j = thread_num; -// -// r->mb_cnt=frames* mb_height*mb_width; -// for(; jmbs[j*mb_width]; -// for(i=0; i< mb_width; i++){ -// if (j>0){ -// while (r->mb_cnt- (thread_num? 0:mb_width) >= rp->mb_cnt-1); -// } -// h264_decode_mb_internal(d, s, m++); -// r->mb_cnt++; -// } -// draw_edges(d, s, j); -// } -// return 0; -// } - -static void *ed_rec_thread(void *arg){ - H264Context *h = (H264Context*) arg; - EntropyContext *ec=NULL; - MBRecContext *mrc=NULL; - - RingLineEntry *rle=NULL; - SliceBufferEntry *sbe=NULL; - H264Slice *s; - int rec_token=0; - - if (!h->no_mbd){ - mrc = get_mbrec_context(h); - } - ec = get_entropy_context(h); - - for(;;){ - pop_next_task(h, &sbe, &rle, &rec_token); - if (sbe){ - if (h->no_mbd && sbe->state<0){ - break; - } - if (!sbe->initialized){ - init_sb_entry(h, sbe); - } - decode_slice_entropy(ec, sbe); - - if (h->no_mbd){ - release_sb_entry(h, sbe); - sbe=NULL; - } else { - push_sbe(&h->sb_q[REORDER], sbe, 1); - } - } else if (rle){ - if (rle->sbe->state<0) - break; - s = &rle->sbe->slice; - - decode_slice_mb(mrc, rle, s->coded_pic_num); - - if (rle->line == h->mb_height-1){ - push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1); - } - rle->mb_cnt++; - } - } 
- - //make sure threads quit in order of rle assignment - if (!h->no_mbd){ - while (rle->prev_line->mb_cnt <= h->mb_width); - rel_rle(&h->rl_q, &rec_token); - notify_one_worker(h); - rle->mb_cnt = h->mb_width +1; - if (rle->line == h->threads-1){ - push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1); - } - - free_mbrec_context(mrc); - } - - free_entropy_context(ec); - - pthread_exit(NULL); - return NULL; -} - -static void *reorder_thread(void *arg){ - H264Context *h = (H264Context *) arg; - int i; - SliceBufferEntry *reorder[h->sb_size]; - SliceBufferEntry *sbe, *next_sbe; - H264Slice *s; - int reorder_cnt=0; - unsigned next_pic_num=0; - - for(;;){ - - sbe = pop_sbe(&h->sb_q[REORDER], 1); - - s = &sbe->slice; - for(i=reorder_cnt; i>0; i--){ - if (s->coded_pic_num < reorder[i-1]->slice.coded_pic_num) - break; - reorder[i]=reorder[i-1]; - } - reorder[i]=sbe; - - while(reorder_cnt>=0){ - if (next_pic_num!=reorder[reorder_cnt]->slice.coded_pic_num){ - break; - } - next_sbe = reorder[reorder_cnt]; - H264Slice *es = &next_sbe->slice; - - if (next_sbe->state<0) - goto end; - - for (int i=0; i<2; i++){ - for(int j=0; j< es->ref_count[i]; j++){ - if (es->ref_list_cpn[i][j] ==-1) - continue; - int k; - for (k=0; kmax_dpb_cnt; k++){ - if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == es->ref_list_cpn[i][j]){ - es->dp_ref_list[i][j] = &h->dpb[k]; - break; - } - } - } - } - next_sbe->dp = get_dpb_entry(h, es); - - push_sbe(&h->sb_q[MBDEC], next_sbe, 0); - notify_all_workers(h); - -// for (int i=0; i< h->mb_height; i++){ -// push_rle(&h->rl_q, next_sbe, i, 0); -// notify_one_worker(h); -// } - - - next_pic_num++; - reorder_cnt--; - } - reorder_cnt++; - } - -end: - { - push_sbe(&h->sb_q[MBDEC], next_sbe, 0); - notify_all_workers(h); - if (h->no_mbd){ - push_sbe(&h->sb_q[OUTPUT], next_sbe, 1); - } -// for (int i=0; i< h->threads; i++){ -// push_rle(&h->rl_q, next_sbe, i, 0); -// notify_one_worker(h); -// } - } - - pthread_exit(NULL); - return NULL; -} - -void create_ed_rec_threads(H264Context 
*h){ - cpu_set_t cpuset; - int* aff; - - if (h->setaff){ - aff = h->smt ? ed_rec_smt_aff : ed_rec_affinity ; - for (int i=0; ithreads; i++){ - pthread_attr_init(&h->ed_rec_attr[i]); - CPU_ZERO(&cpuset); - CPU_SET(aff[i], &cpuset); - pthread_attr_setaffinity_np(&h->ed_rec_attr[i], sizeof(cpu_set_t), &cpuset); - pthread_create(&h->ed_rec_thr[i], &h->ed_rec_attr[i], ed_rec_thread, h); - } - } else { - for (int i=0; ithreads; i++){ - pthread_create(&h->ed_rec_thr[i], NULL, ed_rec_thread, h); - } - } -} - -void join_ed_rec_threads(H264Context *h){ - for (int i=0; i< h->threads; i++){ - pthread_join(h->ed_rec_thr[i], NULL); - } -} - -void *output_thread(void *arg){ - H264Context *h = (H264Context *) arg; - - OutputContext *oc = get_output_context( h ); - - SliceBufferEntry *sbe = NULL; - H264Slice *s=NULL; - for(;;) { - DecodedPicture *out, *dp; - sbe = pop_sbe(&h->sb_q[OUTPUT], 1); - - if (sbe->state <0) - break; - - s = &sbe->slice; - for (int i=0; irelease_cnt; i++){ - for(int j=0; jmax_dpb_cnt; j++){ - if(h->dpb[j].cpn== s->release_ref_cpn[i]){ - release_dpb_entry(h, &h->dpb[j], 2); - break; - } - } - } - - dp=sbe->dp; - release_sb_entry(h, sbe); - - out =output_frame(h, oc, dp, h->ofile, h->frame_width, h->frame_height); - if (out){ - release_dpb_entry(h, out, 1); - } - - print_report(oc->frame_number, oc->video_size, 0, h->verbose); - - } - /* at the end of stream, we must flush the decoder buffers */ - while (output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height)); - print_report(oc->frame_number, oc->video_size, 1, h->verbose); - - free_output_context(oc); - - pthread_exit(NULL); - return NULL; -} - -/* -* The following code is the main loop of the file converter -*/ -int h264_decode_pthread(H264Context *h) { - pthread_t parse_thr, reorder_thr, output_thr; - - av_start_timer(); - - pthread_create(&parse_thr, NULL, parse_thread, h); - if (!h->no_mbd){ - pthread_create(&reorder_thr, NULL, reorder_thread, h); - pthread_create(&output_thr, NULL, 
output_thread, h); - } -#if HAVE_LIBSDL2 - pthread_t sdl_thr; - if (h->display){ - pthread_create(&sdl_thr, NULL, sdl_thread, h); - } -#endif - create_ed_rec_threads(h); - - - if (h->rl_side_touch){ - pthread_mutex_lock(&h->ilock); - while (h->init_threads< h->threads) - pthread_cond_wait(&h->icond, &h->ilock); - pthread_mutex_unlock(&h->ilock); - - pthread_mutex_lock(&h->tlock); - h->touch_start =1; - pthread_cond_broadcast(&h->tcond); - pthread_mutex_unlock(&h->tlock); - - pthread_mutex_lock(&h->tdlock); - while (h->touch_done < h->threads) - pthread_cond_wait(&h->tdcond, &h->tdlock); - pthread_mutex_unlock(&h->tdlock); - - pthread_mutex_lock(&h->slock); - h->start =1; - pthread_cond_broadcast(&h->scond); - pthread_mutex_unlock(&h->slock); - } - join_ed_rec_threads(h); - pthread_join(parse_thr, NULL); - if (!h->no_mbd){ - pthread_join(reorder_thr, NULL); - pthread_join(output_thr, NULL); - } -#if HAVE_LIBSDL2 - if (h->display) - signal_sdl_exit(h); - pthread_join(sdl_thr, NULL); -#endif - - - return 0; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_pthread.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_pthread.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -#ifndef H264_PTHREAD_H -#define H264_PTHREAD_H - -#include "h264_types.h" - -int decode_B_slice_entropy(EntropyContext *ec, EDSlice *s, EDThreadContext *eb, EDThreadContext *eb_prev); -int decode_slice_entropy(EntropyContext *hc, EDSlice *s); - -void *read_thread(void *arg); -void *parsenal_thread(void *arg); -void *mbrec_thread(void *arg); -void *write_thread(void *arg); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_rec.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_rec.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,412 +0,0 @@ -#include "config.h" - -#include "dsputil.h" -#include "h264_types.h" -#include "h264_data.h" -#include "h264_mc.h" -#include "h264_deblock.h" 
-#include "h264_pred_mode.h" -//#undef NDEBUG -#include - -void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line){ - DecodedPicture *pic = s->curr_pic; - int mb_stride = mrc->mb_stride; - int mb_width = mrc->mb_width; - mrs->mb_type_top = pic->mb_type + (line -1)*mb_stride; - mrs->mb_type = pic->mb_type + line*mb_stride; - mrs->ref_index_top[0] = pic->ref_index[0] + 4*(line -1)*mb_stride; - mrs->ref_index_top[1] = pic->ref_index[1] + 4*(line -1)*mb_stride; - mrs->ref_index[0] = pic->ref_index[0] + 4*line*mb_stride; - mrs->ref_index[1] = pic->ref_index[1] + 4*line*mb_stride; - - mrs->motion_val_top[0] = pic->motion_val[0] + 4*mb_width*4*(line-1); - mrs->motion_val_top[1] = pic->motion_val[1] + 4*mb_width*4*(line-1); - mrs->motion_val[0] = pic->motion_val[0] + 4*mb_width*4*line; - mrs->motion_val[1] = pic->motion_val[1] + 4*mb_width*4*line; - - mrs->intra4x4_pred_mode_top = pic->intra4x4_pred_mode + 4*mb_width*(line-1); - mrs->intra4x4_pred_mode = pic->intra4x4_pred_mode + 4*mb_width*line; - - mrs->non_zero_count_top = pic->non_zero_count + 8*mb_width*(line-1); - mrs->non_zero_count = pic->non_zero_count + 8*mb_width*line; - - if (s->slice_type_nos == FF_B_TYPE){ - mrs->list1_mb_type = s->dp_ref_list[1][0]->mb_type + line*mb_stride; - mrs->list1_ref_index[0] = s->dp_ref_list[1][0]->ref_index[0] + 4*line*mb_stride; - mrs->list1_ref_index[1] = s->dp_ref_list[1][0]->ref_index[1] + 4*line*mb_stride; - mrs->list1_motion_val[0] = s->dp_ref_list[1][0]->motion_val[0] + 4*mb_width*4*line; - mrs->list1_motion_val[1] = s->dp_ref_list[1][0]->motion_val[1] + 4*mb_width*4*line; - } - -} - -#if OMPSS -static void backup_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ - int i; - uint8_t * top_border_y1 = m->top_border; - uint8_t * top_border_y2 = m->top_border + 8; - uint8_t * top_border_cb = m->top_border + 16; - uint8_t * top_border_cr = m->top_border + 24; - uint8_t * top_border_next = 
m->top_border_next; - - src_y -= linesize; - src_cb -= uvlinesize; - src_cr -= uvlinesize; - - m->left_border[0]= m->top_border[15]; - for(i=1; i<17 ; i++){ - m->left_border[i]= src_y[15 + i*linesize]; - } - - *(uint64_t*)(top_border_y1) = *(uint64_t*)(src_y + 16*linesize); - *(uint64_t*)(top_border_next) = *(uint64_t*)(src_y + 16*linesize); - *(uint64_t*)(top_border_y2) = *(uint64_t*)(src_y +8+16*linesize); - - m->left_border[17]= m->top_border[16+7]; - m->left_border[17+9]= m->top_border[24+7]; - for(i=1; i<9; i++){ - m->left_border[17 +i]= src_cb[7+i*uvlinesize]; - m->left_border[17+9+i]= src_cr[7+i*uvlinesize]; - } - *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize); - *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize); -} - -static void xchg_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ - int temp8, i; - uint64_t temp64; - - uint8_t * top_border_y1 = m->top_border; - uint8_t * top_border_y2 = m->top_border + 8; - uint8_t * top_border_cb = m->top_border + 16; - uint8_t * top_border_cr = m->top_border + 24; - uint8_t * top_border_next = m->top_border_next; - - int deblock_left; - int deblock_top; - - deblock_left = (m->mb_x > 0); - deblock_top = (m->mb_y > 0); - - src_y -= ( linesize + 1); - src_cb -= (uvlinesize + 1); - src_cr -= (uvlinesize + 1); - - #define XCHG(a,b,t,xchg)\ - t= a;\ - if(xchg)\ - a= b;\ - b= t; - - if(deblock_left){ - for(i = !deblock_top; i<16; i++){ - XCHG(m->left_border[i], src_y [i* linesize], temp8, xchg); - } - XCHG(m->left_border[i], src_y [i* linesize], temp8, 1); - - for(i = !deblock_top; i<8; i++){ - XCHG(m->left_border[17 +i], src_cb[i*uvlinesize], temp8, xchg); - XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, xchg); - } - XCHG(m->left_border[17 +i], src_cb[i*uvlinesize], temp8, 1); - XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, 1); - } - - if(deblock_top){ - XCHG(*(uint64_t*)(top_border_y1) , *(uint64_t*)(src_y +1), 
temp64, xchg); - XCHG(*(uint64_t*)(top_border_y2) , *(uint64_t*)(src_y +9), temp64, 1); - XCHG(*(uint64_t*)(top_border_next), *(uint64_t*)(src_y +17), temp64, 1); - - XCHG(*(uint64_t*)(top_border_cb) , *(uint64_t*)(src_cb+1), temp64, 1); - XCHG(*(uint64_t*)(top_border_cr) , *(uint64_t*)(src_cr+1), temp64, 1); - } -} -#else - -static void backup_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ - int i; - uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y; - uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb; - uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr; - - uint8_t* left_border_y = d->left.unfiltered_y; - uint8_t* left_border_cb = d->left.unfiltered_cb; - uint8_t* left_border_cr = d->left.unfiltered_cr; - - src_y -= linesize; - src_cb -= uvlinesize; - src_cr -= uvlinesize; - - // There are two lines saved, the line above the top macroblock of a pair, - // and the line above the bottom macroblock - left_border_y[0] = top_border_y[15]; - for(i=1; i<17; i++){ - left_border_y[i] = src_y[15+i* linesize]; - } - *(uint64_t*)(top_border_y ) = *(uint64_t*)(src_y + 16*linesize); - *(uint64_t*)(top_border_y +8) = *(uint64_t*)(src_y +8+16*linesize); - - left_border_cb[0] = top_border_cb[7]; - left_border_cr[0] = top_border_cr[7]; - for(i=1; i<9; i++){ - left_border_cb[i] = src_cb[7+i*uvlinesize]; - left_border_cr[i] = src_cr[7+i*uvlinesize]; - } - *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize); - *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize); -} - -static void xchg_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ - - int temp8, i; - uint64_t temp64; - int deblock_left; - int deblock_top; - - uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y; - uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb; - uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr; - uint8_t* 
top_border_y_next = d->top[m->mb_x +1].unfiltered_y; - - uint8_t* left_border_y = d->left.unfiltered_y; - uint8_t* left_border_cb = d->left.unfiltered_cb; - uint8_t* left_border_cr = d->left.unfiltered_cr; - - deblock_left = (m->mb_x > 0); - deblock_top = (m->mb_y > 0); - - src_y -= ( linesize + 1); - src_cb -= (uvlinesize + 1); - src_cr -= (uvlinesize + 1); - - #define XCHG(a,b,t,xchg)\ - t= a;\ - if(xchg)\ - a= b;\ - b= t; - - if(deblock_left){ - for(i = !deblock_top; i<16; i++){ - XCHG(left_border_y[i], src_y [i* linesize], temp8, xchg); - } - XCHG(left_border_y[i], src_y [i* linesize], temp8, 1); - - for(i = !deblock_top; i<8; i++){ - XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg); - XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg); - } - XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1); - XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1); - } - - if(deblock_top){ - XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg); - XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1); - if(m->mb_x+1 < d->mb_width){ - XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1); - } - XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1); - XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1); - } -} - -#endif - -void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m){ - int i; - const int mb_x= m->mb_x; - const int mb_y= m->mb_y; - int *block_offset = d->block_offset; - - void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); - void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); - - int linesize = d->linesize; - int uvlinesize = d->uvlinesize; - - uint8_t *dest_y = s->curr_pic->data[0] + (mb_x + mb_y * linesize ) * 16; - uint8_t *dest_cb = s->curr_pic->data[1] + (mb_x + mb_y * uvlinesize) * 8; - uint8_t *dest_cr = s->curr_pic->data[2] + (mb_x + mb_y * uvlinesize) * 8; - - pred_motion_mb_rec (d, mrs, s, m); - - 
const int mb_type= m->mb_type; - - d->dsp.prefetch(dest_y + (m->mb_x&3)*4*linesize + 64, d->linesize, 4); - d->dsp.prefetch(dest_cb + (m->mb_x&7)*uvlinesize + 64, dest_cr - dest_cb, 2); - - if(IS_INTRA(mb_type)){ -#if OMPSS - xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1); -#else - xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1); -#endif - - d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cb, uvlinesize); - d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cr, uvlinesize); - - if(IS_INTRA4x4(mb_type)){ - if(IS_8x8DCT(mb_type)){ - idct_dc_add = d->hdsp.h264_idct8_dc_add; - idct_add = d->hdsp.h264_idct8_add; - - for(i=0; i<16; i+=4){ - uint8_t * const ptr= dest_y + block_offset[i]; - const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ]; - - const int nnz = mrs->non_zero_count_cache[ scan8[i] ]; - d->hpc.pred8x8l[ dir ](ptr, (mrs->topleft_samples_available<topright_samples_available<mb[i*16]) - idct_dc_add(ptr, m->mb + i*16, linesize); - else - idct_add (ptr, m->mb + i*16, linesize); - } - } - }else{ - idct_dc_add = d->hdsp.h264_idct_dc_add; - idct_add = d->hdsp.h264_idct_add; - - for(i=0; i<16; i++){ - uint8_t * const ptr= dest_y + block_offset[i]; - const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ]; - uint8_t *topright; - int nnz, tr; - if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ - const int topright_avail= (mrs->topright_samples_available<hpc.pred4x4[ dir ](ptr, topright, linesize); - nnz = mrs->non_zero_count_cache[ scan8[i] ]; - if(nnz){ - if(nnz == 1 && m->mb[i*16]) - idct_dc_add(ptr, m->mb + i*16, linesize); - else - idct_add (ptr, m->mb + i*16, linesize); - } - } - } - }else{ - d->hpc.pred16x16[ m->intra16x16_pred_mode ](dest_y , linesize); - } -#if OMPSS - xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0); -#else - xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0); -#endif - }else { - hl_motion(d, mrs, s, m, dest_y, dest_cb, dest_cr, - d->hdsp.qpel_put, 
d->dsp.put_h264_chroma_pixels_tab, - d->hdsp.qpel_avg, d->dsp.avg_h264_chroma_pixels_tab, - d->hdsp.weight_h264_pixels_tab, d->hdsp.biweight_h264_pixels_tab); - } - - if(!IS_INTRA4x4(mb_type)){ - - if(IS_INTRA16x16(mb_type)){ - - d->hdsp.h264_idct_add16intra(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache); - - }else if(m->cbp&15){ - - if(IS_8x8DCT(mb_type)){ - d->hdsp.h264_idct8_add4(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache); - }else{ - d->hdsp.h264_idct_add16(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache); - } - } - } - - if(m->cbp&0x30){ - uint8_t *dest[2] = {dest_cb, dest_cr}; - - idct_add = d->hdsp.h264_idct_add; - idct_dc_add = d->hdsp.h264_idct_dc_add; - for(i=16; i<16+8; i++){ - if(mrs->non_zero_count_cache[ scan8[i] ]) - idct_add (dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize); - else if(m->mb[i*16]) - idct_dc_add(dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize); - } - } - -#if OMPSS - backup_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize); - if (mb_x+1 mb_width){ - H264Mb *mr = m+1; - memcpy(mr->left_border, m->left_border, sizeof(m->left_border)); - } - if (mb_y +1 mb_height){ - H264Mb *md = m + d->mb_width; - memcpy(md->top_border, m->top_border, sizeof(m->top_border)); - if (mb_x>0){ - H264Mb *mdl = m + d->mb_width -1; - memcpy(mdl->top_border_next, m->top_border_next, sizeof(m->top_border_next)); - } - } -#else - backup_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize); - if (mb_y +1 mb_height && d->top_next != d->top){ - memcpy(&d->top_next[mb_x],&d->top[mb_x], sizeof(TopBorder)); - } -#endif - - ff_h264_filter_mb(d, mrs, s, m, dest_y, dest_cb, dest_cr); -} - -MBRecContext *get_mbrec_context(H264Context *h){ - MBRecContext *d = av_mallocz(sizeof(MBRecContext)); - - ff_h264dsp_init(&d->hdsp); - ff_h264_pred_init(&d->hpc); - dsputil_init(&d->dsp); - -#if !OMPSS - d->mrs = av_mallocz(sizeof(MBRecState)); -#endif - d->hdsp.qpel_put= 
d->dsp.put_h264_qpel_pixels_tab; - d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab; - d->mb_height = h->mb_height; - d->mb_width = h->mb_width; - d->mb_stride = h->mb_stride; - d->b_stride = h->b_stride; - d->height = h->height; - d->width = h->width; - d->linesize = h->width + EDGE_WIDTH*2; - d->uvlinesize = d->linesize>>1; - - d->scratchpad_y = av_malloc(d->linesize*16*sizeof(uint8_t)); - d->scratchpad_cb= av_malloc(d->uvlinesize*8*sizeof(uint8_t)); - d->scratchpad_cr= av_malloc(d->uvlinesize*8*sizeof(uint8_t)); - - for (int i=0; i<16; i++){ - d->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*d->linesize*((scan8[i] - scan8[0])>>3); - } - for (int i=0; i<4; i++){ - d->block_offset[16+i]= - d->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*d->uvlinesize*((scan8[i] - scan8[0])>>3); - } - - - - return d; -} - -void free_mbrec_context(MBRecContext *d){ -#if !OMPSS - av_free(d->mrs); -#endif - av_free(d->scratchpad_y); - av_free(d->scratchpad_cb); - av_free(d->scratchpad_cr); - av_free(d); -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_rec.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_rec.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ -#ifndef H264_REC_H -#define H264_REC_H - -#include "h264_types.h" - -MBRecContext *get_mbrec_context(H264Context *h); -void free_mbrec_context( MBRecContext *d); -void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m); - -void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_refs.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_refs.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,461 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... reference picture handling - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 reference picture handling. - * @author Michael Niedermayer - */ - -#include "dsputil.h" -#include "h264_types.h" -#include "golomb.h" - -//#undef NDEBUG -#include - -static int build_def_list(PictureInfo **def, PictureInfo **in, int len, int is_long){ - int i[2]={0}; - int index=0; - - while(i[0]reference))) - i[0]++; - while(i[1]reference & 0))) - i[1]++; - if(i[0] < len){ - in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num; - def[index++]= in[ i[0]++ ]; - } - if(i[1] < len){ - in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num; - def[index++]= in[ i[1]++ ]; - } - } - - return index; -} - -static int add_sorted(PictureInfo **sorted, PictureInfo **src, int len, int limit, int dir){ - int i, best_poc; - int out_i= 0; - - for(;;){ - best_poc= dir ? INT_MIN : INT_MAX; - - for(i=0; ipoc; - if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){ - best_poc= poc; - sorted[out_i]= src[i]; - } - } - if(best_poc == (dir ? 
INT_MIN : INT_MAX)) - break; - limit= sorted[out_i++]->poc - dir; - } - return out_i; -} - -int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s){ - int i,len; - - if(s->slice_type_nos==FF_B_TYPE){ - PictureInfo *sorted[32]; - int cur_poc, list; - int lens[2]; - - cur_poc= s->poc; - - for(list= 0; list<2; list++){ - len= add_sorted(sorted, n->short_ref, n->short_ref_count, cur_poc, !list); - len+=add_sorted(sorted+len, n->short_ref, n->short_ref_count, cur_poc, list); - assert(len<=32); - len= build_def_list(s->ref_list[list], sorted, len, 0); - len+=build_def_list(s->ref_list[list] +len, n->long_ref, 16 , 1); - assert(len<=32); - - for(int i=len; iref_count[list]; i++) - s->ref_list[list][i] = NULL; - - lens[list]= len; - } - - if(lens[0] == lens[1] && lens[1] > 1){ - for(i=0; s->ref_list[0][i]->poc == s->ref_list[1][i]->poc && iref_list[1][0], s->ref_list[1][1]); - } - }else{ - len = build_def_list(s->ref_list[0], n->short_ref, n->short_ref_count, 0); - len+= build_def_list(s->ref_list[0] +len, n->long_ref, 16, 1); - assert(len <= 32); - for(i=len; iref_count[0]; i++) - s->ref_list[0][i] = NULL; - } - - return 0; -} - -/** -* print short term list -*/ -static void print_short_term(NalContext *n) { - av_log(AV_LOG_DEBUG, "short term list:\n"); - for(int i=0; ishort_ref_count; i++){ - PictureInfo *pic= n->short_ref[i]; - av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d ref:%d \n", i, pic->frame_num, pic->poc, pic->reference); - } -} - -/** -* print long term list -*/ -static void print_long_term(NalContext *n) { - uint32_t i; - - av_log(AV_LOG_DEBUG, "long term list:\n"); - for(i = 0; i < 16; i++){ - PictureInfo *pic= n->long_ref[i]; - if (pic) { - av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d\n", i, pic->frame_num, pic->poc); - } - } -} - -int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb){ - int list, index; - - print_short_term(n); - print_long_term(n); - - for(list=0; listlist_count; list++){ - - if(get_bits1(gb)){ - int frame_num 
= n->frame_num; - unsigned int abs_diff_pic_num; - for(index=0; ; index++){ - unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(gb); - int i=0; - PictureInfo *ref = NULL; - - if(reordering_of_pic_nums_idc==3){ - break; - } - if(index >= s->ref_count[list]){ - av_log(AV_LOG_ERROR, "reference count overflow\n"); - return -1; - } - - if (reordering_of_pic_nums_idc>2){ - av_log(AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n"); - return -1; - } - - if (reordering_of_pic_nums_idc<2){ - //av_log(AV_LOG_ERROR, "long term pic not supported\n"); - - abs_diff_pic_num= get_ue_golomb(gb) + 1; - if(abs_diff_pic_num > (unsigned) n->max_pic_num){ - av_log(AV_LOG_ERROR, "abs_diff_pic_num overflow\n"); - return -1; - } - - if(reordering_of_pic_nums_idc == 0) - frame_num-= abs_diff_pic_num; - else - frame_num+= abs_diff_pic_num; - frame_num &= n->max_pic_num - 1; - - for(i= 0 ; ishort_ref_count; i++){ - ref = n->short_ref[i]; - if(ref->frame_num == frame_num && ref->reference){ - break; - } - } - ref->pic_id= frame_num; - }else{ - int long_idx; - long_idx= get_ue_golomb(gb); //long_term_pic_idx - - if(long_idx>31){ - av_log(AV_LOG_ERROR, "long_term_pic_idx overflow\n"); - return -1; - } - ref = n->long_ref[long_idx]; - assert(!(ref && !ref->reference)); - if(ref && (ref->reference)){ - ref->pic_id= long_idx; - assert(ref->long_ref); - }else{ - av_log(AV_LOG_ERROR, "reference picture missing during reorder\n"); - } - } - - if (i >= n->short_ref_count) { - av_log(AV_LOG_ERROR, "reference picture missing during reorder\n"); - return -1; - } else { - for(i=index; i+1 ref_count[list]; i++){ - -// if(ref->frame_num == s->ref_list[list][i]->frame_num) -// break; - ///there is probably no need for a separate pic_id and frame_num - if (s->ref_list[list][i]){ - - if(ref->long_ref == s->ref_list[list][i]->long_ref && ref->pic_id == s->ref_list[list][i]->pic_id) - break; - } - } - for(; i > index; i--){ - s->ref_list[list][i]= s->ref_list[list][i-1]; - } - s->ref_list[list][index]= 
ref; - } - } - } - } - -// //Check if everything went well -// for(list=0; listlist_count; list++){ -// //printf("ref_count %d list %d\n", s->ref_count[list], list); -// for(index= 0; index < s->ref_count[list]; index++){ -// //printf("%d\n", s->ref_list[list][index]->pic_id); -// if(!s->ref_list[list][index]->data[0]){ -// av_log(AV_LOG_ERROR, "Missing reference picture\n"); -// return -1; -// } -// } -// } - - return 0; -} - -static PictureInfo *find_short(NalContext *n, int frame_num){ - int i; - for(i=0; ishort_ref_count; i++){ - if(n->short_ref[i]->frame_num == frame_num) { - return n->short_ref[i]; - } - } - return NULL; -} - -static int remove_short(NalContext *n, H264Slice *s, int frame_num, int release){ - int i; - - for (i=0; ishort_ref_count; i++){ - if (n->short_ref[i]->frame_num == frame_num){ - if (release){ - s->release_ref_cpn[s->release_cnt++] = n->short_ref[i]->cpn; - n->short_ref[i]->reference &= ~2; - } - n->short_ref[i] = NULL; - if (--n->short_ref_count) - memmove(&n->short_ref[i], &n->short_ref[i+1], (n->short_ref_count - i)*sizeof(PictureInfo *)); - return 0; - } - } - return -1; -} - -static void remove_long(NalContext *n, H264Slice *s, int i){ - - if (n->long_ref[i]){ - s->release_ref_cpn[s->release_cnt++] = n->long_ref[i]->cpn; - n->long_ref[i]->reference &= ~2; - n->long_ref[i]->long_ref = 0; - n->long_ref_count--; - n->long_ref[i] = NULL; - } -} - -void ff_h264_remove_all_refs(NalContext *n, H264Slice *s){ - int i; - - while (n->short_ref[0]) - remove_short(n, s, n->short_ref[0]->frame_num, 1); - - for(i=0; i<16; i++){ - remove_long(n, s, i); - } - assert(n->short_ref_count==0); - assert(n->long_ref_count==0); -} - -int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb){ - - if(s->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields - get_bits1(gb); //get_bits1(gb) -1; //broken link - if(get_bits1(gb)){ - av_log(AV_LOG_ERROR, "MMCO_LONG reference management not supported\n"); - } - }else{ - if(get_bits1(gb)){ // 
adaptive_ref_pic_marking_mode_flag - int i,j; - for(i= 0; iframe_num - get_ue_golomb(gb) - 1) & (n->max_pic_num - 1); - } - if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){ - long_arg= get_ue_golomb_31(gb); - if(long_arg >= 16){ - av_log(AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode); - return -1; - } - } - - if(opcode > (unsigned)MMCO_LONG){ - av_log(AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode); - return -1; - } - if(opcode == MMCO_END) - break; - - switch (opcode){ - case MMCO_SHORT2UNUSED: - remove_short(n, s, short_pic_num, 1); - break; - case MMCO_SHORT2LONG: - pic = find_short(n, short_pic_num); - if (n->long_ref[long_arg] != pic) - remove_long(n, s, long_arg); - remove_short(n, s, short_pic_num, 0); - n->long_ref[long_arg]= pic; - if (pic){ - pic->long_ref=1; - n->long_ref[long_arg]= pic; - n->long_ref_count++; - } - break; - case MMCO_LONG2UNUSED: - assert(n->long_ref[long_arg]); - remove_long(n, s, long_arg); - break; - case MMCO_SET_MAX_LONG: - for(j=long_arg; j<16; j++) - remove_long(n, s, j); - break; - case MMCO_RESET: - while(n->short_ref_count) - remove_short(n, s, n->short_ref[0]->frame_num, 1); - - for(j=0; j < 16; j++) - remove_long(n, s, j); - - s->current_picture_info->poc= - s->poc = - n->poc_lsb= - n->poc_msb= - n->frame_num= - s->current_picture_info->frame_num= 0; - break; - case MMCO_END: - case MMCO_LONG: - break; - } - } - }else{// sliding window ref picture marking - if(n->short_ref_count == n->sps.ref_frame_count) { - s->release_ref_cpn[s->release_cnt++] = n->short_ref[n->short_ref_count - 1]->cpn; - n->short_ref[n->short_ref_count - 1]->reference &= ~2; - n->short_ref[ n->short_ref_count - 1 ] =NULL; - n->short_ref_count--; - } - } - } - - if(n->short_ref_count) - memmove(&n->short_ref[1], &n->short_ref[0], n->short_ref_count*sizeof(PictureInfo *)); - - n->short_ref[0]= s->current_picture_info; - 
n->short_ref_count++; - - return 0; -} - -static int get_scale_factor(H264Slice *s, int poc, int poc1, int i){ - int poc0 = s->ref_list[0][i]->poc; - int td = av_clip(poc1 - poc0, -128, 127); - if(td == 0 || s->ref_list[0][i]->long_ref){ - return 256; - }else{ - int tb = av_clip(poc - poc0, -128, 127); - int tx = (16384 + (FFABS(td) >> 1)) / td; - return av_clip((tb*tx + 32) >> 6, -1024, 1023); - } -} - -void ff_h264_direct_dist_scale_factor(H264Slice *s){ - const int poc = s->current_picture_info->poc; - const int poc1 = s->ref_list[1][0]->poc; - - for(int i=0; iref_count[0]; i++){ - s->dist_scale_factor[i] = get_scale_factor(s, poc, poc1, i); - } -} - -static void fill_colmap(H264Slice *s, int map[2][16], int list){ - PictureInfo * const ref1 = s->ref_list[1][0]; - int old_ref, rfield; - - /* bogus; fills in for missing frames */ - memset(map[list], 0, sizeof(map[list])); - - for(rfield=0; rfield<2; rfield++){ - for(old_ref=0; old_ref < ref1->ref_count[list]; old_ref++){ - int poc = ref1->ref_poc[list][old_ref]; - - for(int j=0; jref_count[0]; j++){ - if(s->ref_list[0][j]->poc == poc){ - map[list][old_ref] = j; - break; - } - } - } - } -} - -void ff_h264_direct_ref_list_init(H264Slice *s){ - PictureInfo * const cur = s->current_picture_info; - int list; - - for(list=0; list<2; list++){ - cur->ref_count[list] = s->ref_count[list]; - for(int j=0; jref_count[list]; j++){ - cur->ref_poc[list][j] = s->ref_list[list][j] ? 
s->ref_list[list][j]->poc : 0; - } - } - - if(s->slice_type_nos != FF_B_TYPE || s->direct_spatial_mv_pred) - return; - - for(list=0; list<2; list++){ - fill_colmap(s, s->map_col_to_list0, list); - } -} - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_refs.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_refs.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -#ifndef H264_REFS_H -#define H264_REFS_H - -#include "avcodec.h" -#include "h264_types.h" - -int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s); -int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb); -void ff_h264_remove_all_refs(NalContext *n, H264Slice *s); -int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb); -void ff_h264_direct_ref_list_init(H264Slice *s); -void ff_h264_direct_dist_scale_factor(H264Slice *s); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_sei.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_sei.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,191 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... sei decoding - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * H.264 / AVC / MPEG4 part10 sei decoding. - * @author Michael Niedermayer - */ - -#include "avcodec.h" -#include "h264_types.h" -#include "golomb.h" - -//#undef NDEBUG -#include - -static const uint8_t sei_num_clock_ts_table[9]={ - 1, 1, 1, 2, 2, 3, 3, 2, 3 -}; - -void ff_h264_reset_sei(NalContext *n) { - n->sei_recovery_frame_cnt = -1; - n->sei_dpb_output_delay = 0; - n->sei_cpb_removal_delay = -1; - n->sei_buffering_period_present = 0; -} - -static int decode_picture_timing(NalContext *n, GetBitContext *gb){ - if(n->sps.nal_hrd_parameters_present_flag || n->sps.vcl_hrd_parameters_present_flag){ - n->sei_cpb_removal_delay = get_bits(gb, n->sps.cpb_removal_delay_length); - n->sei_dpb_output_delay = get_bits(gb, n->sps.dpb_output_delay_length); - } - if(n->sps.pic_struct_present_flag){ - unsigned int i, num_clock_ts; - n->sei_pic_struct = get_bits(gb, 4); - n->sei_ct_type = 0; - - if (n->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING) - return -1; - - num_clock_ts = sei_num_clock_ts_table[n->sei_pic_struct]; - - for (i = 0 ; i < num_clock_ts ; i++){ - if(get_bits(gb, 1)){ /* clock_timestamp_flag */ - unsigned int full_timestamp_flag; - n->sei_ct_type |= 1<sps.time_offset_length > 0) - skip_bits(gb, n->sps.time_offset_length); /* time_offset */ - } - } - } - return 0; -} - -static int decode_unregistered_user_data(GetBitContext *gb, int size){ - char user_data[16+256]; - int e, build, i; - - if(size<16) - return -1; - - for(i=0; i<(int) sizeof(user_data)-1 && isei_recovery_frame_cnt = get_ue_golomb(gb); - skip_bits(gb, 4); /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */ - - return 0; -} - -static int decode_buffering_period(NalContext *n, GetBitContext *gb){ - unsigned int sps_id; - int 
sched_sel_idx; - SPS *sps; - - sps_id = get_ue_golomb_31(gb); - if(sps_id > 31 || !n->sps_buffers[sps_id]) { - av_log(AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id); - return -1; - } - sps = n->sps_buffers[sps_id]; - - // NOTE: This is really so duplicated in the standard... See H.264, D.1.1 - if (sps->nal_hrd_parameters_present_flag) { - for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) { - n->initial_cpb_removal_delay[sched_sel_idx] = get_bits(gb, sps->initial_cpb_removal_delay_length); - skip_bits(gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset - } - } - if (sps->vcl_hrd_parameters_present_flag) { - for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) { - n->initial_cpb_removal_delay[sched_sel_idx] = get_bits(gb, sps->initial_cpb_removal_delay_length); - skip_bits(gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset - } - } - - n->sei_buffering_period_present = 1; - return 0; -} - -int ff_h264_decode_sei(NalContext *n, GetBitContext *gb){ - while(get_bits_count(gb) + 16 < gb->size_in_bits){ - int size, type; - - type=0; - do{ - type+= show_bits(gb, 8); - }while(get_bits(gb, 8) == 255); - - size=0; - do{ - size+= show_bits(gb, 8); - }while(get_bits(gb, 8) == 255); - - switch(type){ - case SEI_TYPE_PIC_TIMING: // Picture timing SEI - if(decode_picture_timing(n, gb) < 0) - return -1; - break; - case SEI_TYPE_USER_DATA_UNREGISTERED: - if(decode_unregistered_user_data(gb, size) < 0) - return -1; - break; - case SEI_TYPE_RECOVERY_POINT: - if(decode_recovery_point(n, gb) < 0) - return -1; - break; - case SEI_BUFFERING_PERIOD: - if(decode_buffering_period(n, gb) < 0) - return -1; - break; - default: - skip_bits(gb, 8*size); - } - - //FIXME check bits here - align_get_bits(gb); - } - - return 0; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_sei.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_sei.h Mon Aug 27 12:09:56 
2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -#ifndef H264_SEI_H -#define H264_SEI_H - -int ff_h264_decode_sei(NalContext *n, GetBitContext *gb); -void ff_h264_reset_sei(NalContext *n); - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_seq.c --- a/ffmpeg_smp/h264dec/libavcodec/h264_seq.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,220 +0,0 @@ -/* -* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder -* Copyright (c) 2003 Michael Niedermayer -* -* This file is part of FFmpeg. -* -* FFmpeg is free software; you can redistribute it and/or -* modify it under the terms of the GNU Lesser General Public -* License as published by the Free Software Foundation; either -* version 2.1 of the License, or (at your option) any later version. -* -* FFmpeg is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -* Lesser General Public License for more details. 
-* -* You should have received a copy of the GNU Lesser General Public -* License along with FFmpeg; if not, write to the Free Software -* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -*/ -#include "h264_types.h" -#include "h264_parser.h" -#include "h264_nal.h" -#include "h264_entropy.h" -#include "h264_rec.h" -#include "h264_pred_mode.h" -#include "h264_misc.h" -// #undef NDEBUG -#include - -static int decode_slice_entropy_seq(H264Context *h, EntropyContext *ec, H264Slice *s, GetBitContext *gb, H264Mb *mbs){ - int i,j; -// GetBitContext *gb = s->gb; - CABACContext *c = &ec->c; - - if( !s->pps.cabac ){ - av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); - return -1; - } - - init_dequant_tables(s, ec); - ec->curr_qscale = s->qscale; - ec->last_qscale_diff = 0; - ec->chroma_qp[0] = get_chroma_qp((H264Slice *) s, 0, s->qscale); - ec->chroma_qp[1] = get_chroma_qp((H264Slice *) s, 1, s->qscale); - - /* realign */ - align_get_bits( gb ); - /* init cabac */ - ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8); - - ff_h264_init_cabac_states(ec, s, c); - - for(j=0; jmb_height; j++){ - init_entropy_buf(ec, s, j); - for(i=0; imb_width; i++){ - int eos,ret; - H264Mb *m = &mbs[i + j*ec->mb_width]; - //memset(m, 0, sizeof(H264Mb)); - m->mb_x=i; - m->mb_y=j; - ec->m = m; - - ret = ff_h264_decode_mb_cabac(ec, s, c); - eos = get_cabac_terminate( c); - (void) eos; - if( ret < 0 || c->bytestream > c->bytestream_end + 2) { - av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream); - return -1; - } - } - } - -// av_freep(&s->gb.raw); -// if (s->gb.rbsp) -// av_freep(&s->gb.rbsp); - - return 0; -} - - - -/** -* Sequential version -*/ -static void decode_slice_mb_seq(H264Context *h, MBRecContext *d, H264Slice *s2, H264Mb *mbs){ - - for (int i=0; i<2; i++){ - for(int j=0; j< s2->ref_count[i]; j++){ - if (s2->ref_list_cpn[i][j] ==-1) 
- continue; - int k; - for (k=0; kmax_dpb_cnt; k++){ - if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s2->ref_list_cpn[i][j]){ - s2->dp_ref_list[i][j] = &h->dpb[k]; - break; - } - } - } - } - - get_dpb_entry(h, s2); - - if (!h->no_mbd){ - for(int j=0; jmb_height; j++){ - init_mbrec_context(d, d->mrs, s2, j); - if (h->profile) printf("\n[MBREC LINE %d ", j); - for(int i=0; imb_width; i++){ - - if ((i & 0x7) == 0) start_timer(h, REC); - H264Mb *m = &mbs[i + j*d->mb_width]; - if (h->profile==2) - pred_motion_mb_rec (d, d->mrs, s2, m); - else{ - h264_decode_mb_internal(d, d->mrs, s2, m); - } - stop_timer(h, REC); - } - draw_edges(d, s2, j); - - } - } - - for (int i=0; irelease_cnt; i++){ - for(int j=0; jmax_dpb_cnt; j++){ - if(h->dpb[j].cpn== s2->release_ref_cpn[i]){ - release_dpb_entry(h, &h->dpb[j], 2); - break; - } - } - } - s2->release_cnt=0; -} - -/* -* The following code is the main loop of the file converter -*/ -int h264_decode_seq( H264Context *h) { - ParserContext *pc; - NalContext *nc; - EntropyContext *ec; - MBRecContext *rc; - OutputContext *oc; - - H264Slice slice, *s=&slice; - H264Mb *mbs; - DecodedPicture *out; - int frames=0; - -#if HAVE_LIBSDL2 - pthread_t sdl_thr; - if (h->display){ - pthread_create(&sdl_thr, NULL, sdl_thread, h); - } -#endif - - pc = get_parse_context(h->ifile); - nc = get_nal_context(h->width, h->height); - - memset(s, 0, sizeof(H264Slice)); - mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb)); - - ec = get_entropy_context( h ); - rc = get_mbrec_context(h); - rc->top_next = rc->top = av_malloc( h->mb_width * sizeof(TopBorder)); - - oc = get_output_context( h ); - - av_start_timer(); - GetBitContext gb = {0,}; - while(!pc->final_frame && frames++ < h->num_frames && !h->quit){ - if (h->profile) start_timer(h, FRONT); - av_read_frame_internal(pc, &gb); - decode_nal_units(nc, s, &gb); - if (h->profile) stop_timer(h, FRONT); -// memset(s->mbs, 0, sizeof(H264Mb)*ec->mb_width*ec->mb_height); - if (h->profile) start_timer(h, 
ED); - decode_slice_entropy_seq(h, ec, s, &gb, mbs); - if (h->profile) stop_timer(h, ED); - - if (h->profile) start_timer(h, REC); - decode_slice_mb_seq(h, rc, s, mbs); - if (h->profile) stop_timer(h, REC); - - out =output_frame(h, oc, s->curr_pic, h->ofile, h->frame_width, h->frame_height); - if (out){ - release_dpb_entry(h, out, 1); - } - - print_report(oc->frame_number, oc->video_size, 0, h->verbose); - if (h->profile == 3){ - printf("[ENTROPY %.3fms] [MBREC %.3fms]\n", h->last_time[ED] , h->last_time[REC]); - } - } - while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; - - print_report(oc->frame_number, oc->video_size, 1, h->verbose); - h->num_frames = oc->frame_number; - /* finished ! */ - av_freep(&mbs); - av_freep(&gb.raw); - if (gb.rbsp) - av_freep(&gb.rbsp); - av_freep(&rc->top); - - free_parse_context(pc); - free_nal_context (nc); - free_entropy_context(ec); - free_mbrec_context(rc); - free_output_context(oc); - -#if HAVE_LIBSDL2 - if (h->display){ - signal_sdl_exit(h); - pthread_join(sdl_thr, NULL); - } -#endif - - return 0; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/h264_types.h --- a/ffmpeg_smp/h264dec/libavcodec/h264_types.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,658 +0,0 @@ -#ifndef H264_TYPES_H -#define H264_TYPES_H - -#include "config.h" -#ifdef HAVE_LIBSDL2 -#include -#endif - -#include -#include "avcodec.h" -#include "cabac.h" -#include "h264_dsp.h" -#include "h264_pred.h" -#include "get_bits.h" - - -#define MAX_REF_PIC_COUNT 16 -#define MAX_DELAYED_PIC_COUNT 16 - -#define MAX_THREADS 80 - -//#define MAX_PIC_COUNT (4*(MAX_REF_PIC_COUNT+MAX_DELAYED_PIC_COUNT)) - -#define DPB_SIZE 33 - - -//potsdam machine 8xX7560 without HT -// static int edb_affinity [16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; -// static int edip_affinity[8] = {16, 17, 18, 19, 20, 21, 22, 23}; -// -// static int mbd_affinity[8][5] = { {24, 32, 40, 48, 56}, -// 
{25, 33, 41, 49, 57}, -// {26, 34, 42, 50, 58}, -// {27, 35, 43, 51, 59}, -// {28, 36, 44, 52, 60}, -// {29, 37, 45, 53, 61}, -// {30, 38, 46, 54, 62}, -// {31, 39, 47, 55, 63}, }; - -// static int edb_affinity [22] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 58, 59, 60, 61 ,62, 63}; -// static int edip_affinity[10] = {16, 17, 18, 19, 20, 21, 22, 23, 56, 57 }; -// -// static int mbd_affinity[8][5] = { {24, 32, 40, 48, 56}, -// {25, 33, 41, 49, 57}, -// {26, 34, 42, 50, 58}, -// {27, 35, 43, 51, 59}, -// {28, 36, 44, 52, 60}, -// {29, 37, 45, 53, 61}, -// {30, 38, 46, 54, 62}, -// {31, 39, 47, 55, 63}, }; -// //4 socket -// static int edip_affinity[5] = {0, 1, 2, 3, 56}; -// static int edb_affinity [12] = {8, 9, 10, 11, 16, 17, 18, 19, 59, 58, 57, 51}; -// -// static int mbd_affinity[4][5] = { {24, 32, 40, 48, 56}, -// {25, 33, 41, 49, 57}, -// {26, 34, 42, 50, 58}, -// {27, 35, 43, 51, 59}, }; - -// static int edip_affinity[3] = {0, 1, 49}; -// static int edb_affinity [6] = {8, 9, 16, 17, 56, 57}; -// -// static int mbd_affinity[2][5] = { {24, 32, 40, 48, 56}, -// {25, 33, 41, 49, 57}}; - -// static int edip_affinity[2] = {0, 8}; -// static int edb_affinity [3] = {16, 24, 56}; -// -// static int mbd_affinity[1][4] = { {32, 40, 48, 56}, -// }; - -/// for ducks_take_off_2160p -// static int edip_affinity[2] = {0, 8}; -// static int edb_affinity [3] = {16, 24, 32}; -// -// static int mbd_affinity[1][4] = {{ 40, 48, 56, 32}}; - -// static int edip_affinity[3] = {0, 1, 57}; -// static int edb_affinity [7] = {8, 9, 16, 17, 24, 25, 56}; -// -// static int mbd_affinity[2][4] = { {32, 40, 48, 56}, -// {33, 41, 49, 57}}; - -//4 socket -// static int edip_affinity[6] = {0, 1, 2, 3, 59}; -// static int edb_affinity [14] = {8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 58, 57}; -// -// static int mbd_affinity[4][4] = { {32, 40, 48, 56}, -// {33, 41, 49, 57}, -// {34, 42, 50, 58}, -// {35, 43, 51, 59}, }; - - -// static int edb_affinity [29] = {0, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 59, 60, 61, 62, 63}; -// static int edip_affinity[11] = {24, 25, 26, 27, 28, 29, 30, 31, 63, 62, 61}; -// -// static int mbd_affinity[8][4] = {{32, 40, 48, 56}, -// {33, 41, 49, 57}, -// {34, 42, 50, 58}, -// {35, 43, 51, 59}, -// {36, 44, 52, 60}, -// {37, 45, 53, 61}, -// {38, 46, 54, 62}, -// {39, 47, 55, 63}, }; - -//potsdam machine 4xX7550 with HT -// int edip_affinity[16] = {0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; -// int edb_affinity [16] = {1, 9, 17, 25, 2, 10, 18, 26, 6, 14, 22, 30, 7, 15, 23, 31 }; -// int edip_affinity[16] = {58, 50, 42, 34, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; -// int edb_affinity [16] = {57, 49, 41, 33, 56, 48, 40, 32, 6, 14, 22, 30, 7, 15, 23, 31 }; -// //int edb_affinity [16] = {4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 }; -// //mb threads affinity on logical cores moving back to keep inteference with ed threads low -// int mbd_affinity[4][8] = { {63, 62, 61, 60, 59, 58, 57, 56}, -// {55, 54, 53, 52, 51, 50, 49, 48}, -// {47, 46, 45, 44, 43, 42, 41, 40}, -// {39, 38, 37, 36, 35, 34, 33, 32}, -// }; - - -// static int edip_affinity[2] = {0, 2}; -// static int edb_affinity [4] = {1, 3, 2, 5}; -// -// static int mbd_affinity[1][4] = {{ 4, 6, 7, 5}}; - -enum{ - PARSE=0, - ENTROPY, - REORDER, - REORDER2, //second mutex-cond pair used in reorder_thread - MBDEC, - OUTPUT, - STAGES -}; - -//adhoc for profiling -enum{ - TOTAL=0, - FRONT, - ED, - REC, - PROFILE_STAGES -}; - -/* bit input */ -/* buffer, buffer_end and size_in_bits must be present and used by every reader */ - -/* frame parsing */ -typedef struct ParserContext { - //int64_t offset; ///< byte offset from starting packet start - int ifile; - int ofile; - int buffer_size; - int eof_reached; - - uint8_t *data; - int size; - uint8_t *cur_ptr; - int cur_len; - - int64_t frame_offset; /* offset of the current frame */ - int64_t cur_offset; /* current offset (incremented by 
each av_parser_parse()) */ - int64_t next_frame_offset; /* offset of the next frame */ - int pict_type; - int repeat_pict; //frame_duration = (1 + repeat_pict) * time_base. It is used by codecs like H.264 to display telecined material. - int key_frame; //Set by parser to 1 for key frames and 0 for non-key frames. - int64_t pos; // Byte position of currently parsed frame in stream. - int64_t last_pos; //Previous frame byte position. - int final_frame; - - uint8_t overread[5]; - int overread_cnt; ///< the number of bytes which where irreversibly read from the next frame - int index; - int last_index; - int frame_start_found; - uint32_t state; ///< contains the last few bytes in MSB order -} ParserContext; - -typedef struct NalContext { - - SPS *sps_buffers[MAX_SPS_COUNT]; - PPS *pps_buffers[MAX_PPS_COUNT]; - SPS sps; ///< current sps - - PictureInfo picture[16 + 1]; ///< Ref pic buffer used for deriving lists. Later linked with pic in dpb. - PictureInfo *release_ref[MAX_MMCO_COUNT]; - PictureInfo *short_ref[32]; - PictureInfo *long_ref[32]; - int long_ref_count; ///< number of actual long term references - int short_ref_count; ///< number of actual short term references - - //POC stuff - uint32_t coded_pic_num; - int poc_lsb; - int poc_msb; - uint32_t poc_offset; - int delta_poc; - int frame_num; - int prev_poc_msb; ///< poc_msb of the last reference pic for POC type 0 - int prev_poc_lsb; ///< poc_lsb of the last reference pic for POC type 0 - int frame_num_offset; ///< for POC type 2 - int prev_frame_num_offset; ///< for POC type 2 - int prev_frame_num; ///< frame_num of the last pic for POC type 1/2 - - int max_pic_num; - int redundant_pic_count; - int outputed_poc; - int ip_id; -// int b8_stride; ///< 2*mb_width+1 used for some 8x8 block arrays to allow simple addressing - int b4_stride; ///< 4*mb_width+1 used for some 4x4 block arrays to allow simple addressing - int mb_stride; ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs 
without sig11 - int mb_width; - int mb_height; - int width; - int height; - - int has_b_frames; - //pic_struct in picture timing SEI message - SEI_PicStructType sei_pic_struct; - // Bit set of clock types for fields/frames in picture timing SEI message. For each found ct_type, appropriate bit is set (e.g., bit 1 for interlaced). - int sei_ct_type; - // dpb_output_delay in picture timing SEI message, see H.264 C.2.2 - int sei_dpb_output_delay; - //cpb_removal_delay in picture timing SEI message, see H.264 C.1.2 - int sei_cpb_removal_delay; - //recovery_frame_cnt from SEI message - int sei_recovery_frame_cnt; - // Timestamp stuff - int sei_buffering_period_present; ///< Buffering period SEI flag - int initial_cpb_removal_delay[32]; ///< Initial timestamps for CPBs - -} NalContext; - -typedef struct EntropyContext{ - CABACContext c; - - H264Mb *m; - int top_cbp; - int left_cbp; - int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct - - uint32_t top_type; - uint32_t left_type; - uint32_t topright_type; - uint32_t topleft_type; - - int curr_qscale; - int chroma_qp[2]; //QPc - int last_qscale_diff; - - uint32_t dequant4_buffer[6][52][16]; - uint32_t dequant8_buffer[2][52][64]; - uint32_t (*dequant4_coeff[6])[16]; - uint32_t (*dequant8_coeff[2])[64]; - -// uint8_t (*non_zero_count_top)[32]; -// uint8_t (*non_zero_count)[32]; -// uint8_t (*non_zero_count_row[2])[32]; - - uint8_t (*non_zero_count_top)[8]; - uint8_t (*non_zero_count)[8]; - uint8_t (*non_zero_count_row[2])[8]; - DECLARE_ALIGNED(8, uint8_t, non_zero_count_left[8]); - - uint8_t (*mvd_top[2])[2]; - uint8_t (*mvd[2])[2]; - uint8_t (*mvd_table[2][2])[2]; - - uint8_t *direct_top; - uint8_t *direct; - uint8_t *direct_table[2]; - - uint8_t *chroma_pred_mode_top; - uint8_t *chroma_pred_mode; - uint8_t *chroma_pred_mode_table[2]; - - uint16_t *cbp_top; - uint16_t *cbp; - uint16_t *cbp_table[2]; - - int8_t *qscale_top; - int8_t *qscale; - int8_t *qscale_table[2]; - - int8_t 
*ref_index_top[2]; - int8_t *ref_index[2]; - int8_t *ref_index_table[2][2]; - - uint32_t *mb_type_top; - uint32_t *mb_type; - uint32_t *mb_type_table[2]; - - int b_stride; - int mb_stride; - int mb_width; - int mb_height; - - uint8_t *zigzag_scan; - uint8_t *zigzag_scan8x8; - uint8_t direct_cache[5*8]; - - DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]); - DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2]; - DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8]; - DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8]; - DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2]; - -} EntropyContext; - -typedef struct H264Slice { - PPS pps; ///< current pps - PictureInfo* current_picture_info; - DecodedPicture* curr_pic; - int slice_num; - - int release_ref_cpn[MAX_MMCO_COUNT]; - int release_cnt; - - int qp_thresh; ///< QP threshold to skip loopfilter - int use_weight; - int use_weight_chroma; - int luma_log2_weight_denom; - int chroma_log2_weight_denom; - - int16_t luma_weight[16][2][2]; - int16_t chroma_weight[16][2][2][2]; - int16_t implicit_weight[16][16][2]; - - //poc number of ref_list int ref_poc[2][16] - //In edslice this must becom Picture Info - int ref_list_cpn[2][16]; - PictureInfo *ref_list[2][16]; ///Reordered version of default_ref_list according to picture reordering in slice header - DecodedPicture *dp_ref_list[2][16]; - int ref_count[2]; ///< counts frames or fields, depending on current mb mode - - int slice_type; - int slice_type_nos; - int slice_alpha_c0_offset; - int slice_beta_offset; - int direct_8x8_inference_flag; - - uint8_t list_count; - uint32_t coded_pic_num; - - int poc; - int key_frame; - int mmco_reset; //FIXME not used? 
- - ///stuff only needed for nal/entropy decoding -// H264Mb *m; -// GetBitContext *gb; - int ip_id; - int transform_bypass; - int direct_spatial_mv_pred; - int map_col_to_list0[2][16]; - int dist_scale_factor[16]; - - int cabac_init_idc; - int nal_ref_idc; - int nal_unit_type; - - int ref2frm[2][64]; ///< reference to frame number lists, the first 2 are for -2,-1 - - int qscale; - -} H264Slice; - -typedef struct { - H264Slice slice; - H264Mb *mbs; - DecodedPicture *dp; - GetBitContext gb; - - int lines_taken; - int lines_total; - int state; // 0 free, 1 in use //1 wait for entropy, 2 wait for reconstruct. - int initialized; -} SliceBufferEntry; - -typedef struct RingLineEntry{ - union{ - DECLARE_ALIGNED(64, volatile int32_t, mb_cnt); - DECLARE_ALIGNED(64, int32_t, pad[16]); - }; - SliceBufferEntry *sbe; - int id; - int line; - TopBorder *top; - struct RingLineEntry *prev_line; - -} RingLineEntry; - -// #if OMPSS -typedef struct SuperMBTask{ - int smb_x; - int smb_y; -} SuperMBTask; - -typedef struct SuperMBContext{ - int nsmb_width; //number of super macroblocks in picture width - int nsmb_height; //number of super macroblocks in picture height - int nsmb_3dheight; //number of super macroblocks in picture height - max motion vertical vector - int smb_width; //width of a super macroblock - int smb_height; //height of a super macroblock - int refcount; - int index; - SuperMBTask *smbs[2]; -} SuperMBContext; -// #endif - -//scratchpad for decoding a macroblock -typedef struct MBRecState{ - int8_t *ref_index_top[2]; - int8_t *ref_index[2]; - int16_t (*motion_val_top[2])[2]; - int16_t (*motion_val[2])[2]; - uint32_t *mb_type_top; - uint32_t *mb_type; - - int8_t *list1_ref_index[2]; - int16_t (*list1_motion_val[2])[2]; - uint32_t *list1_mb_type; - - int8_t *intra4x4_pred_mode_top; - int8_t *intra4x4_pred_mode; -#if !OMPSS - int8_t intra4x4_pred_mode_left[4]; -#endif - int8_t *non_zero_count_top; - int8_t *non_zero_count; -// int8_t non_zero_count_left[8]; - - - unsigned 
int topleft_samples_available; - unsigned int topright_samples_available; - unsigned int top_samples_available; - unsigned int left_samples_available; - - int top_type; - int left_type; - - DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]); - DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2]; - DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8]; - DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8]; - DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2]; - - DECLARE_ALIGNED(8, int16_t, bS)[2][4][4]; - uint8_t edges[2]; - -}MBRecState ; - -typedef struct MBRecContext{ - DSPContext dsp; ///< pointers for accelerated dsp functions - H264DSPContext hdsp; - H264PredContext hpc; - - MBRecState *mrs; - RingLineEntry *rle; //debug - - uint8_t *scratchpad_y; ///implemented different on Cell - uint8_t *scratchpad_cb; ///implemented different on Cell - uint8_t *scratchpad_cr; ///implemented different on Cell - - int linesize; - int uvlinesize; - int mb_width; - int mb_height; - int mb_stride; - int b_stride; - int width; - int height; - -#if !OMPSS // not used in OMPSS - LeftBorder left; - TopBorder *top; - TopBorder *top_next; // next line top border -#endif - /* - .UU.YYYY - .UU.YYYY - .vv.YYYY - .VV.YYYY - */ - - // block_offset[ 0..23] for frame macroblocks - int block_offset[16+8]; - -} MBRecContext; - -#ifdef HAVE_LIBSDL2 -typedef struct SDLContext{ - int display; - int fullscreen; - pthread_t listen_thread; - - SDL_DisplayMode full; - SDL_DisplayMode wind; - - - SDL_Renderer *renderer; - SDL_Rect rect; - SDL_Rect win_rect; - SDL_Window *window; - double aspect; - int win_w; - int win_h; - int resized; - - SDL_Texture *sbmap_texture; - int showmap; - int updatemap; - int pause; - -} SDLContext; -#endif - -typedef struct OutputContext { - int bit_buffer_size; - uint8_t *bit_buffer; - uint64_t video_size; - int frame_number; - DecodedPicture *delayed_pic[DPB_SIZE]; - int dp_cnt; - -} OutputContext; - -typedef struct { - pthread_mutex_t lock; - pthread_cond_t cond; - 
SliceBufferEntry **queue; - int size; - int cnt; - int fi; - int fo; -} SliceBufferQueue; - -typedef struct { - pthread_mutex_t wslock; - pthread_cond_t wscond; - pthread_mutex_t swlock; - pthread_cond_t swcond; - RingLineEntry **queue; - int size; - int ready; - int free; - int fi; - int fo; -} RingLineQueue; - -#if HAVE_LIBSDL2 -typedef struct { - pthread_mutex_t sdl_lock; - pthread_cond_t sdl_cond; - SDL_Texture **queue; - int size; - int ready; - int fi; - int fo; - int exit; -} SDLTextureQueue; -#endif -/** -* H264Context -*/ -typedef struct H264Context{ - SliceBufferQueue sb_q[STAGES]; - RingLineQueue rl_q; - - pthread_mutex_t lock[STAGES]; - pthread_cond_t cond[STAGES]; - - pthread_mutex_t task_lock; - pthread_cond_t task_cond; - - pthread_attr_t ed_rec_attr[MAX_THREADS]; - pthread_t ed_rec_thr[MAX_THREADS]; - - int init_threads; - pthread_mutex_t ilock; - pthread_cond_t icond; - - const char *file_name; - int profile; - int start; - int touch_start; - int setaff; - int touch_done; - int rl_side_touch; - int statmbd; - pthread_mutex_t slock; - pthread_cond_t scond; - pthread_mutex_t tlock; - pthread_cond_t tcond; - pthread_mutex_t tdlock; - pthread_cond_t tdcond; - - int ed_ppe_threads; - int threads; - int smt; - - int acdpb_cnt; //debug - int reldpb_cnt; - - int sb_size; - SliceBufferEntry *sb; ///< Slice Syntax Buffer - int free_sb_cnt; - int slice_bufs; - - int max_dpb_cnt; - DecodedPicture *dpb; ///< Decoded Picture Buffer - int free_dpb_cnt; - - int ifile; - int ofile; - int frame_width; - int frame_height; - int num_frames; - int width; - int height; - int mb_width; - int mb_height; - int mb_stride; ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11 - int b4_stride; - int b_stride; - - int smb_height; - int smb_width; - pthread_mutex_t smb_lock; - pthread_cond_t sdl_cond; - pthread_mutex_t sdl_lock; - SuperMBContext *smbc; - - int wave_order; - int static_3d; - int pipe_bufs; - - //shared tables used in 
entropy decoding - uint8_t zigzag_scan[16]; - uint8_t zigzag_scan8x8[64]; - - int verbose; - int no_mbd; - int display; - int fullscreen; - int quit; -#ifdef HAVE_LIBSDL2 - SDLTextureQueue sdlq; - SDLContext *sdlc; -#endif - - struct timespec start_time[PROFILE_STAGES]; - struct timespec end_time[PROFILE_STAGES]; - double last_time[PROFILE_STAGES]; - double total_time[PROFILE_STAGES]; - -}H264Context; - -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/mathops.h --- a/ffmpeg_smp/h264dec/libavcodec/mathops.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,145 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2001, 2002 Fabrice Bellard - * Copyright (c) 2006 Michael Niedermayer et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef AVCODEC_MATHOPS_H -#define AVCODEC_MATHOPS_H - -#include "libavutil/common.h" -#include "libavutil/internal.h" - -#if ARCH_ARM -# include "arm/mathops.h" -#elif ARCH_PPC -# include "ppc/mathops.h" -#elif ARCH_X86 -# include "x86/mathops.h" -#endif - -/* generic implementation */ - -#ifndef MULL -# define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s)) -#endif - -#ifndef MULH -//gcc 3.4 creates an incredibly bloated mess out of this -//# define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32) - -static av_always_inline int MULH(int a, int b){ - return ((int64_t)(a) * (int64_t)(b))>>32; -} -#endif - -#ifndef UMULH -static av_always_inline unsigned UMULH(unsigned a, unsigned b){ - return ((uint64_t)(a) * (uint64_t)(b))>>32; -} -#endif - -#ifndef MUL64 -# define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) -#endif - -#ifndef MAC64 -# define MAC64(d, a, b) ((d) += MUL64(a, b)) -#endif - -#ifndef MLS64 -# define MLS64(d, a, b) ((d) -= MUL64(a, b)) -#endif - -/* signed 16x16 -> 32 multiply add accumulate */ -#ifndef MAC16 -# define MAC16(rt, ra, rb) rt += (ra) * (rb) -#endif - -/* signed 16x16 -> 32 multiply */ -#ifndef MUL16 -# define MUL16(ra, rb) ((ra) * (rb)) -#endif - -#ifndef MLS16 -# define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb)) -#endif - -/* median of 3 */ -#ifndef mid_pred -#define mid_pred mid_pred -static inline av_const int mid_pred(int a, int b, int c) -{ -#if 0 - int t= (a-b)&((a-b)>>31); - a-=t; - b+=t; - b-= (b-c)&((b-c)>>31); - b+= (a-b)&((a-b)>>31); - - return b; -#else - if(a>b){ - if(c>b){ - if(c>a) b=a; - else b=c; - } - }else{ - if(b>c){ - if(c>a) b=c; - else b=a; - } - } - return b; -#endif -} -#endif - -#ifndef sign_extend -static inline av_const int sign_extend(int val, unsigned bits) -{ - return (val << (INT_BIT - 
bits)) >> (INT_BIT - bits); -} -#endif - -#ifndef zero_extend -static inline av_const unsigned zero_extend(unsigned val, unsigned bits) -{ - return (val << (INT_BIT - bits)) >> (INT_BIT - bits); -} -#endif - -#ifndef COPY3_IF_LT -#define COPY3_IF_LT(x, y, a, b, c, d)\ -if ((y) < (x)) {\ - (x) = (y);\ - (a) = (b);\ - (c) = (d);\ -} -#endif - -#ifndef NEG_SSR32 -# define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s))) -#endif - -#ifndef NEG_USR32 -# define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s))) -#endif - -#endif /* AVCODEC_MATHOPS_H */ - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.c --- a/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,619 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#if HAVE_ALTIVEC_H -#include -#endif -#include "libavcodec/dsputil.h" -#include "dsputil_ppc.h" -#include "util_altivec.h" -#include "types_altivec.h" -#include "dsputil_altivec.h" - - -static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) -{ - int i; - vector unsigned char perm, bytes, *pixv; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector signed short shorts; - - for (i = 0; i < 8; i++) { - // Read potentially unaligned pixels. - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. - perm = vec_lvsl(0, pixels); - pixv = (vector unsigned char *) pixels; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts = (vector signed short)vec_mergeh(zero, bytes); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts, i*16, (vector signed short*)block); - - pixels += line_size; - } -} - -static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, - const uint8_t *s2, int stride) -{ - int i; - vector unsigned char perm, bytes, *pixv; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector signed short shorts1, shorts2; - - for (i = 0; i < 4; i++) { - // Read potentially unaligned pixels - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. 
- perm = vec_lvsl(0, s1); - pixv = (vector unsigned char *) s1; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts1 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the same for the second block of pixels - perm = vec_lvsl(0, s2); - pixv = (vector unsigned char *) s2; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts2 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the subtraction - shorts1 = vec_sub(shorts1, shorts2); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts1, 0, (vector signed short*)block); - - s1 += stride; - s2 += stride; - block += 8; - - - // The code below is a copy of the code above... This is a manual - // unroll. - - // Read potentially unaligned pixels - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. - perm = vec_lvsl(0, s1); - pixv = (vector unsigned char *) s1; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts1 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the same for the second block of pixels - perm = vec_lvsl(0, s2); - pixv = (vector unsigned char *) s2; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts2 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the subtraction - shorts1 = vec_sub(shorts1, shorts2); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts1, 0, (vector signed short*)block); - - s1 += stride; - s2 += stride; - block += 8; - } -} - - -static void clear_block_altivec(DCTELEM *block) { - LOAD_ZERO; - vec_st(zero_s16v, 0, block); - vec_st(zero_s16v, 16, block); - vec_st(zero_s16v, 32, block); - vec_st(zero_s16v, 48, block); - vec_st(zero_s16v, 64, block); - vec_st(zero_s16v, 80, block); - vec_st(zero_s16v, 96, block); - vec_st(zero_s16v, 112, block); -} - - - -/* next one assumes that ((line_size % 16) == 0) 
*/ -void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); - register vector unsigned char pixelsv1, pixelsv2; - register vector unsigned char pixelsv1B, pixelsv2B; - register vector unsigned char pixelsv1C, pixelsv2C; - register vector unsigned char pixelsv1D, pixelsv2D; - - register vector unsigned char perm = vec_lvsl(0, pixels); - int i; - register int line_size_2 = line_size << 1; - register int line_size_3 = line_size + line_size_2; - register int line_size_4 = line_size << 2; - -POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); -// hand-unrolling the loop by 4 gains about 15% -// mininum execution time goes from 74 to 60 cycles -// it's faster than -funroll-loops, but using -// -funroll-loops w/ this is bad - 74 cycles again. -// all this is on a 7450, tuning for the 7450 -#if 0 - for (i = 0; i < h; i++) { - pixelsv1 = vec_ld(0, pixels); - pixelsv2 = vec_ld(16, pixels); - vec_st(vec_perm(pixelsv1, pixelsv2, perm), - 0, block); - pixels+=line_size; - block +=line_size; - } -#else - for (i = 0; i < h; i += 4) { - pixelsv1 = vec_ld( 0, pixels); - pixelsv2 = vec_ld(15, pixels); - pixelsv1B = vec_ld(line_size, pixels); - pixelsv2B = vec_ld(15 + line_size, pixels); - pixelsv1C = vec_ld(line_size_2, pixels); - pixelsv2C = vec_ld(15 + line_size_2, pixels); - pixelsv1D = vec_ld(line_size_3, pixels); - pixelsv2D = vec_ld(15 + line_size_3, pixels); - vec_st(vec_perm(pixelsv1, pixelsv2, perm), - 0, (unsigned char*)block); - vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), - line_size, (unsigned char*)block); - vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), - line_size_2, (unsigned char*)block); - vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), - line_size_3, (unsigned char*)block); - pixels+=line_size_4; - block +=line_size_4; - } -#endif -POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); -} - -/* next one assumes that ((line_size % 16) == 0) */ -#define op_avg(a,b) a = ( ((a)|(b)) - 
((((a)^(b))&0xFEFEFEFEUL)>>1) ) -void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); - register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; - register vector unsigned char perm = vec_lvsl(0, pixels); - int i; - -POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); - - for (i = 0; i < h; i++) { - pixelsv1 = vec_ld( 0, pixels); - pixelsv2 = vec_ld(16,pixels); - blockv = vec_ld(0, block); - pixelsv = vec_perm(pixelsv1, pixelsv2, perm); - blockv = vec_avg(blockv,pixelsv); - vec_st(blockv, 0, (unsigned char*)block); - pixels+=line_size; - block +=line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1); - register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; - int i; - -POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); - - for (i = 0; i < h; i++) { - /* block is 8 bytes-aligned, so we're either in the - left block (16 bytes-aligned) or in the right block (not) */ - int rightside = ((unsigned long)block & 0x0000000F); - - blockv = vec_ld(0, block); - pixelsv1 = vec_ld( 0, pixels); - pixelsv2 = vec_ld(16, pixels); - pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); - - if (rightside) { - pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); - } else { - pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); - } - - blockv = vec_avg(blockv, pixelsv); - - vec_st(blockv, 0, block); - - pixels += line_size; - block += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); - register int i; - register 
vector unsigned char pixelsv1, pixelsv2, pixelsavg; - register vector unsigned char blockv, temp1, temp2; - register vector unsigned short pixelssum1, pixelssum2, temp3; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - -POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } else { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - 
-POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsavg; - register vector unsigned char blockv, temp1, temp2; - register vector unsigned short pixelssum1, pixelssum2, temp3; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - -POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 
= vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vcone); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } else { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); -} - -/* next one assumes that ((line_size % 16) == 0) */ -static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector unsigned char blockv, temp1, temp2; - register vector unsigned short temp3, temp4, - pixelssum1, pixelssum2, pixelssum3, pixelssum4; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - -POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vctwo); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, 
pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); -} - -/* next one assumes that ((line_size % 16) == 0) */ -static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector unsigned char blockv, temp1, temp2; - register vector unsigned short temp3, temp4, - pixelssum1, pixelssum2, pixelssum3, pixelssum4; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - -POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned 
long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vcone); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vcone); - pixelssum1 = vec_add(pixelssum2, vcone); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); -} - -/* next one assumes that ((line_size % 8) == 0) */ -static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); - register int i; - register 
vector unsigned char pixelsv1, pixelsv2, pixelsavg; - register vector unsigned char blockv, temp1, temp2, blocktemp; - register vector unsigned short pixelssum1, pixelssum2, temp3; - - register const vector unsigned char vczero = (const vector unsigned char) - vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short) - vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - -POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } else { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - blockv = vec_avg(blocktemp, blockv); - vec_st(blockv, 0, block); - - block += line_size; - 
pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1); -} - -void dsputil_init_altivec(DSPContext* c) -{ - c->diff_pixels = diff_pixels_altivec; - c->get_pixels = get_pixels_altivec; - c->clear_block = clear_block_altivec; - - c->put_pixels_tab[0][0] = put_pixels16_altivec; - /* the two functions do the same thing, so use the same code */ - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; - c->avg_pixels_tab[0][0] = avg_pixels16_altivec; - c->avg_pixels_tab[1][0] = avg_pixels8_altivec; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; - c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; - c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; - -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.h --- a/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_PPC_DSPUTIL_ALTIVEC_H -#define AVCODEC_PPC_DSPUTIL_ALTIVEC_H - -#include -#include "libavcodec/dsputil.h" - -void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -int has_altivec(void); - -void fdct_altivec(int16_t *block); -void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, - int x16, int y16, int rounder); -void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); -void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); - -void ff_vp3_idct_altivec(DCTELEM *block); -void ff_vp3_idct_put_altivec(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_add_altivec(uint8_t *dest, int line_size, DCTELEM *block); - -void dsputil_h264_init_ppc(DSPContext* c); - -void dsputil_init_altivec(DSPContext* c); -//void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx); -//void float_init_altivec(DSPContext* c, AVCodecContext *avctx); -//void int_init_altivec(DSPContext* c, AVCodecContext *avctx); - -#endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.c --- a/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" -#include "dsputil_ppc.h" -#include "dsputil_altivec.h" - -static void prefetch_ppc(void *mem, int stride, int h) -{ - register const uint8_t *p = mem; - do { - __asm__ volatile ("dcbt 0,%0" : : "r" (p)); - p+= stride; - } while(--h); -} - -void dsputil_init_ppc(DSPContext* c) -{ - c->prefetch = prefetch_ppc; - -#if HAVE_ALTIVEC - dsputil_h264_init_ppc(c); - dsputil_init_altivec(c); - - c->idct_put = idct_put_altivec; - c->idct_add = idct_add_altivec; - -#endif /* HAVE_ALTIVEC */ -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.h --- a/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2003-2004 Romain Dolbeau - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_PPC_DSPUTIL_PPC_H -#define AVCODEC_PPC_DSPUTIL_PPC_H - -#include "config.h" - -#if CONFIG_POWERPC_PERF -void powerpc_display_perf_report(void); -/* the 604* have 2, the G3* have 4, the G4s have 6, - and the G5 are completely different (they MUST use - ARCH_PPC64, and let's hope all future 64 bis PPC - will use the same PMCs... */ -#define POWERPC_NUM_PMC_ENABLED 6 -/* if you add to the enum below, also add to the perfname array - in dsputil_ppc.c */ -enum powerpc_perf_index { - altivec_fft_num = 0, - altivec_gmc1_num, - altivec_dct_unquantize_h263_num, - altivec_fdct, - altivec_idct_add_num, - altivec_idct_put_num, - altivec_put_pixels16_num, - altivec_avg_pixels16_num, - altivec_avg_pixels8_num, - altivec_put_pixels8_xy2_num, - altivec_put_no_rnd_pixels8_xy2_num, - altivec_put_pixels16_xy2_num, - altivec_put_no_rnd_pixels16_xy2_num, - altivec_hadamard8_diff8x8_num, - altivec_hadamard8_diff16_num, - altivec_avg_pixels8_xy2_num, - powerpc_clear_blocks_dcbz32, - powerpc_clear_blocks_dcbz128, - altivec_put_h264_chroma_mc8_num, - altivec_avg_h264_chroma_mc8_num, - altivec_put_h264_qpel16_h_lowpass_num, - altivec_avg_h264_qpel16_h_lowpass_num, - altivec_put_h264_qpel16_v_lowpass_num, - altivec_avg_h264_qpel16_v_lowpass_num, - altivec_put_h264_qpel16_hv_lowpass_num, - altivec_avg_h264_qpel16_hv_lowpass_num, - powerpc_perf_total -}; -enum powerpc_data_index { - powerpc_data_min = 0, - powerpc_data_max, - powerpc_data_sum, - powerpc_data_num, - powerpc_data_total -}; -extern unsigned 
long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; - -#if !ARCH_PPC64 -#define POWERP_PMC_DATATYPE unsigned long -#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 937" : "=r" (a)) -#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 938" : "=r" (a)) -#if (POWERPC_NUM_PMC_ENABLED > 2) -#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 941" : "=r" (a)) -#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 942" : "=r" (a)) -#else -#define POWERPC_GET_PMC3(a) do {} while (0) -#define POWERPC_GET_PMC4(a) do {} while (0) -#endif -#if (POWERPC_NUM_PMC_ENABLED > 4) -#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 929" : "=r" (a)) -#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 930" : "=r" (a)) -#else -#define POWERPC_GET_PMC5(a) do {} while (0) -#define POWERPC_GET_PMC6(a) do {} while (0) -#endif -#else /* ARCH_PPC64 */ -#define POWERP_PMC_DATATYPE unsigned long long -#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 771" : "=r" (a)) -#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 772" : "=r" (a)) -#if (POWERPC_NUM_PMC_ENABLED > 2) -#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 773" : "=r" (a)) -#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 774" : "=r" (a)) -#else -#define POWERPC_GET_PMC3(a) do {} while (0) -#define POWERPC_GET_PMC4(a) do {} while (0) -#endif -#if (POWERPC_NUM_PMC_ENABLED > 4) -#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 775" : "=r" (a)) -#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 776" : "=r" (a)) -#else -#define POWERPC_GET_PMC5(a) do {} while (0) -#define POWERPC_GET_PMC6(a) do {} while (0) -#endif -#endif /* ARCH_PPC64 */ -#define POWERPC_PERF_DECLARE(a, cond) \ - POWERP_PMC_DATATYPE \ - pmc_start[POWERPC_NUM_PMC_ENABLED], \ - pmc_stop[POWERPC_NUM_PMC_ENABLED], \ - pmc_loop_index; -#define POWERPC_PERF_START_COUNT(a, cond) do { \ - POWERPC_GET_PMC6(pmc_start[5]); \ - POWERPC_GET_PMC5(pmc_start[4]); \ - 
POWERPC_GET_PMC4(pmc_start[3]); \ - POWERPC_GET_PMC3(pmc_start[2]); \ - POWERPC_GET_PMC2(pmc_start[1]); \ - POWERPC_GET_PMC1(pmc_start[0]); \ - } while (0) -#define POWERPC_PERF_STOP_COUNT(a, cond) do { \ - POWERPC_GET_PMC1(pmc_stop[0]); \ - POWERPC_GET_PMC2(pmc_stop[1]); \ - POWERPC_GET_PMC3(pmc_stop[2]); \ - POWERPC_GET_PMC4(pmc_stop[3]); \ - POWERPC_GET_PMC5(pmc_stop[4]); \ - POWERPC_GET_PMC6(pmc_stop[5]); \ - if (cond) { \ - for(pmc_loop_index = 0; \ - pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ - pmc_loop_index++) { \ - if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) { \ - POWERP_PMC_DATATYPE diff = \ - pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ - if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ - perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ - if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ - perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ - perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ - perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ - } \ - } \ - } \ -} while (0) -#else /* CONFIG_POWERPC_PERF */ -// those are needed to avoid empty statements. -#define POWERPC_PERF_DECLARE(a, cond) int altivec_placeholder __attribute__ ((unused)) -#define POWERPC_PERF_START_COUNT(a, cond) do {} while (0) -#define POWERPC_PERF_STOP_COUNT(a, cond) do {} while (0) -#endif /* CONFIG_POWERPC_PERF */ - -#endif /* AVCODEC_PPC_DSPUTIL_PPC_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/h264_altivec.c --- a/ffmpeg_smp/h264dec/libavcodec/ppc/h264_altivec.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1021 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" -#include "libavcodec/h264_data.h" -#include "libavcodec/h264_dsp.h" - -#include "dsputil_ppc.h" -#include "dsputil_altivec.h" -#include "util_altivec.h" -#include "types_altivec.h" - -#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s -#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) - -#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC -#define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec -#define PREFIX_no_rnd_vc1_chroma_mc8_altivec put_no_rnd_vc1_chroma_mc8_altivec -#define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num -#define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec -#define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num -#define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec -#define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num -#define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec -#define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num -#include "h264_template_altivec.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_chroma_mc8_altivec -#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec -#undef PREFIX_h264_chroma_mc8_num -#undef 
PREFIX_h264_qpel16_h_lowpass_altivec -#undef PREFIX_h264_qpel16_h_lowpass_num -#undef PREFIX_h264_qpel16_v_lowpass_altivec -#undef PREFIX_h264_qpel16_v_lowpass_num -#undef PREFIX_h264_qpel16_hv_lowpass_altivec -#undef PREFIX_h264_qpel16_hv_lowpass_num - -#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC -#define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec -#define PREFIX_no_rnd_vc1_chroma_mc8_altivec avg_no_rnd_vc1_chroma_mc8_altivec -#define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num -#define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec -#define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num -#define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec -#define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num -#define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec -#define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num -#include "h264_template_altivec.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_chroma_mc8_altivec -#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec -#undef PREFIX_h264_chroma_mc8_num -#undef PREFIX_h264_qpel16_h_lowpass_altivec -#undef PREFIX_h264_qpel16_h_lowpass_num -#undef PREFIX_h264_qpel16_v_lowpass_altivec -#undef PREFIX_h264_qpel16_v_lowpass_num -#undef PREFIX_h264_qpel16_hv_lowpass_altivec -#undef PREFIX_h264_qpel16_hv_lowpass_num - -#define H264_MC(OPNAME, SIZE, CODETYPE) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \ - DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ -}\ -\ -static 
void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - 
DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ - OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, 
halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ - DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ -}\ - -static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, - const uint8_t * src2, int dst_stride, - int src_stride1, int h) -{ - int i; - vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align; - - mask_ = vec_lvsl(0, src2); - - for (i = 0; i < h; i++) { - - tmp1 = vec_ld(i * src_stride1, src1); - mask = vec_lvsl(i * src_stride1, src1); - tmp2 
= vec_ld(i * src_stride1 + 15, src1); - - a = vec_perm(tmp1, tmp2, mask); - - tmp1 = vec_ld(i * 16, src2); - tmp2 = vec_ld(i * 16 + 15, src2); - - b = vec_perm(tmp1, tmp2, mask_); - - tmp1 = vec_ld(0, dst); - mask = vec_lvsl(0, dst); - tmp2 = vec_ld(15, dst); - - d = vec_avg(a, b); - - edges = vec_perm(tmp2, tmp1, mask); - - align = vec_lvsr(0, dst); - - tmp2 = vec_perm(d, edges, align); - tmp1 = vec_perm(edges, d, align); - - vec_st(tmp2, 15, dst); - vec_st(tmp1, 0 , dst); - - dst += dst_stride; - } -} - -static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, - const uint8_t * src2, int dst_stride, - int src_stride1, int h) -{ - int i; - vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align; - - mask_ = vec_lvsl(0, src2); - - for (i = 0; i < h; i++) { - - tmp1 = vec_ld(i * src_stride1, src1); - mask = vec_lvsl(i * src_stride1, src1); - tmp2 = vec_ld(i * src_stride1 + 15, src1); - - a = vec_perm(tmp1, tmp2, mask); - - tmp1 = vec_ld(i * 16, src2); - tmp2 = vec_ld(i * 16 + 15, src2); - - b = vec_perm(tmp1, tmp2, mask_); - - tmp1 = vec_ld(0, dst); - mask = vec_lvsl(0, dst); - tmp2 = vec_ld(15, dst); - - d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b)); - - edges = vec_perm(tmp2, tmp1, mask); - - align = vec_lvsr(0, dst); - - tmp2 = vec_perm(d, edges, align); - tmp1 = vec_perm(edges, d, align); - - vec_st(tmp2, 15, dst); - vec_st(tmp1, 0 , dst); - - dst += dst_stride; - } -} - -/* Implemented but could be faster -#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h) -#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h) - */ - -H264_MC(put_, 16, altivec) -H264_MC(avg_, 16, altivec) - - -/**************************************************************************** - * IDCT transform: - ****************************************************************************/ - -#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \ - /* 1st stage */ \ - vz0 = vec_add(vb0,vb2); /* temp[0] 
= Y[0] + Y[2] */ \ - vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \ - vz2 = vec_sra(vb1,vec_splat_u16(1)); \ - vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \ - vz3 = vec_sra(vb3,vec_splat_u16(1)); \ - vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \ - /* 2nd stage: output */ \ - va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \ - va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \ - va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \ - va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3] */ - -#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \ - b0 = vec_mergeh( a0, a0 ); \ - b1 = vec_mergeh( a1, a0 ); \ - b2 = vec_mergeh( a2, a0 ); \ - b3 = vec_mergeh( a3, a0 ); \ - a0 = vec_mergeh( b0, b2 ); \ - a1 = vec_mergel( b0, b2 ); \ - a2 = vec_mergeh( b1, b3 ); \ - a3 = vec_mergel( b1, b3 ); \ - b0 = vec_mergeh( a0, a2 ); \ - b1 = vec_mergel( a0, a2 ); \ - b2 = vec_mergeh( a1, a3 ); \ - b3 = vec_mergel( a1, a3 ) - -#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \ - vdst_orig = vec_ld(0, dst); \ - vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \ - vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst); \ - va = vec_add(va, vdst_ss); \ - va_u8 = vec_packsu(va, zero_s16v); \ - va_u32 = vec_splat((vec_u32)va_u8, 0); \ - vec_ste(va_u32, element, (uint32_t*)dst); - -static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride) -{ - vec_s16 va0, va1, va2, va3; - vec_s16 vz0, vz1, vz2, vz3; - vec_s16 vtmp0, vtmp1, vtmp2, vtmp3; - vec_u8 va_u8; - vec_u32 va_u32; - vec_s16 vdst_ss; - const vec_u16 v6us = vec_splat_u16(6); - vec_u8 vdst, vdst_orig; - vec_u8 vdst_mask = vec_lvsl(0, dst); - int element = ((unsigned long)dst & 0xf) >> 2; - LOAD_ZERO; - - block[0] += 32; /* add 32 as a DC-level for rounding */ - - vtmp0 = vec_ld(0,block); - vtmp1 = vec_sld(vtmp0, vtmp0, 8); - vtmp2 = vec_ld(16,block); - vtmp3 = vec_sld(vtmp2, vtmp2, 8); - - VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); - 
VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); - VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); - - va0 = vec_sra(va0,v6us); - va1 = vec_sra(va1,v6us); - va2 = vec_sra(va2,v6us); - va3 = vec_sra(va3,v6us); - - VEC_LOAD_U8_ADD_S16_STORE_U8(va0); - dst += stride; - VEC_LOAD_U8_ADD_S16_STORE_U8(va1); - dst += stride; - VEC_LOAD_U8_ADD_S16_STORE_U8(va2); - dst += stride; - VEC_LOAD_U8_ADD_S16_STORE_U8(va3); -} - -#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ - /* a0 = SRC(0) + SRC(4); */ \ - vec_s16 a0v = vec_add(s0, s4); \ - /* a2 = SRC(0) - SRC(4); */ \ - vec_s16 a2v = vec_sub(s0, s4); \ - /* a4 = (SRC(2)>>1) - SRC(6); */ \ - vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6); \ - /* a6 = (SRC(6)>>1) + SRC(2); */ \ - vec_s16 a6v = vec_add(vec_sra(s6, onev), s2); \ - /* b0 = a0 + a6; */ \ - vec_s16 b0v = vec_add(a0v, a6v); \ - /* b2 = a2 + a4; */ \ - vec_s16 b2v = vec_add(a2v, a4v); \ - /* b4 = a2 - a4; */ \ - vec_s16 b4v = vec_sub(a2v, a4v); \ - /* b6 = a0 - a6; */ \ - vec_s16 b6v = vec_sub(a0v, a6v); \ - /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ - /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ - vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \ - /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ - /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ - vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ - /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ - /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ - vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ - /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ - vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ - /* b1 = (a7>>2) + a1; */ \ - vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \ - /* b3 = a3 + (a5>>2); */ \ - vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \ - /* b5 = (a3>>2) - a5; */ \ - vec_s16 b5v = vec_sub( vec_sra(a3v, 
twov), a5v); \ - /* b7 = a7 - (a1>>2); */ \ - vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ - /* DST(0, b0 + b7); */ \ - d0 = vec_add(b0v, b7v); \ - /* DST(1, b2 + b5); */ \ - d1 = vec_add(b2v, b5v); \ - /* DST(2, b4 + b3); */ \ - d2 = vec_add(b4v, b3v); \ - /* DST(3, b6 + b1); */ \ - d3 = vec_add(b6v, b1v); \ - /* DST(4, b6 - b1); */ \ - d4 = vec_sub(b6v, b1v); \ - /* DST(5, b4 - b3); */ \ - d5 = vec_sub(b4v, b3v); \ - /* DST(6, b2 - b5); */ \ - d6 = vec_sub(b2v, b5v); \ - /* DST(7, b0 - b7); */ \ - d7 = vec_sub(b0v, b7v); \ -} - -#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \ - /* unaligned load */ \ - vec_u8 hv = vec_ld( 0, dest ); \ - vec_u8 lv = vec_ld( 7, dest ); \ - vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv ); \ - vec_s16 idct_sh6 = vec_sra(idctv, sixv); \ - vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv); \ - vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \ - vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \ - vec_u8 edgehv; \ - /* unaligned store */ \ - vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\ - vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \ - lv = vec_sel( lv, bodyv, edgelv ); \ - vec_st( lv, 7, dest ); \ - hv = vec_ld( 0, dest ); \ - edgehv = vec_perm( zero_u8v, sel, perm_stv ); \ - hv = vec_sel( hv, bodyv, edgehv ); \ - vec_st( hv, 0, dest ); \ - } - -static void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { - vec_s16 s0, s1, s2, s3, s4, s5, s6, s7; - vec_s16 d0, d1, d2, d3, d4, d5, d6, d7; - vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7; - - vec_u8 perm_ldv = vec_lvsl(0, dst); - vec_u8 perm_stv = vec_lvsr(8, dst); - - const vec_u16 onev = vec_splat_u16(1); - const vec_u16 twov = vec_splat_u16(2); - const vec_u16 sixv = vec_splat_u16(6); - - const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1}; - LOAD_ZERO; - - dct[0] += 32; // rounding for the >>6 at the end - - s0 = vec_ld(0x00, (int16_t*)dct); - s1 = vec_ld(0x10, 
(int16_t*)dct); - s2 = vec_ld(0x20, (int16_t*)dct); - s3 = vec_ld(0x30, (int16_t*)dct); - s4 = vec_ld(0x40, (int16_t*)dct); - s5 = vec_ld(0x50, (int16_t*)dct); - s6 = vec_ld(0x60, (int16_t*)dct); - s7 = vec_ld(0x70, (int16_t*)dct); - - IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, - d0, d1, d2, d3, d4, d5, d6, d7); - - TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 ); - - IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7, - idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7); - - ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); -} - -static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size) -{ - vec_s16 dc16; - vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner; - LOAD_ZERO; - DECLARE_ALIGNED(16, int, dc); - int i; - - dc = (block[0] + 32) >> 6; - dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); - - if (size == 4) - dc16 = vec_sld(dc16, zero_s16v, 8); - dcplus = vec_packsu(dc16, zero_s16v); - dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v); - - aligner = vec_lvsr(0, dst); - dcplus = vec_perm(dcplus, dcplus, aligner); - dcminus = vec_perm(dcminus, dcminus, aligner); - - for (i = 0; i < size; i += 4) { - v0 = vec_ld(0, dst+0*stride); - v1 = vec_ld(0, dst+1*stride); - v2 = vec_ld(0, dst+2*stride); - v3 = vec_ld(0, dst+3*stride); - - v0 = vec_adds(v0, dcplus); - v1 = vec_adds(v1, dcplus); - v2 = vec_adds(v2, dcplus); - v3 = vec_adds(v3, dcplus); - - v0 = vec_subs(v0, dcminus); - v1 = 
vec_subs(v1, dcminus); - v2 = vec_subs(v2, dcminus); - v3 = vec_subs(v3, dcminus); - - vec_st(v0, 0, dst+0*stride); - vec_st(v1, 0, dst+1*stride); - vec_st(v2, 0, dst+2*stride); - vec_st(v3, 0, dst+3*stride); - - dst += 4*stride; - } -} - -static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) -{ - h264_idct_dc_add_internal(dst, block, stride, 4); -} - -static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) -{ - h264_idct_dc_add_internal(dst, block, stride, 8); -} - -static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - if(nnzc[ scan8[i] ]) ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride); - else if(block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride); - } -} - -static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=4){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_altivec(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct8_add_altivec (dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=16; i<16+8; i++){ - if(nnzc[ scan8[i] ]) - ff_h264_idct_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - else if(block[i*16]) - 
h264_idct_dc_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - } -} - -#define transpose4x16(r0, r1, r2, r3) { \ - register vec_u8 r4; \ - register vec_u8 r5; \ - register vec_u8 r6; \ - register vec_u8 r7; \ - \ - r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ - r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ - r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ - r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ - \ - r0 = vec_mergeh(r4, r6); /*all set 0*/ \ - r1 = vec_mergel(r4, r6); /*all set 1*/ \ - r2 = vec_mergeh(r5, r7); /*all set 2*/ \ - r3 = vec_mergel(r5, r7); /*all set 3*/ \ -} - -static inline void write16x4(uint8_t *dst, int dst_stride, - register vec_u8 r0, register vec_u8 r1, - register vec_u8 r2, register vec_u8 r3) { - DECLARE_ALIGNED(16, unsigned char, result)[64]; - uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; - int int_dst_stride = dst_stride/4; - - vec_st(r0, 0, result); - vec_st(r1, 16, result); - vec_st(r2, 32, result); - vec_st(r3, 48, result); - /* FIXME: there has to be a better way!!!! 
*/ - *dst_int = *src_int; - *(dst_int+ int_dst_stride) = *(src_int + 1); - *(dst_int+ 2*int_dst_stride) = *(src_int + 2); - *(dst_int+ 3*int_dst_stride) = *(src_int + 3); - *(dst_int+ 4*int_dst_stride) = *(src_int + 4); - *(dst_int+ 5*int_dst_stride) = *(src_int + 5); - *(dst_int+ 6*int_dst_stride) = *(src_int + 6); - *(dst_int+ 7*int_dst_stride) = *(src_int + 7); - *(dst_int+ 8*int_dst_stride) = *(src_int + 8); - *(dst_int+ 9*int_dst_stride) = *(src_int + 9); - *(dst_int+10*int_dst_stride) = *(src_int + 10); - *(dst_int+11*int_dst_stride) = *(src_int + 11); - *(dst_int+12*int_dst_stride) = *(src_int + 12); - *(dst_int+13*int_dst_stride) = *(src_int + 13); - *(dst_int+14*int_dst_stride) = *(src_int + 14); - *(dst_int+15*int_dst_stride) = *(src_int + 15); -} - -/** \brief performs a 6x16 transpose of data in src, and stores it to dst - \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing - out of unaligned_load() */ -#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ - register vec_u8 r0 = unaligned_load(0, src); \ - register vec_u8 r1 = unaligned_load( src_stride, src); \ - register vec_u8 r2 = unaligned_load(2* src_stride, src); \ - register vec_u8 r3 = unaligned_load(3* src_stride, src); \ - register vec_u8 r4 = unaligned_load(4* src_stride, src); \ - register vec_u8 r5 = unaligned_load(5* src_stride, src); \ - register vec_u8 r6 = unaligned_load(6* src_stride, src); \ - register vec_u8 r7 = unaligned_load(7* src_stride, src); \ - register vec_u8 r14 = unaligned_load(14*src_stride, src); \ - register vec_u8 r15 = unaligned_load(15*src_stride, src); \ - \ - r8 = unaligned_load( 8*src_stride, src); \ - r9 = unaligned_load( 9*src_stride, src); \ - r10 = unaligned_load(10*src_stride, src); \ - r11 = unaligned_load(11*src_stride, src); \ - r12 = unaligned_load(12*src_stride, src); \ - r13 = unaligned_load(13*src_stride, src); \ - \ - /*Merge first pairs*/ \ - r0 = vec_mergeh(r0, r8); /*0, 8*/ \ - r1 = vec_mergeh(r1, r9); 
/*1, 9*/ \ - r2 = vec_mergeh(r2, r10); /*2,10*/ \ - r3 = vec_mergeh(r3, r11); /*3,11*/ \ - r4 = vec_mergeh(r4, r12); /*4,12*/ \ - r5 = vec_mergeh(r5, r13); /*5,13*/ \ - r6 = vec_mergeh(r6, r14); /*6,14*/ \ - r7 = vec_mergeh(r7, r15); /*7,15*/ \ - \ - /*Merge second pairs*/ \ - r8 = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \ - r9 = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \ - r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \ - r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \ - r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \ - r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \ - r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \ - r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \ - \ - /*Third merge*/ \ - r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \ - r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \ - r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \ - r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \ - r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \ - r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \ - /* Don't need to compute 3 and 7*/ \ - \ - /*Final merge*/ \ - r8 = vec_mergeh(r0, r4); /*all set 0*/ \ - r9 = vec_mergel(r0, r4); /*all set 1*/ \ - r10 = vec_mergeh(r1, r5); /*all set 2*/ \ - r11 = vec_mergel(r1, r5); /*all set 3*/ \ - r12 = vec_mergeh(r2, r6); /*all set 4*/ \ - r13 = vec_mergel(r2, r6); /*all set 5*/ \ - /* Don't need to compute 14 and 15*/ \ - \ -} - -// out: o = |x-y| < a -static inline vec_u8 diff_lt_altivec ( register vec_u8 x, - register vec_u8 y, - register vec_u8 a) { - - register vec_u8 diff = vec_subs(x, y); - register vec_u8 diffneg = vec_subs(y, x); - register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */ - o = (vec_u8)vec_cmplt(o, a); - return o; -} - -static inline vec_u8 h264_deblock_mask ( register vec_u8 p0, - register vec_u8 p1, - register vec_u8 q0, - register vec_u8 q1, - register vec_u8 alpha, - register vec_u8 beta) { - - register vec_u8 mask; - register vec_u8 tempmask; - - mask = 
diff_lt_altivec(p0, q0, alpha); - tempmask = diff_lt_altivec(p1, p0, beta); - mask = vec_and(mask, tempmask); - tempmask = diff_lt_altivec(q1, q0, beta); - mask = vec_and(mask, tempmask); - - return mask; -} - -// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) -static inline vec_u8 h264_deblock_q1(register vec_u8 p0, - register vec_u8 p1, - register vec_u8 p2, - register vec_u8 q0, - register vec_u8 tc0) { - - register vec_u8 average = vec_avg(p0, q0); - register vec_u8 temp; - register vec_u8 uncliped; - register vec_u8 ones; - register vec_u8 max; - register vec_u8 min; - register vec_u8 newp1; - - temp = vec_xor(average, p2); - average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ - ones = vec_splat_u8(1); - temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ - uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ - max = vec_adds(p1, tc0); - min = vec_subs(p1, tc0); - newp1 = vec_max(min, uncliped); - newp1 = vec_min(max, newp1); - return newp1; -} - -#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ - \ - const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ - \ - register vec_u8 pq0bit = vec_xor(p0,q0); \ - register vec_u8 q1minus; \ - register vec_u8 p0minus; \ - register vec_u8 stage1; \ - register vec_u8 stage2; \ - register vec_u8 vec160; \ - register vec_u8 delta; \ - register vec_u8 deltaneg; \ - \ - q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ - stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ - stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ - p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ - stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \ - pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \ - stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \ - stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \ - vec160 = vec_ld(0, &A0v); \ - deltaneg = vec_subs(vec160, stage2); /* -d */ \ - 
delta = vec_subs(stage2, vec160); /* d */ \ - deltaneg = vec_min(tc0masked, deltaneg); \ - delta = vec_min(tc0masked, delta); \ - p0 = vec_subs(p0, deltaneg); \ - q0 = vec_subs(q0, delta); \ - p0 = vec_adds(p0, delta); \ - q0 = vec_adds(q0, deltaneg); \ -} - -#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ - DECLARE_ALIGNED(16, unsigned char, temp)[16]; \ - register vec_u8 alphavec; \ - register vec_u8 betavec; \ - register vec_u8 mask; \ - register vec_u8 p1mask; \ - register vec_u8 q1mask; \ - register vector signed char tc0vec; \ - register vec_u8 finaltc0; \ - register vec_u8 tc0masked; \ - register vec_u8 newp1; \ - register vec_u8 newq1; \ - \ - temp[0] = alpha; \ - temp[1] = beta; \ - alphavec = vec_ld(0, temp); \ - betavec = vec_splat(alphavec, 0x1); \ - alphavec = vec_splat(alphavec, 0x0); \ - mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \ - \ - *((int *)temp) = *((int *)tc0); \ - tc0vec = vec_ld(0, (signed char*)temp); \ - tc0vec = vec_mergeh(tc0vec, tc0vec); \ - tc0vec = vec_mergeh(tc0vec, tc0vec); \ - mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ - finaltc0 = vec_and((vec_u8)tc0vec, mask); /* tc = tc0 */ \ - \ - p1mask = diff_lt_altivec(p2, p0, betavec); \ - p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \ - tc0masked = vec_and(p1mask, (vec_u8)tc0vec); \ - finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ - newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ - /*end if*/ \ - \ - q1mask = diff_lt_altivec(q2, q0, betavec); \ - q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ - tc0masked = vec_and(q1mask, (vec_u8)tc0vec); \ - finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ - newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ - /*end if*/ \ - \ - h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ - p1 = newp1; \ - q1 = newq1; \ -} - -static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int 
beta, int8_t *tc0) { - - if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { - register vec_u8 p2 = vec_ld(-3*stride, pix); - register vec_u8 p1 = vec_ld(-2*stride, pix); - register vec_u8 p0 = vec_ld(-1*stride, pix); - register vec_u8 q0 = vec_ld(0, pix); - register vec_u8 q1 = vec_ld(stride, pix); - register vec_u8 q2 = vec_ld(2*stride, pix); - h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); - vec_st(p1, -2*stride, pix); - vec_st(p0, -1*stride, pix); - vec_st(q0, 0, pix); - vec_st(q1, stride, pix); - } -} - -static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - - register vec_u8 line0, line1, line2, line3, line4, line5; - if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) - return; - readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); - h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); - transpose4x16(line1, line2, line3, line4); - write16x4(pix-2, stride, line1, line2, line3, line4); -} - -static av_always_inline -void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h) -{ - int y, aligned; - vec_u8 vblock; - vec_s16 vtemp, vweight, voffset, v0, v1; - vec_u16 vlog2_denom; - DECLARE_ALIGNED(16, int32_t, temp)[4]; - LOAD_ZERO; - - offset <<= log2_denom; - if(log2_denom) offset += 1<<(log2_denom-1); - temp[0] = log2_denom; - temp[1] = weight; - temp[2] = offset; - - vtemp = (vec_s16)vec_ld(0, temp); - vlog2_denom = (vec_u16)vec_splat(vtemp, 1); - vweight = vec_splat(vtemp, 3); - voffset = vec_splat(vtemp, 5); - aligned = !((unsigned long)block & 0xf); - - for (y=0; yput_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; - c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; - -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ - c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \ - c->PFX ## 
_pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \ - c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \ - c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \ - c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \ - c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \ - c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \ - c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \ - c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \ - c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \ - c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \ - c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \ - c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \ - c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \ - c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec - - dspfunc(put_h264_qpel, 0, 16); - dspfunc(avg_h264_qpel, 0, 16); -#undef dspfunc -} - -void ff_h264dsp_init_ppc(H264DSPContext *c){ - c->h264_idct_dc_add= h264_idct_dc_add_altivec; - c->h264_idct_add = ff_h264_idct_add_altivec; - c->h264_idct_add8 = ff_h264_idct_add8_altivec; - c->h264_idct_add16 = ff_h264_idct_add16_altivec; - c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec; - - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec; - c->h264_idct8_add = ff_h264_idct8_add_altivec; - c->h264_idct8_add4 = ff_h264_idct8_add4_altivec; - c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; - c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; - - c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec; - c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec; - c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec; - c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec; - c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec; - c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec; - 
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec; - c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec; - c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec; - c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/h264_template_altivec.c --- a/ffmpeg_smp/h264dec/libavcodec/ppc/h264_template_altivec.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,783 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -//#define DEBUG_ALIGNMENT -#ifdef DEBUG_ALIGNMENT -#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F)); -#else -#define ASSERT_ALIGNED(ptr) ; -#endif - -/* this code assume that stride % 16 == 0 */ - -#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \ - vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\ - vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\ -\ - psum = vec_mladd(vA, vsrc0ssH, BIAS1);\ - psum = vec_mladd(vB, vsrc1ssH, psum);\ - psum = vec_mladd(vC, vsrc2ssH, psum);\ - psum = vec_mladd(vD, vsrc3ssH, psum);\ - psum = BIAS2(psum);\ - psum = vec_sr(psum, v6us);\ -\ - vdst = vec_ld(0, dst);\ - ppsum = (vec_u8)vec_pack(psum, psum);\ - vfdst = vec_perm(vdst, ppsum, fperm);\ -\ - OP_U8_ALTIVEC(fsum, vfdst, vdst);\ -\ - vec_st(fsum, 0, dst);\ -\ - vsrc0ssH = vsrc2ssH;\ - vsrc1ssH = vsrc3ssH;\ -\ - dst += stride;\ - src += stride; - -#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ -\ - vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\ - vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\ -\ - psum = vec_mladd(vA, vsrc0ssH, v32ss);\ - psum = vec_mladd(vE, vsrc1ssH, psum);\ - psum = vec_sr(psum, v6us);\ -\ - vdst = vec_ld(0, dst);\ - ppsum = (vec_u8)vec_pack(psum, psum);\ - vfdst = vec_perm(vdst, ppsum, fperm);\ -\ - OP_U8_ALTIVEC(fsum, vfdst, vdst);\ -\ - vec_st(fsum, 0, dst);\ -\ - dst += stride;\ - src += stride; - -#define noop(a) a -#define add28(a) vec_add(v28ss, a) - -static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, - int stride, int h, int x, int y) { - POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); - DECLARE_ALIGNED(16, signed int, ABCD)[4] = - {((8 - x) * (8 - y)), - (( x) * (8 - y)), - ((8 - x) * ( y)), - (( x) * ( y))}; - register int i; - vec_u8 fperm; - const 
vec_s32 vABCD = vec_ld(0, ABCD); - const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); - const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); - const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); - const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); - LOAD_ZERO; - const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); - const vec_u16 v6us = vec_splat_u16(6); - register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; - register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; - - vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; - vec_u8 vsrc0uc, vsrc1uc; - vec_s16 vsrc0ssH, vsrc1ssH; - vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; - vec_s16 vsrc2ssH, vsrc3ssH, psum; - vec_u8 vdst, ppsum, vfdst, fsum; - - POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); - - if (((unsigned long)dst) % 16 == 0) { - fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F}; - } else { - fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F}; - } - - vsrcAuc = vec_ld(0, src); - - if (loadSecond) - vsrcBuc = vec_ld(16, src); - vsrcperm0 = vec_lvsl(0, src); - vsrcperm1 = vec_lvsl(1, src); - - vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcBuc; - else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - - vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc); - vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc); - - if (ABCD[3]) { - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 16, src); - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - 
vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) - } - } - } else { - const vec_s16 vE = vec_add(vB, vC); - if (ABCD[2]) { // x == 0 B == 0 - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - - vsrc0uc = vsrc1uc; - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 15, src); - vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - - vsrc0uc = vsrc1uc; - } - } - } else { // y == 0 C == 0 - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(0, src); - vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(0, src); - vsrcDuc = vec_ld(15, src); - vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcDuc; - else - vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - } - } - } - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); -} - -/* this code assume that stride % 16 == 0 */ -static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { - DECLARE_ALIGNED(16, signed int, ABCD)[4] = - {((8 - x) * (8 - y)), - (( x) * (8 - y)), - ((8 - x) * ( y)), - (( x) * ( y))}; - register int i; - vec_u8 fperm; - const vec_s32 vABCD = vec_ld(0, ABCD); - const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); - const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); - const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); - const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); - LOAD_ZERO; - const vec_s16 v28ss = 
vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); - const vec_u16 v6us = vec_splat_u16(6); - register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; - register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; - - vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; - vec_u8 vsrc0uc, vsrc1uc; - vec_s16 vsrc0ssH, vsrc1ssH; - vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; - vec_s16 vsrc2ssH, vsrc3ssH, psum; - vec_u8 vdst, ppsum, vfdst, fsum; - - if (((unsigned long)dst) % 16 == 0) { - fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F}; - } else { - fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F}; - } - - vsrcAuc = vec_ld(0, src); - - if (loadSecond) - vsrcBuc = vec_ld(16, src); - vsrcperm0 = vec_lvsl(0, src); - vsrcperm1 = vec_lvsl(1, src); - - vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcBuc; - else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - - vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc); - vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc); - - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - - - vsrcCuc = vec_ld(stride + 0, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) - } - } else { - vec_u8 vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 16, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) - } - } -} - -#undef noop -#undef add28 -#undef CHROMA_MC8_ALTIVEC_CORE - -/* this code assume stride % 16 == 0 */ -static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, 
uint8_t * src, int dstStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); - register int i; - - LOAD_ZERO; - const vec_u8 permM2 = vec_lvsl(-2, src); - const vec_u8 permM1 = vec_lvsl(-1, src); - const vec_u8 permP0 = vec_lvsl(+0, src); - const vec_u8 permP1 = vec_lvsl(+1, src); - const vec_u8 permP2 = vec_lvsl(+2, src); - const vec_u8 permP3 = vec_lvsl(+3, src); - const vec_s16 v5ss = vec_splat_s16(5); - const vec_u16 v5us = vec_splat_u16(5); - const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - - vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - - register int align = ((((unsigned long)src) - 2) % 16); - - vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, - srcP2A, srcP2B, srcP3A, srcP3B, - srcM1A, srcM1B, srcM2A, srcM2B, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, - pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, - psumA, psumB, sumA, sumB; - - vec_u8 sum, vdst, fsum; - - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); - - for (i = 0 ; i < 16 ; i ++) { - vec_u8 srcR1 = vec_ld(-2, src); - vec_u8 srcR2 = vec_ld(14, src); - - switch (align) { - default: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = vec_perm(srcR1, srcR2, permP3); - } break; - case 11: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = srcR2; - } break; - case 12: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = srcR2; - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 13: { - 
vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = srcR2; - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 14: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = srcR2; - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 15: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = srcR2; - srcP0 = vec_perm(srcR2, srcR3, permP0); - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - } - - srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); - srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); - srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); - srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); - - srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); - srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); - srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); - srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); - - srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); - srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); - srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); - srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); - - sum1A = vec_adds(srcP0A, srcP1A); - sum1B = vec_adds(srcP0B, srcP1B); - sum2A = vec_adds(srcM1A, srcP2A); - sum2B = vec_adds(srcM1B, srcP2B); - sum3A = vec_adds(srcM2A, srcP3A); - sum3B = vec_adds(srcM2B, srcP3B); - - pp1A = vec_mladd(sum1A, v20ss, v16ss); - pp1B = vec_mladd(sum1B, v20ss, v16ss); - - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - - pp3A = vec_add(sum3A, pp1A); - pp3B = vec_add(sum3B, pp1B); - - psumA = vec_sub(pp3A, pp2A); - psumB = vec_sub(pp3B, pp2B); - - sumA = vec_sra(psumA, 
v5us); - sumB = vec_sra(psumB, v5us); - - sum = vec_packsu(sumA, sumB); - - ASSERT_ALIGNED(dst); - vdst = vec_ld(0, dst); - - OP_U8_ALTIVEC(fsum, sum, vdst); - - vec_st(fsum, 0, dst); - - src += srcStride; - dst += dstStride; - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); -} - -/* this code assume stride % 16 == 0 */ -static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); - - register int i; - - LOAD_ZERO; - const vec_u8 perm = vec_lvsl(0, src); - const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_u16 v5us = vec_splat_u16(5); - const vec_s16 v5ss = vec_splat_s16(5); - const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - - uint8_t *srcbis = src - (srcStride * 2); - - const vec_u8 srcM2a = vec_ld(0, srcbis); - const vec_u8 srcM2b = vec_ld(16, srcbis); - const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm); - //srcbis += srcStride; - const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride); - const vec_u8 srcM1b = vec_ld(16, srcbis); - const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm); - //srcbis += srcStride; - const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride); - const vec_u8 srcP0b = vec_ld(16, srcbis); - const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm); - //srcbis += srcStride; - const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride); - const vec_u8 srcP1b = vec_ld(16, srcbis); - const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm); - //srcbis += srcStride; - const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride); - const vec_u8 srcP2b = vec_ld(16, srcbis); - const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm); - //srcbis += srcStride; - - vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2); - vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2); - vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1); - vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1); - vec_s16 srcP0ssA 
= (vec_s16) vec_mergeh(zero_u8v, srcP0); - vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0); - vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1); - vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1); - vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2); - vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2); - - vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, - psumA, psumB, sumA, sumB, - srcP3ssA, srcP3ssB, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; - - vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3; - - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); - - for (i = 0 ; i < 16 ; i++) { - srcP3a = vec_ld(0, srcbis += srcStride); - srcP3b = vec_ld(16, srcbis); - srcP3 = vec_perm(srcP3a, srcP3b, perm); - srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3); - srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3); - //srcbis += srcStride; - - sum1A = vec_adds(srcP0ssA, srcP1ssA); - sum1B = vec_adds(srcP0ssB, srcP1ssB); - sum2A = vec_adds(srcM1ssA, srcP2ssA); - sum2B = vec_adds(srcM1ssB, srcP2ssB); - sum3A = vec_adds(srcM2ssA, srcP3ssA); - sum3B = vec_adds(srcM2ssB, srcP3ssB); - - srcM2ssA = srcM1ssA; - srcM2ssB = srcM1ssB; - srcM1ssA = srcP0ssA; - srcM1ssB = srcP0ssB; - srcP0ssA = srcP1ssA; - srcP0ssB = srcP1ssB; - srcP1ssA = srcP2ssA; - srcP1ssB = srcP2ssB; - srcP2ssA = srcP3ssA; - srcP2ssB = srcP3ssB; - - pp1A = vec_mladd(sum1A, v20ss, v16ss); - pp1B = vec_mladd(sum1B, v20ss, v16ss); - - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - - pp3A = vec_add(sum3A, pp1A); - pp3B = vec_add(sum3B, pp1B); - - psumA = vec_sub(pp3A, pp2A); - psumB = vec_sub(pp3B, pp2B); - - sumA = vec_sra(psumA, v5us); - sumB = vec_sra(psumB, v5us); - - sum = vec_packsu(sumA, sumB); - - ASSERT_ALIGNED(dst); - vdst = vec_ld(0, dst); - - OP_U8_ALTIVEC(fsum, sum, vdst); - - vec_st(fsum, 0, dst); - - dst += dstStride; - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); -} - -/* this code assume stride % 16 == 0 
*and* tmp is properly aligned */ -static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); - register int i; - LOAD_ZERO; - const vec_u8 permM2 = vec_lvsl(-2, src); - const vec_u8 permM1 = vec_lvsl(-1, src); - const vec_u8 permP0 = vec_lvsl(+0, src); - const vec_u8 permP1 = vec_lvsl(+1, src); - const vec_u8 permP2 = vec_lvsl(+2, src); - const vec_u8 permP3 = vec_lvsl(+3, src); - const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_u32 v10ui = vec_splat_u32(10); - const vec_s16 v5ss = vec_splat_s16(5); - const vec_s16 v1ss = vec_splat_s16(1); - const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); - const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); - - register int align = ((((unsigned long)src) - 2) % 16); - - vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, - srcP2A, srcP2B, srcP3A, srcP3B, - srcM1A, srcM1B, srcM2A, srcM2B, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, - pp1A, pp1B, pp2A, pp2B, psumA, psumB; - - const vec_u8 mperm = (const vec_u8) - {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, - 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F}; - int16_t *tmpbis = tmp; - - vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, - tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, - tmpP2ssA, tmpP2ssB; - - vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, - pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, - pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, - ssumAe, ssumAo, ssumBe, ssumBo; - vec_u8 fsum, sumv, sum, vdst; - vec_s16 ssume, ssumo; - - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); - src -= (2 * srcStride); - for (i = 0 ; i < 21 ; i ++) { - vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vec_u8 srcR1 = vec_ld(-2, src); - vec_u8 srcR2 = vec_ld(14, src); - - switch (align) { - default: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - 
srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = vec_perm(srcR1, srcR2, permP3); - } break; - case 11: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = srcR2; - } break; - case 12: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = srcR2; - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 13: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = srcR2; - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 14: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = srcR2; - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 15: { - vec_u8 srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = srcR2; - srcP0 = vec_perm(srcR2, srcR3, permP0); - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - } - - srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); - srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); - srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); - srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); - - srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); - srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); - srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); - srcP3B = (vec_s16) 
vec_mergel(zero_u8v, srcP3); - - srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); - srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); - srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); - srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); - - sum1A = vec_adds(srcP0A, srcP1A); - sum1B = vec_adds(srcP0B, srcP1B); - sum2A = vec_adds(srcM1A, srcP2A); - sum2B = vec_adds(srcM1B, srcP2B); - sum3A = vec_adds(srcM2A, srcP3A); - sum3B = vec_adds(srcM2B, srcP3B); - - pp1A = vec_mladd(sum1A, v20ss, sum3A); - pp1B = vec_mladd(sum1B, v20ss, sum3B); - - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - - psumA = vec_sub(pp1A, pp2A); - psumB = vec_sub(pp1B, pp2B); - - vec_st(psumA, 0, tmp); - vec_st(psumB, 16, tmp); - - src += srcStride; - tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ - } - - tmpM2ssA = vec_ld(0, tmpbis); - tmpM2ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpM1ssA = vec_ld(0, tmpbis); - tmpM1ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP0ssA = vec_ld(0, tmpbis); - tmpP0ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP1ssA = vec_ld(0, tmpbis); - tmpP1ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP2ssA = vec_ld(0, tmpbis); - tmpP2ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - - for (i = 0 ; i < 16 ; i++) { - const vec_s16 tmpP3ssA = vec_ld(0, tmpbis); - const vec_s16 tmpP3ssB = vec_ld(16, tmpbis); - - const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA); - const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB); - const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA); - const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB); - const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA); - const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB); - - tmpbis += tmpStride; - - tmpM2ssA = tmpM1ssA; - tmpM2ssB = tmpM1ssB; - tmpM1ssA = tmpP0ssA; - tmpM1ssB = tmpP0ssB; - tmpP0ssA = tmpP1ssA; - tmpP0ssB = tmpP1ssB; - tmpP1ssA = tmpP2ssA; - tmpP1ssB = tmpP2ssB; - tmpP2ssA = tmpP3ssA; - tmpP2ssB = tmpP3ssB; - - pp1Ae = 
vec_mule(sum1A, v20ss); - pp1Ao = vec_mulo(sum1A, v20ss); - pp1Be = vec_mule(sum1B, v20ss); - pp1Bo = vec_mulo(sum1B, v20ss); - - pp2Ae = vec_mule(sum2A, v5ss); - pp2Ao = vec_mulo(sum2A, v5ss); - pp2Be = vec_mule(sum2B, v5ss); - pp2Bo = vec_mulo(sum2B, v5ss); - - pp3Ae = vec_sra((vec_s32)sum3A, v16ui); - pp3Ao = vec_mulo(sum3A, v1ss); - pp3Be = vec_sra((vec_s32)sum3B, v16ui); - pp3Bo = vec_mulo(sum3B, v1ss); - - pp1cAe = vec_add(pp1Ae, v512si); - pp1cAo = vec_add(pp1Ao, v512si); - pp1cBe = vec_add(pp1Be, v512si); - pp1cBo = vec_add(pp1Bo, v512si); - - pp32Ae = vec_sub(pp3Ae, pp2Ae); - pp32Ao = vec_sub(pp3Ao, pp2Ao); - pp32Be = vec_sub(pp3Be, pp2Be); - pp32Bo = vec_sub(pp3Bo, pp2Bo); - - sumAe = vec_add(pp1cAe, pp32Ae); - sumAo = vec_add(pp1cAo, pp32Ao); - sumBe = vec_add(pp1cBe, pp32Be); - sumBo = vec_add(pp1cBo, pp32Bo); - - ssumAe = vec_sra(sumAe, v10ui); - ssumAo = vec_sra(sumAo, v10ui); - ssumBe = vec_sra(sumBe, v10ui); - ssumBo = vec_sra(sumBo, v10ui); - - ssume = vec_packs(ssumAe, ssumBe); - ssumo = vec_packs(ssumAo, ssumBo); - - sumv = vec_packsu(ssume, ssumo); - sum = vec_perm(sumv, sumv, mperm); - - ASSERT_ALIGNED(dst); - vdst = vec_ld(0, dst); - - OP_U8_ALTIVEC(fsum, sum, vdst); - - vec_st(fsum, 0, dst); - - dst += dstStride; - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/idct_altivec.c --- a/ffmpeg_smp/h264dec/libavcodec/ppc/idct_altivec.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2001 Michel Lespinasse - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * NOTE: This code is based on GPL code from the libmpeg2 project. The - * author, Michel Lespinasses, has given explicit permission to release - * under LGPL as part of FFmpeg. - */ - -/* - * FFmpeg integration by Dieter Shirley - * - * This file is a direct copy of the AltiVec IDCT module from the libmpeg2 - * project. I've deleted all of the libmpeg2-specific code, renamed the - * functions and reordered the function parameters. The only change to the - * IDCT function itself was to factor out the partial transposition, and to - * perform a full transpose at the end of the function. 
- */ - - -#include /* malloc(), free() */ -#include -#include "config.h" -#if HAVE_ALTIVEC_H -#include -#endif -#include "libavcodec/dsputil.h" -#include "types_altivec.h" -#include "dsputil_ppc.h" -#include "dsputil_altivec.h" - -#define IDCT_HALF \ - /* 1st stage */ \ - t1 = vec_mradds (a1, vx7, vx1 ); \ - t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \ - t7 = vec_mradds (a2, vx5, vx3); \ - t3 = vec_mradds (ma2, vx3, vx5); \ - \ - /* 2nd stage */ \ - t5 = vec_adds (vx0, vx4); \ - t0 = vec_subs (vx0, vx4); \ - t2 = vec_mradds (a0, vx6, vx2); \ - t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \ - t6 = vec_adds (t8, t3); \ - t3 = vec_subs (t8, t3); \ - t8 = vec_subs (t1, t7); \ - t1 = vec_adds (t1, t7); \ - \ - /* 3rd stage */ \ - t7 = vec_adds (t5, t2); \ - t2 = vec_subs (t5, t2); \ - t5 = vec_adds (t0, t4); \ - t0 = vec_subs (t0, t4); \ - t4 = vec_subs (t8, t3); \ - t3 = vec_adds (t8, t3); \ - \ - /* 4th stage */ \ - vy0 = vec_adds (t7, t1); \ - vy7 = vec_subs (t7, t1); \ - vy1 = vec_mradds (c4, t3, t5); \ - vy6 = vec_mradds (mc4, t3, t5); \ - vy2 = vec_mradds (c4, t4, t0); \ - vy5 = vec_mradds (mc4, t4, t0); \ - vy3 = vec_adds (t2, t6); \ - vy4 = vec_subs (t2, t6); - - -#define IDCT \ - vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ - vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ - vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \ - vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \ - vec_u16 shift; \ - \ - c4 = vec_splat (constants[0], 0); \ - a0 = vec_splat (constants[0], 1); \ - a1 = vec_splat (constants[0], 2); \ - a2 = vec_splat (constants[0], 3); \ - mc4 = vec_splat (constants[0], 4); \ - ma2 = vec_splat (constants[0], 5); \ - bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); \ - \ - zero = vec_splat_s16 (0); \ - shift = vec_splat_u16 (4); \ - \ - vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ - vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ - vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ 
- vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ - vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ - vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ - vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ - vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ - \ - IDCT_HALF \ - \ - vx0 = vec_mergeh (vy0, vy4); \ - vx1 = vec_mergel (vy0, vy4); \ - vx2 = vec_mergeh (vy1, vy5); \ - vx3 = vec_mergel (vy1, vy5); \ - vx4 = vec_mergeh (vy2, vy6); \ - vx5 = vec_mergel (vy2, vy6); \ - vx6 = vec_mergeh (vy3, vy7); \ - vx7 = vec_mergel (vy3, vy7); \ - \ - vy0 = vec_mergeh (vx0, vx4); \ - vy1 = vec_mergel (vx0, vx4); \ - vy2 = vec_mergeh (vx1, vx5); \ - vy3 = vec_mergel (vx1, vx5); \ - vy4 = vec_mergeh (vx2, vx6); \ - vy5 = vec_mergel (vx2, vx6); \ - vy6 = vec_mergeh (vx3, vx7); \ - vy7 = vec_mergel (vx3, vx7); \ - \ - vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \ - vx1 = vec_mergel (vy0, vy4); \ - vx2 = vec_mergeh (vy1, vy5); \ - vx3 = vec_mergel (vy1, vy5); \ - vx4 = vec_mergeh (vy2, vy6); \ - vx5 = vec_mergel (vy2, vy6); \ - vx6 = vec_mergeh (vy3, vy7); \ - vx7 = vec_mergel (vy3, vy7); \ - \ - IDCT_HALF \ - \ - shift = vec_splat_u16 (6); \ - vx0 = vec_sra (vy0, shift); \ - vx1 = vec_sra (vy1, shift); \ - vx2 = vec_sra (vy2, shift); \ - vx3 = vec_sra (vy3, shift); \ - vx4 = vec_sra (vy4, shift); \ - vx5 = vec_sra (vy5, shift); \ - vx6 = vec_sra (vy6, shift); \ - vx7 = vec_sra (vy7, shift); - - -static const vec_s16 constants[5] = { - {23170, 13573, 6518, 21895, -23170, -21895, 32, 31}, - {16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725}, - {22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521}, - {21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692}, - {19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722} -}; - -void idct_put_altivec(uint8_t* dest, int stride, int16_t *blk) -{ -POWERPC_PERF_DECLARE(altivec_idct_put_num, 1); - vec_s16 *block = (vec_s16*)blk; - vec_u8 tmp; - 
-#if CONFIG_POWERPC_PERF -POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); -#endif - IDCT - -#define COPY(dest,src) \ - tmp = vec_packsu (src, src); \ - vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); - - COPY (dest, vx0) dest += stride; - COPY (dest, vx1) dest += stride; - COPY (dest, vx2) dest += stride; - COPY (dest, vx3) dest += stride; - COPY (dest, vx4) dest += stride; - COPY (dest, vx5) dest += stride; - COPY (dest, vx6) dest += stride; - COPY (dest, vx7) - -POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); -} - -void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) -{ -POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); - vec_s16 *block = (vec_s16*)blk; - vec_u8 tmp; - vec_s16 tmp2, tmp3; - vec_u8 perm0; - vec_u8 perm1; - vec_u8 p0, p1, p; - -#if CONFIG_POWERPC_PERF -POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); -#endif - - IDCT - - p0 = vec_lvsl (0, dest); - p1 = vec_lvsl (stride, dest); - p = vec_splat_u8 (-1); - perm0 = vec_mergeh (p, p0); - perm1 = vec_mergeh (p, p1); - -#define ADD(dest,src,perm) \ - /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ - tmp = vec_ld (0, dest); \ - tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \ - tmp3 = vec_adds (tmp2, src); \ - tmp = vec_packsu (tmp3, tmp3); \ - vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); - - ADD (dest, vx0, perm0) dest += stride; - ADD (dest, vx1, perm1) dest += stride; - ADD (dest, vx2, perm0) dest += stride; - ADD (dest, vx3, perm1) dest += stride; - ADD (dest, vx4, perm0) dest += stride; - ADD (dest, vx5, perm1) dest += stride; - ADD (dest, vx6, perm0) dest += stride; - ADD (dest, vx7, perm1) - -POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); -} - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/mathops.h --- a/ffmpeg_smp/h264dec/libavcodec/ppc/mathops.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -/* - 
* simple math operations - * Copyright (c) 2001, 2002 Fabrice Bellard - * Copyright (c) 2006 Michael Niedermayer et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_PPC_MATHOPS_H -#define AVCODEC_PPC_MATHOPS_H - -#include -#include "config.h" -#include "libavutil/common.h" - -#if HAVE_PPC4XX -/* signed 16x16 -> 32 multiply add accumulate */ -#define MAC16(rt, ra, rb) \ - __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); - -/* signed 16x16 -> 32 multiply */ -#define MUL16(ra, rb) \ - ({ int __rt; \ - __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ - __rt; }) -#endif - -#define MULH MULH -static inline av_const int MULH(int a, int b){ - int r; - __asm__ ("mulhw %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); - return r; -} - -#if !ARCH_PPC64 -static inline av_const int64_t MAC64(int64_t d, int a, int b) -{ - union { uint64_t x; unsigned hl[2]; } x = { d }; - int h, l; - __asm__ ("mullw %3, %4, %5 \n\t" - "mulhw %2, %4, %5 \n\t" - "addc %1, %1, %3 \n\t" - "adde %0, %0, %2 \n\t" - : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) - : "r"(a), "r"(b)); - return x.x; -} -#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) - -static inline av_const int64_t MLS64(int64_t d, int a, int b) -{ - union { 
uint64_t x; unsigned hl[2]; } x = { d }; - int h, l; - __asm__ ("mullw %3, %4, %5 \n\t" - "mulhw %2, %4, %5 \n\t" - "subfc %1, %3, %1 \n\t" - "subfe %0, %2, %0 \n\t" - : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) - : "r"(a), "r"(b)); - return x.x; -} -#define MLS64(d, a, b) ((d) = MLS64(d, a, b)) -#endif - -#endif /* AVCODEC_PPC_MATHOPS_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/types_altivec.h --- a/ffmpeg_smp/h264dec/libavcodec/ppc/types_altivec.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2006 Guillaume Poirier - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_PPC_TYPES_ALTIVEC_H -#define AVCODEC_PPC_TYPES_ALTIVEC_H - -/*********************************************************************** - * Vector types - **********************************************************************/ -#define vec_u8 vector unsigned char -#define vec_s8 vector signed char -#define vec_u16 vector unsigned short -#define vec_s16 vector signed short -#define vec_u32 vector unsigned int -#define vec_s32 vector signed int - -/*********************************************************************** - * Null vector - **********************************************************************/ -#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 ) - -#define zero_u8v (vec_u8) zerov -#define zero_s8v (vec_s8) zerov -#define zero_u16v (vec_u16) zerov -#define zero_s16v (vec_s16) zerov -#define zero_u32v (vec_u32) zerov -#define zero_s32v (vec_s32) zerov - -#endif /* AVCODEC_PPC_TYPES_ALTIVEC_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/ppc/util_altivec.h --- a/ffmpeg_smp/h264dec/libavcodec/ppc/util_altivec.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * Contains misc utility macros and inline functions - */ - -#ifndef AVCODEC_PPC_UTIL_ALTIVEC_H -#define AVCODEC_PPC_UTIL_ALTIVEC_H - -#include - -#include "config.h" - -#if HAVE_ALTIVEC_H -#include -#endif - -// used to build registers permutation vectors (vcprm) -// the 's' are for words in the _s_econd vector -#define WORD_0 0x00,0x01,0x02,0x03 -#define WORD_1 0x04,0x05,0x06,0x07 -#define WORD_2 0x08,0x09,0x0a,0x0b -#define WORD_3 0x0c,0x0d,0x0e,0x0f -#define WORD_s0 0x10,0x11,0x12,0x13 -#define WORD_s1 0x14,0x15,0x16,0x17 -#define WORD_s2 0x18,0x19,0x1a,0x1b -#define WORD_s3 0x1c,0x1d,0x1e,0x1f - -#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} -#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} - -// vcprmle is used to keep the same index as in the SSE version. -// it's the same as vcprm, with the index inversed -// ('le' is Little Endian) -#define vcprmle(a,b,c,d) vcprm(d,c,b,a) - -// used to build inverse/identity vectors (vcii) -// n is _n_egative, p is _p_ositive -#define FLOAT_n -1. -#define FLOAT_p 1. 
- - -// Transpose 8x8 matrix of 16-bit elements (in-place) -#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ -do { \ - vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \ - vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \ - \ - A1 = vec_mergeh (a, e); \ - B1 = vec_mergel (a, e); \ - C1 = vec_mergeh (b, f); \ - D1 = vec_mergel (b, f); \ - E1 = vec_mergeh (c, g); \ - F1 = vec_mergel (c, g); \ - G1 = vec_mergeh (d, h); \ - H1 = vec_mergel (d, h); \ - \ - A2 = vec_mergeh (A1, E1); \ - B2 = vec_mergel (A1, E1); \ - C2 = vec_mergeh (B1, F1); \ - D2 = vec_mergel (B1, F1); \ - E2 = vec_mergeh (C1, G1); \ - F2 = vec_mergel (C1, G1); \ - G2 = vec_mergeh (D1, H1); \ - H2 = vec_mergel (D1, H1); \ - \ - a = vec_mergeh (A2, E2); \ - b = vec_mergel (A2, E2); \ - c = vec_mergeh (B2, F2); \ - d = vec_mergel (B2, F2); \ - e = vec_mergeh (C2, G2); \ - f = vec_mergel (C2, G2); \ - g = vec_mergeh (D2, H2); \ - h = vec_mergel (D2, H2); \ -} while (0) - - -/** \brief loads unaligned vector \a *src with offset \a offset - and returns it */ -static inline vector unsigned char unaligned_load(int offset, uint8_t *src) -{ - register vector unsigned char first = vec_ld(offset, src); - register vector unsigned char second = vec_ld(offset+15, src); - register vector unsigned char mask = vec_lvsl(offset, src); - return vec_perm(first, second, mask); -} - -#endif /* AVCODEC_PPC_UTIL_ALTIVEC_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/raw.h --- a/ffmpeg_smp/h264dec/libavcodec/raw.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ -/* - * Raw Video Codec - * Copyright (c) 2001 Fabrice Bellard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * Raw Video Codec - */ - -#ifndef AVCODEC_RAW_H -#define AVCODEC_RAW_H - -#include "avcodec.h" - -typedef struct PixelFormatTag { - enum PixelFormat pix_fmt; - unsigned int fourcc; -} PixelFormatTag; - -extern const PixelFormatTag ff_raw_pixelFormatTags[]; -int raw_init_encoder(AVCodecContext *avctx); -#endif /* AVCODEC_RAW_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/rectangle.h --- a/ffmpeg_smp/h264dec/libavcodec/rectangle.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ -/* - * rectangle filling function - * Copyright (c) 2003 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * useful rectangle filling function - * @author Michael Niedermayer - */ - -#ifndef AVCODEC_RECTANGLE_H -#define AVCODEC_RECTANGLE_H - -#include -//#include "config.h" -#include "libavutil/common.h" -#include "dsputil.h" - -/** - * fill a rectangle. - * @param h height of the rectangle, should be a constant - * @param w width of the rectangle, should be a constant - * @param size the size of val (1, 2 or 4), should be a constant - */ -static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ - uint8_t *p= (uint8_t*)vp; - assert(size==1 || size==2 || size==4); - assert(w<=4); - - w *= size; - stride *= size; - - assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0); - assert((stride&(w-1))==0); - if(w==2){ - const uint16_t v= size==4 ? val : val*0x0101; - *(uint16_t*)(p + 0*stride)= v; - if(h==1) return; - *(uint16_t*)(p + 1*stride)= v; - if(h==2) return; - *(uint16_t*)(p + 2*stride)= v; - *(uint16_t*)(p + 3*stride)= v; - }else if(w==4){ - const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101; - *(uint32_t*)(p + 0*stride)= v; - if(h==1) return; - *(uint32_t*)(p + 1*stride)= v; - if(h==2) return; - *(uint32_t*)(p + 2*stride)= v; - *(uint32_t*)(p + 3*stride)= v; - }else if(w==8){ - const uint64_t v= size==2 ? 
val*0x0001000100010001ULL : val*0x0100000001ULL; - *(uint64_t*)(p + 0*stride)= v; - if(h==1) return; - *(uint64_t*)(p + 1*stride)= v; - if(h==2) return; - *(uint64_t*)(p + 2*stride)= v; - *(uint64_t*)(p + 3*stride)= v; - }else if(w==16){ - const uint64_t v= val*0x0100000001ULL; - *(uint64_t*)(p + 0+0*stride)= v; - *(uint64_t*)(p + 8+0*stride)= v; - *(uint64_t*)(p + 0+1*stride)= v; - *(uint64_t*)(p + 8+1*stride)= v; - if(h==2) return; - *(uint64_t*)(p + 0+2*stride)= v; - *(uint64_t*)(p + 8+2*stride)= v; - *(uint64_t*)(p + 0+3*stride)= v; - *(uint64_t*)(p + 8+3*stride)= v; - }else - assert(0); - assert(h==4); -} - -#endif /* AVCODEC_RECTANGLE_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/scratch.c --- a/ffmpeg_smp/h264dec/libavcodec/scratch.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,295 +0,0 @@ -static void *entropy_thread(void *arg){ - H264Context *h = (H264Context *) arg; - EDSlice *s; - - H264Cabac hcabac; - CABACContext cabac; - - ff_init_cabac_states(); - - if (init_cabac(h, &hcabac)<0) - return NULL; - - for(;;){ - { - pthread_mutex_lock(&h->lock[ENTROPY]); - while (h->ed_cnt<=0) - pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]); - s= &h->ed_q[h->ed_fo]; - pthread_mutex_unlock(&h->lock[ENTROPY]); - h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT; - } - if (s->state<0) - break; - - decode_slice_entropy(&hcabac, &cabac, s); - - { - pthread_mutex_lock(&h->lock[MBDEC]); - while (h->mbdec_cnt >= MAX_SLICE_COUNT) - pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); - h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s); - h->mbdec_cnt++; - h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->cond[MBDEC]); - pthread_mutex_unlock(&h->lock[MBDEC]); - } - { - pthread_mutex_lock(&h->lock[ENTROPY]); - h->ed_cnt--; - pthread_cond_signal(&h->cond[ENTROPY]); - pthread_mutex_unlock(&h->lock[ENTROPY]); - } - } - - { - pthread_mutex_lock(&h->lock[MBDEC]); - while (h->mbdec_cnt >= MAX_SLICE_COUNT) - 
pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); - h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s); - h->mbdec_cnt++; - h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT; - pthread_cond_signal(&h->cond[MBDEC]); - pthread_mutex_unlock(&h->lock[MBDEC]); - - } - - free_cabac(&hcabac); - - pthread_exit(NULL); - return NULL; - -} -/* -* The following code is the main loop of the file converter -*/ -int av_transcode_1ed(int ifile, int ofile, int frame_width, int frame_height) { - H264Context *h; - pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr; - - h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height); - - timer_start = av_gettime(); - - // pthread_create(&read_thr, NULL, read_thread, h); - // pthread_create(&parsenal_thr, NULL, parsenal_thread, h); - pthread_create(&entropy_thr, NULL, entropy_mbd_thread, h); - - // pthread_create(&mbdec_thr, NULL, mbdec_thread, h); - - // pthread_create(&write_thr, NULL, write_thread, h); - - // pthread_join(read_thr, NULL); - // pthread_join(parsenal_thr, NULL); - pthread_join(entropy_thr, NULL); - // pthread_join(mbdec_thr, NULL); - // printf("before write_thr\n"); - // pthread_join(write_thr, NULL); - - /* finished ! 
*/ - ff_h264_decode_end(h); - - return 0; -} - -static void reset_h264mb(EDSlice *s, int mb_width, int mb_height){ - for (int i=0; imbs[i*mb_width + j]; - - m->left_mb_xy=0; - m->top_mb_xy = 0; - } - } -} - -static void *entropy_mbd_thread(void *arg){ - H264Context *h = (H264Context *) arg; - - EDSlice slice, *s=&slice; - MBSlice mbslice, *s2=&mbslice; - H264Cabac hcabac; - CABACContext cabac; - int frames =0; - MBDecContext mbdec, *d=&mbdec; - int size=h->width*h->height; - WriteContext write, *w=&write; - AVCodecParserContext parser, *pc= &parser; - NalContext nal, *n=&nal; - - - memset(pc, 0, sizeof(AVCodecParserContext)); - pc->buffer_size = 2048; - pc->final_frame = 0; - pc->cur_len= 0; - pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE); - pc->size = 2048; - pc->eof_reached =0; - pc->ifile = h->ifile; - - //init parse - memset(n, 0, sizeof(NalContext)); - n->width = h->width; - n->height = h->height; - n->mb_height = h->mb_height; - n->mb_width = h->mb_width; - n->b4_stride = n->mb_width*4 + 1; - n->mb_stride = n->mb_width + 1; - n->outputed_poc = INT_MIN; -// memset(s, 0, sizeof(EDSlice)); -// ff_init_slice(n, s); -// - - memset(w, 0, sizeof(WriteContext)); - w->bit_buffer_size= FFMAX(1024*256, 6*size + 200); - w->bit_buffer= av_mallocz(w->bit_buffer_size); - - - - ff_h264dsp_init(&d->hdsp); - ff_h264_pred_init(&d->hpc); - dsputil_init(&d->dsp); - d->hdsp.qpel_put= d->dsp.put_h264_qpel_pixels_tab; - d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab; - d->mb_height = (h->height + 15) / 16; - d->mb_width = (h->width + 15) / 16; - d->linesize = h->width + EDGE_WIDTH*2; - d->uvlinesize = d->linesize>>1; - - for(int i=0; i<16; i++){ - d->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*d->linesize*((scan8[i] - scan8[0])>>3); - } - for(int i=0; i<4; i++){ - d->block_offset[16+i]= - d->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*d->uvlinesize*((scan8[i] - scan8[0])>>3); - } - - d->scratchpad= av_mallocz((h->width+64)*4*16*2*sizeof(uint8_t)); - - 
ff_init_cabac_states(); - - if (init_cabac(h, &hcabac)<0) - return NULL; - - while(!pc->final_frame && frames_max++ < 1000){ - Picture *out; - - RawFrame *frm; - Picture *pic=NULL; - - RawFrame frm_read; - frm_read.state =0; - av_read_frame_internal(pc, &frm_read); - frm = &frm_read; - - if (frm->state < 0) - break; -/* - { - pthread_mutex_lock(&h->lock[PARSE2]); - while (h->slice_cnt<=0) - pthread_cond_wait(&h->cond[PARSE2], &h->lock[PARSE2]); - h->slice_cnt--; - s= &h->slices[h->slice_next++]; - h->slice_next %= MAX_SLICE_COUNT; - pthread_mutex_unlock(&h->lock[PARSE2]); - }*/ - ff_init_slice(n, s); - reset_h264mb(s, n->mb_width, n->mb_height); - for(int i=0; ipicture[i].reference==0){ - pic= &h->picture[i]; - break; - } - } -// { -// pthread_mutex_lock(&h->lock[PARSE3]); -// while (h->free_pic_cnt<=0) -// pthread_cond_wait(&h->cond[PARSE3], &h->lock[PARSE3]); -// h->free_pic_cnt--; -// /* use first free picture */ -// for(int i=0; ipicture[i].reference==0){ -// pic= &h->picture[i]; -// break; -// } -// } -// pthread_mutex_unlock(&h->lock[PARSE3]); -// } - ff_alloc_picture(n, s, pic); - - decode_nal_units(n, s, frm, pic); - - - decode_slice_entropy(&hcabac, &cabac, s); - memcpy( s2, s, sizeof(MBSlice)); //this only copys the COMMON_SLICE part - av_freep(&s->gb.raw); - decode_slice_mb_seq(d, s2); - -// if (s2->release_cnt>0) { -// int i; -// for (i=0; irelease_cnt; i++){ -// if ((s2->release_ref[i]->reference & ~2) == 0) -// default_release_buffer(h, s2->release_ref[i]); -// else -// s2->release_ref[i]->reference &= ~2; -// } -// s->release_cnt=0; -// } - -if (s->release_cnt>0) { - int i; - for (i=0; irelease_cnt; i++){ - s->release_ref[i]->reference &= ~2; - } - s->release_cnt=0; -} - - - { - pthread_mutex_lock(&h->lock[PARSE2]); - h->slice_cnt++; - pthread_cond_signal(&h->cond[PARSE2]); - pthread_mutex_unlock(&h->lock[PARSE2]); - } - - out =output_frame(w, s2->current_picture, h->ofile, h->width, h->height); - print_report(w->frame_number, w->video_size, 0); - - 
if (out){ -// if ((out->reference & ~1) == 0) -// default_release_buffer(h, out); -// else - out->reference &= ~1; - } - - { - pthread_mutex_lock(&h->lock[ENTROPY]); - h->ed_cnt--; - pthread_cond_signal(&h->cond[ENTROPY]); - pthread_mutex_unlock(&h->lock[ENTROPY]); - } - } - while (output_frame(w, NULL, h->ofile, h->width, h->height)); - print_report(w->frame_number, w->video_size, 1); - - av_free(w->bit_buffer); - - {//propagate exit - pthread_mutex_lock(&h->lock[WRITE]); - while (h->write_cnt>= MAX_DELAYED_PIC_COUNT) - pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]); - last_pic.reference = -1; - h->write_q[h->write_fi] = &last_pic; - h->write_cnt++; - h->write_fi++; h->write_fi %= MAX_DELAYED_PIC_COUNT; - pthread_cond_signal(&h->cond[WRITE]); - pthread_mutex_unlock(&h->lock[WRITE]); - - } - free_cabac(&hcabac); - - pthread_exit(NULL); - return NULL; - -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/simple_idct.c --- a/ffmpeg_smp/h264dec/libavcodec/simple_idct.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,372 +0,0 @@ -/* - * Simple IDCT - * - * Copyright (c) 2001 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * simpleidct in C. 
- */ - -/* - based upon some outcommented c code from mpeg2dec (idct_mmx.c - written by Aaron Holtzman ) - */ -#include "avcodec.h" -#include "dsputil.h" -#include "mathops.h" -#include "simple_idct.h" - -#if 0 -#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ -#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ -#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */ -#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */ -#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */ -#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */ -#define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */ -#define ROW_SHIFT 8 -#define COL_SHIFT 17 -#else -#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define ROW_SHIFT 11 -#define COL_SHIFT 20 // 6 -#endif - -static inline void idctRowCondDC (DCTELEM * row) -{ - int a0, a1, a2, a3, b0, b1, b2, b3; - uint64_t temp; - -#if HAVE_BIGENDIAN -#define ROW0_MASK 0xffff000000000000LL -#else -#define ROW0_MASK 0xffffLL -#endif - if(sizeof(DCTELEM)==2){ - if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) | - ((uint64_t *)row)[1]) == 0) { - temp = (row[0] << 3) & 0xffff; - temp += temp << 16; - temp += temp << 32; - ((uint64_t *)row)[0] = temp; - ((uint64_t *)row)[1] = temp; - return; - } - }else{ - if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) { - row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3; - return; - } - } - - a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); - a1 = a0; - a2 = a0; - a3 = a0; - - /* no need to optimize : gcc does it */ - a0 += W2 * row[2]; - a1 += W6 * row[2]; - a2 -= W6 * row[2]; - a3 -= W2 * row[2]; - - b0 = MUL16(W1, row[1]); - MAC16(b0, W3, row[3]); - b1 = MUL16(W3, row[1]); - 
MAC16(b1, -W7, row[3]); - b2 = MUL16(W5, row[1]); - MAC16(b2, -W1, row[3]); - b3 = MUL16(W7, row[1]); - MAC16(b3, -W5, row[3]); - - temp = ((uint64_t*)row)[1]; - - if (temp != 0) { - a0 += W4*row[4] + W6*row[6]; - a1 += - W4*row[4] - W2*row[6]; - a2 += - W4*row[4] + W2*row[6]; - a3 += W4*row[4] - W6*row[6]; - - MAC16(b0, W5, row[5]); - MAC16(b0, W7, row[7]); - - MAC16(b1, -W1, row[5]); - MAC16(b1, -W5, row[7]); - - MAC16(b2, W7, row[5]); - MAC16(b2, W3, row[7]); - - MAC16(b3, W3, row[5]); - MAC16(b3, -W1, row[7]); - } - - row[0] = (a0 + b0) >> ROW_SHIFT; - row[7] = (a0 - b0) >> ROW_SHIFT; - row[1] = (a1 + b1) >> ROW_SHIFT; - row[6] = (a1 - b1) >> ROW_SHIFT; - row[2] = (a2 + b2) >> ROW_SHIFT; - row[5] = (a2 - b2) >> ROW_SHIFT; - row[3] = (a3 + b3) >> ROW_SHIFT; - row[4] = (a3 - b3) >> ROW_SHIFT; -} - -static inline void idctSparseColPut (uint8_t *dest, int line_size, - DCTELEM * col) -{ - int a0, a1, a2, a3, b0, b1, b2, b3; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - - /* XXX: I did that only to give same values as previous code */ - a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); - a1 = a0; - a2 = a0; - a3 = a0; - - a0 += + W2*col[8*2]; - a1 += + W6*col[8*2]; - a2 += - W6*col[8*2]; - a3 += - W2*col[8*2]; - - b0 = MUL16(W1, col[8*1]); - b1 = MUL16(W3, col[8*1]); - b2 = MUL16(W5, col[8*1]); - b3 = MUL16(W7, col[8*1]); - - MAC16(b0, + W3, col[8*3]); - MAC16(b1, - W7, col[8*3]); - MAC16(b2, - W1, col[8*3]); - MAC16(b3, - W5, col[8*3]); - - if(col[8*4]){ - a0 += + W4*col[8*4]; - a1 += - W4*col[8*4]; - a2 += - W4*col[8*4]; - a3 += + W4*col[8*4]; - } - - if (col[8*5]) { - MAC16(b0, + W5, col[8*5]); - MAC16(b1, - W1, col[8*5]); - MAC16(b2, + W7, col[8*5]); - MAC16(b3, + W3, col[8*5]); - } - - if(col[8*6]){ - a0 += + W6*col[8*6]; - a1 += - W2*col[8*6]; - a2 += + W2*col[8*6]; - a3 += - W6*col[8*6]; - } - - if (col[8*7]) { - MAC16(b0, + W7, col[8*7]); - MAC16(b1, - W5, col[8*7]); - MAC16(b2, + W3, col[8*7]); - MAC16(b3, - W1, col[8*7]); - } - - dest[0] = cm[(a0 + b0) >> 
COL_SHIFT]; - dest += line_size; - dest[0] = cm[(a1 + b1) >> COL_SHIFT]; - dest += line_size; - dest[0] = cm[(a2 + b2) >> COL_SHIFT]; - dest += line_size; - dest[0] = cm[(a3 + b3) >> COL_SHIFT]; - dest += line_size; - dest[0] = cm[(a3 - b3) >> COL_SHIFT]; - dest += line_size; - dest[0] = cm[(a2 - b2) >> COL_SHIFT]; - dest += line_size; - dest[0] = cm[(a1 - b1) >> COL_SHIFT]; - dest += line_size; - dest[0] = cm[(a0 - b0) >> COL_SHIFT]; -} - -static inline void idctSparseColAdd (uint8_t *dest, int line_size, - DCTELEM * col) -{ - int a0, a1, a2, a3, b0, b1, b2, b3; - uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; - - /* XXX: I did that only to give same values as previous code */ - a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); - a1 = a0; - a2 = a0; - a3 = a0; - - a0 += + W2*col[8*2]; - a1 += + W6*col[8*2]; - a2 += - W6*col[8*2]; - a3 += - W2*col[8*2]; - - b0 = MUL16(W1, col[8*1]); - b1 = MUL16(W3, col[8*1]); - b2 = MUL16(W5, col[8*1]); - b3 = MUL16(W7, col[8*1]); - - MAC16(b0, + W3, col[8*3]); - MAC16(b1, - W7, col[8*3]); - MAC16(b2, - W1, col[8*3]); - MAC16(b3, - W5, col[8*3]); - - if(col[8*4]){ - a0 += + W4*col[8*4]; - a1 += - W4*col[8*4]; - a2 += - W4*col[8*4]; - a3 += + W4*col[8*4]; - } - - if (col[8*5]) { - MAC16(b0, + W5, col[8*5]); - MAC16(b1, - W1, col[8*5]); - MAC16(b2, + W7, col[8*5]); - MAC16(b3, + W3, col[8*5]); - } - - if(col[8*6]){ - a0 += + W6*col[8*6]; - a1 += - W2*col[8*6]; - a2 += + W2*col[8*6]; - a3 += - W6*col[8*6]; - } - - if (col[8*7]) { - MAC16(b0, + W7, col[8*7]); - MAC16(b1, - W5, col[8*7]); - MAC16(b2, + W3, col[8*7]); - MAC16(b3, - W1, col[8*7]); - } - - dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)]; - dest += line_size; - dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)]; - dest += line_size; - dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)]; - dest += line_size; - dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)]; - dest += line_size; - dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)]; - dest += line_size; - dest[0] = cm[dest[0] + ((a2 - 
b2) >> COL_SHIFT)]; - dest += line_size; - dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)]; - dest += line_size; - dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)]; -} - -static inline void idctSparseCol (DCTELEM * col) -{ - int a0, a1, a2, a3, b0, b1, b2, b3; - - /* XXX: I did that only to give same values as previous code */ - a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); - a1 = a0; - a2 = a0; - a3 = a0; - - a0 += + W2*col[8*2]; - a1 += + W6*col[8*2]; - a2 += - W6*col[8*2]; - a3 += - W2*col[8*2]; - - b0 = MUL16(W1, col[8*1]); - b1 = MUL16(W3, col[8*1]); - b2 = MUL16(W5, col[8*1]); - b3 = MUL16(W7, col[8*1]); - - MAC16(b0, + W3, col[8*3]); - MAC16(b1, - W7, col[8*3]); - MAC16(b2, - W1, col[8*3]); - MAC16(b3, - W5, col[8*3]); - - if(col[8*4]){ - a0 += + W4*col[8*4]; - a1 += - W4*col[8*4]; - a2 += - W4*col[8*4]; - a3 += + W4*col[8*4]; - } - - if (col[8*5]) { - MAC16(b0, + W5, col[8*5]); - MAC16(b1, - W1, col[8*5]); - MAC16(b2, + W7, col[8*5]); - MAC16(b3, + W3, col[8*5]); - } - - if(col[8*6]){ - a0 += + W6*col[8*6]; - a1 += - W2*col[8*6]; - a2 += + W2*col[8*6]; - a3 += - W6*col[8*6]; - } - - if (col[8*7]) { - MAC16(b0, + W7, col[8*7]); - MAC16(b1, - W5, col[8*7]); - MAC16(b2, + W3, col[8*7]); - MAC16(b3, - W1, col[8*7]); - } - - col[0 ] = ((a0 + b0) >> COL_SHIFT); - col[8 ] = ((a1 + b1) >> COL_SHIFT); - col[16] = ((a2 + b2) >> COL_SHIFT); - col[24] = ((a3 + b3) >> COL_SHIFT); - col[32] = ((a3 - b3) >> COL_SHIFT); - col[40] = ((a2 - b2) >> COL_SHIFT); - col[48] = ((a1 - b1) >> COL_SHIFT); - col[56] = ((a0 - b0) >> COL_SHIFT); -} - -void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - int i; - for(i=0; i<8; i++) - idctRowCondDC(block + i*8); - - for(i=0; i<8; i++) - idctSparseColPut(dest + i, line_size, block + i); -} - -void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - int i; - for(i=0; i<8; i++) - idctRowCondDC(block + i*8); - - for(i=0; i<8; i++) - idctSparseColAdd(dest + i, line_size, block + i); -} - -void 
ff_simple_idct(DCTELEM *block) -{ - int i; - for(i=0; i<8; i++) - idctRowCondDC(block + i*8); - - for(i=0; i<8; i++) - idctSparseCol(block + i); -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/simple_idct.h --- a/ffmpeg_smp/h264dec/libavcodec/simple_idct.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -/* - * Simple IDCT - * - * Copyright (c) 2001 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * simple idct header. 
- */ - -#ifndef AVCODEC_SIMPLE_IDCT_H -#define AVCODEC_SIMPLE_IDCT_H - -#include -#include "dsputil.h" - -void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block); -void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block); -void ff_simple_idct_mmx(int16_t *block); -void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block); -void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block); -void ff_simple_idct(DCTELEM *block); - -void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block); - -void ff_simple_idct84_add(uint8_t *dest, int line_size, DCTELEM *block); -void ff_simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block); -void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block); - -#endif /* AVCODEC_SIMPLE_IDCT_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/utils.c --- a/ffmpeg_smp/h264dec/libavcodec/utils.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -/* - * utils for libavcodec - * Copyright (c) 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * utils. 
- */ - -/* needed for mkstemp() */ -#define _XOPEN_SOURCE 600 - -#include "avcodec.h" -#include "dsputil.h" - -#include -#include -#include -#include -//#undef NDEBUG -#include - -#include - -void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size) -{ - if(min_size < *size) - return ptr; - - *size= FFMAX(17*min_size/16 + 32, min_size); - - ptr= av_realloc(ptr, *size); - if(!ptr) //we could set this to the unmodified min_size but this is safer if the user lost the ptr and uses NULL now - *size= 0; - - return ptr; -} - -void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size) -{ - void **p = ptr; - if (min_size < *size) - return; - *size= FFMAX(17*min_size/16 + 32, min_size); - av_free(*p); - *p = av_malloc(*size); - if (!*p) *size = 0; -} - - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/cpuid.c --- a/ffmpeg_smp/h264dec/libavcodec/x86/cpuid.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,135 +0,0 @@ -/* - * CPU detection code, extracted from mmx.h - * (c)1997-99 by H. Dietz and R. Fisher - * Converted to C and improved by Fabrice Bellard. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" - -#undef printf - -/* ebx saving is necessary for PIC. gcc seems unable to see it alone */ -#define cpuid(index,eax,ebx,ecx,edx)\ - __asm__ volatile\ - ("mov %%"REG_b", %%"REG_S"\n\t"\ - "cpuid\n\t"\ - "xchg %%"REG_b", %%"REG_S\ - : "=a" (eax), "=S" (ebx),\ - "=c" (ecx), "=d" (edx)\ - : "0" (index)); - -/* Function to test if multimedia instructions are supported... */ -int mm_support() -{ - int rval = 0; - int eax, ebx, ecx, edx; - int max_std_level, max_ext_level, std_caps=0, ext_caps=0; - -#if ARCH_X86_32 - x86_reg a, c; - __asm__ volatile ( - /* See if CPUID instruction is supported ... */ - /* ... Get copies of EFLAGS into eax and ecx */ - "pushfl\n\t" - "pop %0\n\t" - "mov %0, %1\n\t" - - /* ... Toggle the ID bit in one copy and store */ - /* to the EFLAGS reg */ - "xor $0x200000, %0\n\t" - "push %0\n\t" - "popfl\n\t" - - /* ... 
Get the (hopefully modified) EFLAGS */ - "pushfl\n\t" - "pop %0\n\t" - : "=a" (a), "=c" (c) - : - : "cc" - ); - - if (a == c) - return 0; /* CPUID not supported */ -#endif - - cpuid(0, max_std_level, ebx, ecx, edx); - - if(max_std_level >= 1){ - cpuid(1, eax, ebx, ecx, std_caps); - if (std_caps & (1<<23)) - rval |= FF_MM_MMX; - if (std_caps & (1<<25)) - rval |= FF_MM_MMX2 -#if HAVE_SSE - | FF_MM_SSE; - if (std_caps & (1<<26)) - rval |= FF_MM_SSE2; - if (ecx & 1) - rval |= FF_MM_SSE3; - if (ecx & 0x00000200 ) - rval |= FF_MM_SSSE3; - if (ecx & 0x00080000 ) - rval |= FF_MM_SSE4; - if (ecx & 0x00100000 ) - rval |= FF_MM_SSE42; -#endif - ; - } - - cpuid(0x80000000, max_ext_level, ebx, ecx, edx); - - if(max_ext_level >= 0x80000001){ - cpuid(0x80000001, eax, ebx, ecx, ext_caps); - if (ext_caps & (1<<31)) - rval |= FF_MM_3DNOW; - if (ext_caps & (1<<30)) - rval |= FF_MM_3DNOWEXT; - if (ext_caps & (1<<23)) - rval |= FF_MM_MMX; - if (ext_caps & (1<<22)) - rval |= FF_MM_MMX2; - } - -#if 0 - av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s\n", - (rval&FF_MM_MMX) ? "MMX ":"", - (rval&FF_MM_MMX2) ? "MMX2 ":"", - (rval&FF_MM_SSE) ? "SSE ":"", - (rval&FF_MM_SSE2) ? "SSE2 ":"", - (rval&FF_MM_SSE3) ? "SSE3 ":"", - (rval&FF_MM_SSSE3) ? "SSSE3 ":"", - (rval&FF_MM_SSE4) ? "SSE4.1 ":"", - (rval&FF_MM_SSE42) ? "SSE4.2 ":"", - (rval&FF_MM_3DNOW) ? "3DNow ":"", - (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":""); -#endif - return rval; -} - -#ifdef TEST -int main ( void ) -{ - int mm_flags; - mm_flags = mm_support(); - printf("mm_support = 0x%08X\n",mm_flags); - return 0; -} -#endif diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_mmx.c --- a/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_mmx.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,304 +0,0 @@ -/* - * Copyright (c) 2005 Zoltan Hidvegi , - * Loren Merritt - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * MMX optimized version of (put|avg)_h264_chroma_mc8. - * H264_CHROMA_MC8_TMPL must be defined to the desired function name - * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg - * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function - */ -static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg) -{ - DECLARE_ALIGNED(8, uint64_t, AA); - DECLARE_ALIGNED(8, uint64_t, DD); - int i; - - if(y==0 && x==0) { - /* no filter needed */ - H264_CHROMA_MC8_MV0(dst, src, stride, h); - return; - } - - assert(x<8 && y<8 && x>=0 && y>=0); - - if(y==0 || x==0) - { - /* 1 dimensional filter only */ - const int dxy = x ? 
1 : stride; - - __asm__ volatile( - "movd %0, %%mm5\n\t" - "movq %1, %%mm4\n\t" - "movq %2, %%mm6\n\t" /* mm6 = rnd >> 3 */ - "punpcklwd %%mm5, %%mm5\n\t" - "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */ - "pxor %%mm7, %%mm7\n\t" - "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */ - :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1))); - - for(i=0; i> 3)) >> 3 */ - "paddw %%mm6, %%mm0\n\t" - "paddw %%mm6, %%mm1\n\t" - "paddw %%mm2, %%mm0\n\t" - "paddw %%mm3, %%mm1\n\t" - "psrlw $3, %%mm0\n\t" - "psrlw $3, %%mm1\n\t" - "packuswb %%mm1, %%mm0\n\t" - H264_CHROMA_OP(%0, %%mm0) - "movq %%mm0, %0\n\t" - : "=m" (dst[0])); - - src += stride; - dst += stride; - } - return; - } - - /* general case, bilinear */ - __asm__ volatile("movd %2, %%mm4\n\t" - "movd %3, %%mm6\n\t" - "punpcklwd %%mm4, %%mm4\n\t" - "punpcklwd %%mm6, %%mm6\n\t" - "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ - "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ - "movq %%mm4, %%mm5\n\t" - "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ - "psllw $3, %%mm5\n\t" - "psllw $3, %%mm6\n\t" - "movq %%mm5, %%mm7\n\t" - "paddw %%mm6, %%mm7\n\t" - "movq %%mm4, %1\n\t" /* DD = x * y */ - "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */ - "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */ - "paddw %4, %%mm4\n\t" - "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */ - "pxor %%mm7, %%mm7\n\t" - "movq %%mm4, %0\n\t" - : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); - - __asm__ volatile( - /* mm0 = src[0..7], mm1 = src[1..8] */ - "movq %0, %%mm0\n\t" - "movq %1, %%mm1\n\t" - : : "m" (src[0]), "m" (src[1])); - - for(i=0; i> 6 */ - "paddw %1, %%mm2\n\t" - "paddw %1, %%mm3\n\t" - "psrlw $6, %%mm2\n\t" - "psrlw $6, %%mm3\n\t" - "packuswb %%mm3, %%mm2\n\t" - H264_CHROMA_OP(%0, %%mm2) - "movq %%mm2, %0\n\t" - : "=m" (dst[0]) : "m" (*rnd_reg)); - dst+= stride; - } -} - -static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg) -{ - __asm__ 
volatile( - "pxor %%mm7, %%mm7 \n\t" - "movd %5, %%mm2 \n\t" - "movd %6, %%mm3 \n\t" - "movq "MANGLE(ff_pw_8)", %%mm4\n\t" - "movq "MANGLE(ff_pw_8)", %%mm5\n\t" - "punpcklwd %%mm2, %%mm2 \n\t" - "punpcklwd %%mm3, %%mm3 \n\t" - "punpcklwd %%mm2, %%mm2 \n\t" - "punpcklwd %%mm3, %%mm3 \n\t" - "psubw %%mm2, %%mm4 \n\t" - "psubw %%mm3, %%mm5 \n\t" - - "movd (%1), %%mm0 \n\t" - "movd 1(%1), %%mm6 \n\t" - "add %3, %1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "pmullw %%mm4, %%mm0 \n\t" - "pmullw %%mm2, %%mm6 \n\t" - "paddw %%mm0, %%mm6 \n\t" - - "1: \n\t" - "movd (%1), %%mm0 \n\t" - "movd 1(%1), %%mm1 \n\t" - "add %3, %1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "pmullw %%mm4, %%mm0 \n\t" - "pmullw %%mm2, %%mm1 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "movq %%mm1, %%mm0 \n\t" - "pmullw %%mm5, %%mm6 \n\t" - "pmullw %%mm3, %%mm1 \n\t" - "paddw %4, %%mm6 \n\t" - "paddw %%mm6, %%mm1 \n\t" - "psrlw $6, %%mm1 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - H264_CHROMA_OP4((%0), %%mm1, %%mm6) - "movd %%mm1, (%0) \n\t" - "add %3, %0 \n\t" - "movd (%1), %%mm6 \n\t" - "movd 1(%1), %%mm1 \n\t" - "add %3, %1 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "pmullw %%mm4, %%mm6 \n\t" - "pmullw %%mm2, %%mm1 \n\t" - "paddw %%mm6, %%mm1 \n\t" - "movq %%mm1, %%mm6 \n\t" - "pmullw %%mm5, %%mm0 \n\t" - "pmullw %%mm3, %%mm1 \n\t" - "paddw %4, %%mm0 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "psrlw $6, %%mm1 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - H264_CHROMA_OP4((%0), %%mm1, %%mm0) - "movd %%mm1, (%0) \n\t" - "add %3, %0 \n\t" - "sub $2, %2 \n\t" - "jnz 1b \n\t" - : "+r"(dst), "+r"(src), "+r"(h) - : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y) - ); -} - -#ifdef H264_CHROMA_MC2_TMPL -static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - int tmp = ((1<<16)-1)*x + 8; - int CD= tmp*y; - int AB= (tmp<<3) - CD; - __asm__ volatile( - /* mm5 = {A,B,A,B} */ - /* mm6 = 
{C,D,C,D} */ - "movd %0, %%mm5\n\t" - "movd %1, %%mm6\n\t" - "punpckldq %%mm5, %%mm5\n\t" - "punpckldq %%mm6, %%mm6\n\t" - "pxor %%mm7, %%mm7\n\t" - /* mm0 = src[0,1,1,2] */ - "movd %2, %%mm2\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "pshufw $0x94, %%mm2, %%mm2\n\t" - :: "r"(AB), "r"(CD), "m"(src[0])); - - - __asm__ volatile( - "1:\n\t" - "add %4, %1\n\t" - /* mm1 = A * src[0,1] + B * src[1,2] */ - "movq %%mm2, %%mm1\n\t" - "pmaddwd %%mm5, %%mm1\n\t" - /* mm0 = src[0,1,1,2] */ - "movd (%1), %%mm0\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "pshufw $0x94, %%mm0, %%mm0\n\t" - /* mm1 += C * src[0,1] + D * src[1,2] */ - "movq %%mm0, %%mm2\n\t" - "pmaddwd %%mm6, %%mm0\n\t" - "paddw %3, %%mm1\n\t" - "paddw %%mm0, %%mm1\n\t" - /* dst[0,1] = pack((mm1 + 32) >> 6) */ - "psrlw $6, %%mm1\n\t" - "packssdw %%mm7, %%mm1\n\t" - "packuswb %%mm7, %%mm1\n\t" - H264_CHROMA_OP4((%0), %%mm1, %%mm3) - "movd %%mm1, %%esi\n\t" - "movw %%si, (%0)\n\t" - "add %4, %0\n\t" - "sub $1, %2\n\t" - "jnz 1b\n\t" - : "+r" (dst), "+r"(src), "+r"(h) - : "m" (ff_pw_32), "r"((x86_reg)stride) - : "%esi"); - -} -#endif - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_ssse3.c --- a/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_ssse3.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2008 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * SSSE3 optimized version of (put|avg)_h264_chroma_mc8. - * H264_CHROMA_MC8_TMPL must be defined to the desired function name - * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function - * AVG_OP must be defined to empty for put and the identify for avg - */ -static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) -{ - if(y==0 && x==0) { - /* no filter needed */ - H264_CHROMA_MC8_MV0(dst, src, stride, h); - return; - } - - assert(x<8 && y<8 && x>=0 && y>=0); - - if(y==0 || x==0) - { - /* 1 dimensional filter only */ - __asm__ volatile( - "movd %0, %%xmm7 \n\t" - "movq %1, %%xmm6 \n\t" - "pshuflw $0, %%xmm7, %%xmm7 \n\t" - "movlhps %%xmm6, %%xmm6 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3)) - ); - - if(x) { - __asm__ volatile( - "1: \n\t" - "movq (%1), %%xmm0 \n\t" - "movq 1(%1), %%xmm1 \n\t" - "movq (%1,%3), %%xmm2 \n\t" - "movq 1(%1,%3), %%xmm3 \n\t" - "punpcklbw %%xmm1, %%xmm0 \n\t" - "punpcklbw %%xmm3, %%xmm2 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - AVG_OP("movq (%0), %%xmm4 \n\t") - AVG_OP("movhps (%0,%3), %%xmm4 \n\t") - "paddw %%xmm6, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm2 \n\t" - "psrlw $3, %%xmm0 \n\t" - "psrlw $3, %%xmm2 \n\t" - "packuswb %%xmm2, %%xmm0 \n\t" - AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") - "movq %%xmm0, (%0) \n\t" - "movhps %%xmm0, (%0,%3) \n\t" - "sub $2, %2 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); - } else { - __asm__ volatile( - "1: \n\t" - "movq (%1), %%xmm0 \n\t" - "movq (%1,%3), %%xmm1 \n\t" - "movdqa %%xmm1, %%xmm2 \n\t" - "movq (%1,%3,2), %%xmm3 \n\t" - 
"punpcklbw %%xmm1, %%xmm0 \n\t" - "punpcklbw %%xmm3, %%xmm2 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - AVG_OP("movq (%0), %%xmm4 \n\t") - AVG_OP("movhps (%0,%3), %%xmm4 \n\t") - "paddw %%xmm6, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm2 \n\t" - "psrlw $3, %%xmm0 \n\t" - "psrlw $3, %%xmm2 \n\t" - "packuswb %%xmm2, %%xmm0 \n\t" - AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") - "movq %%xmm0, (%0) \n\t" - "movhps %%xmm0, (%0,%3) \n\t" - "sub $2, %2 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); - } - return; - } - - /* general case, bilinear */ - __asm__ volatile( - "movd %0, %%xmm7 \n\t" - "movd %1, %%xmm6 \n\t" - "movdqa %2, %%xmm5 \n\t" - "pshuflw $0, %%xmm7, %%xmm7 \n\t" - "pshuflw $0, %%xmm6, %%xmm6 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - "movlhps %%xmm6, %%xmm6 \n\t" - :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28)) - ); - - __asm__ volatile( - "movq (%1), %%xmm0 \n\t" - "movq 1(%1), %%xmm1 \n\t" - "punpcklbw %%xmm1, %%xmm0 \n\t" - "add %3, %1 \n\t" - "1: \n\t" - "movq (%1), %%xmm1 \n\t" - "movq 1(%1), %%xmm2 \n\t" - "movq (%1,%3), %%xmm3 \n\t" - "movq 1(%1,%3), %%xmm4 \n\t" - "lea (%1,%3,2), %1 \n\t" - "punpcklbw %%xmm2, %%xmm1 \n\t" - "punpcklbw %%xmm4, %%xmm3 \n\t" - "movdqa %%xmm1, %%xmm2 \n\t" - "movdqa %%xmm3, %%xmm4 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm6, %%xmm1 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - "pmaddubsw %%xmm6, %%xmm3 \n\t" - "paddw %%xmm5, %%xmm0 \n\t" - "paddw %%xmm5, %%xmm2 \n\t" - "paddw %%xmm0, %%xmm1 \n\t" - "paddw %%xmm2, %%xmm3 \n\t" - "movdqa %%xmm4, %%xmm0 \n\t" - "psrlw $6, %%xmm1 \n\t" - "psrlw $6, %%xmm3 \n\t" - AVG_OP("movq (%0), %%xmm2 \n\t") - AVG_OP("movhps (%0,%3), %%xmm2 \n\t") - "packuswb %%xmm3, %%xmm1 \n\t" - AVG_OP("pavgb %%xmm2, %%xmm1 \n\t") - "movq %%xmm1, (%0)\n\t" - "movhps %%xmm1, (%0,%3)\n\t" - "sub $2, %2 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - 
:"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); -} - -static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - __asm__ volatile( - "movd %0, %%mm7 \n\t" - "movd %1, %%mm6 \n\t" - "movq %2, %%mm5 \n\t" - "pshufw $0, %%mm7, %%mm7 \n\t" - "pshufw $0, %%mm6, %%mm6 \n\t" - :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32) - ); - - __asm__ volatile( - "movd (%1), %%mm0 \n\t" - "punpcklbw 1(%1), %%mm0 \n\t" - "add %3, %1 \n\t" - "1: \n\t" - "movd (%1), %%mm1 \n\t" - "movd (%1,%3), %%mm3 \n\t" - "punpcklbw 1(%1), %%mm1 \n\t" - "punpcklbw 1(%1,%3), %%mm3 \n\t" - "lea (%1,%3,2), %1 \n\t" - "movq %%mm1, %%mm2 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pmaddubsw %%mm7, %%mm0 \n\t" - "pmaddubsw %%mm6, %%mm1 \n\t" - "pmaddubsw %%mm7, %%mm2 \n\t" - "pmaddubsw %%mm6, %%mm3 \n\t" - "paddw %%mm5, %%mm0 \n\t" - "paddw %%mm5, %%mm2 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "paddw %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm0 \n\t" - "psrlw $6, %%mm1 \n\t" - "psrlw $6, %%mm3 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - "packuswb %%mm3, %%mm3 \n\t" - AVG_OP("pavgb (%0), %%mm1 \n\t") - AVG_OP("pavgb (%0,%3), %%mm3 \n\t") - "movd %%mm1, (%0)\n\t" - "movd %%mm3, (%0,%3)\n\t" - "sub $2, %2 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); -} - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.c --- a/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,821 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * MMX optimization by Nick Kurshev - */ - -#include "libavutil/x86_cpu.h" -#include "libavutil/internal.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/h264_dsp.h" -#include "dsputil_mmx.h" - - -//#undef NDEBUG -//#include - -int mm_flags; /* multimedia extension flags */ - -/* pixel operations */ -DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; - -DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = -{0x8000000080000000ULL, 0x8000000080000000ULL}; - -DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; -DECLARE_ALIGNED(16, 
const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; -DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; - -DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; - -DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; -DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; - -#define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t" -#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) -#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) - -#define MOVQ_BFE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ - "paddb %%" #regd ", %%" #regd " \n\t" ::) - -#ifndef PIC -#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) -#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) -#else -// for shared library it's better to use this way for accessing constants -// pcmpeqd -> -1 -#define MOVQ_BONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd " \n\t" \ - "packuswb %%" #regd ", %%" #regd " \n\t" ::) - -#define MOVQ_WTWO(regd) \ - __asm__ 
volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd " \n\t" \ - "psllw $1, %%" #regd " \n\t"::) - -#endif - -// using regr as temporary and for the output result -// first argument is unmodifed and second is trashed -// regfe is supposed to contain 0xfefefefefefefefe -#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ - "movq " #rega ", " #regr " \n\t"\ - "pand " #regb ", " #regr " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pand " #regfe "," #regb " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "paddb " #regb ", " #regr " \n\t" - -#define PAVGB_MMX(rega, regb, regr, regfe) \ - "movq " #rega ", " #regr " \n\t"\ - "por " #regb ", " #regr " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pand " #regfe "," #regb " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "psubb " #regb ", " #regr " \n\t" - -// mm6 is supposed to contain 0xfefefefefefefefe -#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ - "movq " #rega ", " #regr " \n\t"\ - "movq " #regc ", " #regp " \n\t"\ - "pand " #regb ", " #regr " \n\t"\ - "pand " #regd ", " #regp " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pxor " #regc ", " #regd " \n\t"\ - "pand %%mm6, " #regb " \n\t"\ - "pand %%mm6, " #regd " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "psrlq $1, " #regd " \n\t"\ - "paddb " #regb ", " #regr " \n\t"\ - "paddb " #regd ", " #regp " \n\t" - -#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ - "movq " #rega ", " #regr " \n\t"\ - "movq " #regc ", " #regp " \n\t"\ - "por " #regb ", " #regr " \n\t"\ - "por " #regd ", " #regp " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pxor " #regc ", " #regd " \n\t"\ - "pand %%mm6, " #regb " \n\t"\ - "pand %%mm6, " #regd " \n\t"\ - "psrlq $1, " #regd " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "psubb " #regb ", " #regr " \n\t"\ - "psubb " #regd ", " #regp " \n\t" - -/***********************************/ -/* MMX2 specific */ - -#define DEF(x) x ## _mmx2 - -/* Introduced only in MMX2 set */ -#define PAVGB "pavgb" -#define OP_AVG PAVGB - 
-#include "dsputil_mmx_avg_template.c" - -#undef DEF -#undef PAVGB -#undef OP_AVG - -#define put_no_rnd_pixels16_mmx put_pixels16_mmx -#define put_no_rnd_pixels8_mmx put_pixels8_mmx -#define put_pixels16_mmx2 put_pixels16_mmx -#define put_pixels8_mmx2 put_pixels8_mmx -#define put_pixels4_mmx2 put_pixels4_mmx -#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx -#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx -#define put_pixels16_3dnow put_pixels16_mmx -#define put_pixels8_3dnow put_pixels8_mmx -#define put_pixels4_3dnow put_pixels4_mmx -#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx -#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx - -/***********************************/ -/* standard MMX */ - -void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) -{ - const DCTELEM *p; - uint8_t *pix; - - /* read the pixels */ - p = block; - pix = pixels; - /* unrolled loop */ - __asm__ volatile( - "movq %3, %%mm0 \n\t" - "movq 8%3, %%mm1 \n\t" - "movq 16%3, %%mm2 \n\t" - "movq 24%3, %%mm3 \n\t" - "movq 32%3, %%mm4 \n\t" - "movq 40%3, %%mm5 \n\t" - "movq 48%3, %%mm6 \n\t" - "movq 56%3, %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) - :"memory"); - pix += line_size*4; - p += 32; - - // if here would be an exact copy of the code above - // compiler would generate some very strange code - // thus using "r" - __asm__ volatile( - "movq (%3), %%mm0 \n\t" - "movq 8(%3), %%mm1 \n\t" - "movq 16(%3), %%mm2 \n\t" - "movq 24(%3), %%mm3 \n\t" - "movq 32(%3), %%mm4 \n\t" - "movq 40(%3), %%mm5 \n\t" - "movq 48(%3), %%mm6 \n\t" - "movq 56(%3), %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb 
%%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) - :"memory"); -} - -DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] = - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; - -#define put_signed_pixels_clamped_mmx_half(off) \ - "movq "#off"(%2), %%mm1 \n\t"\ - "movq 16+"#off"(%2), %%mm2 \n\t"\ - "movq 32+"#off"(%2), %%mm3 \n\t"\ - "movq 48+"#off"(%2), %%mm4 \n\t"\ - "packsswb 8+"#off"(%2), %%mm1 \n\t"\ - "packsswb 24+"#off"(%2), %%mm2 \n\t"\ - "packsswb 40+"#off"(%2), %%mm3 \n\t"\ - "packsswb 56+"#off"(%2), %%mm4 \n\t"\ - "paddb %%mm0, %%mm1 \n\t"\ - "paddb %%mm0, %%mm2 \n\t"\ - "paddb %%mm0, %%mm3 \n\t"\ - "paddb %%mm0, %%mm4 \n\t"\ - "movq %%mm1, (%0) \n\t"\ - "movq %%mm2, (%0, %3) \n\t"\ - "movq %%mm3, (%0, %3, 2) \n\t"\ - "movq %%mm4, (%0, %1) \n\t" - -void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) -{ - x86_reg line_skip = line_size; - x86_reg line_skip3; - - __asm__ volatile ( - "movq "MANGLE(ff_vector128)", %%mm0 \n\t" - "lea (%3, %3, 2), %1 \n\t" - put_signed_pixels_clamped_mmx_half(0) - "lea (%0, %3, 4), %0 \n\t" - put_signed_pixels_clamped_mmx_half(64) - :"+&r" (pixels), "=&r" (line_skip3) - :"r" (block), "r"(line_skip) - :"memory"); -} - -void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) -{ - const DCTELEM *p; - uint8_t *pix; - int i; - - /* read the pixels */ - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - i = 4; - do { - __asm__ volatile( - "movq (%2), %%mm0 \n\t" - "movq 8(%2), %%mm1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "movq %0, %%mm4 \n\t" - "movq %1, %%mm6 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm4, %%mm0 \n\t" - "paddsw %%mm5, %%mm1 \n\t" - "movq %%mm6, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "punpckhbw %%mm7, 
%%mm5 \n\t" - "paddsw %%mm6, %%mm2 \n\t" - "paddsw %%mm5, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, %0 \n\t" - "movq %%mm2, %1 \n\t" - :"+m"(*pix), "+m"(*(pix+line_size)) - :"r"(p) - :"memory"); - pix += line_size*2; - p += 16; - } while (--i); -} - -static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - -static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "1: \n\t" - "movdqu (%1), %%xmm0 \n\t" - "movdqu (%1,%3), %%xmm1 \n\t" - "movdqu (%1,%3,2), %%xmm2 \n\t" - "movdqu (%1,%4), %%xmm3 \n\t" - "movdqa %%xmm0, (%2) \n\t" - "movdqa %%xmm1, (%2,%3) \n\t" - "movdqa %%xmm2, (%2,%3,2) \n\t" - "movdqa %%xmm3, (%2,%4) \n\t" - "subl $4, %0 \n\t" - "lea (%1,%3,4), %1 \n\t" - "lea (%2,%3,4), %2 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) - : "memory" - ); -} - -static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "1: \n\t" - "movdqu (%1), %%xmm0 \n\t" - "movdqu (%1,%3), %%xmm1 \n\t" - "movdqu (%1,%3,2), %%xmm2 \n\t" - "movdqu (%1,%4), %%xmm3 \n\t" - "pavgb (%2), %%xmm0 \n\t" - "pavgb (%2,%3), %%xmm1 \n\t" - "pavgb (%2,%3,2), %%xmm2 \n\t" - "pavgb (%2,%4), %%xmm3 \n\t" - "movdqa %%xmm0, (%2) \n\t" - "movdqa %%xmm1, (%2,%3) \n\t" - "movdqa %%xmm2, (%2,%3,2) 
\n\t" - "movdqa %%xmm3, (%2,%4) \n\t" - "subl $4, %0 \n\t" - "lea (%1,%3,4), %1 \n\t" - "lea (%2,%3,4), %2 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) - : "memory" - ); -} - -static void clear_block_sse(DCTELEM *block) -{ - __asm__ volatile( - "xorps %%xmm0, %%xmm0 \n" - "movaps %%xmm0, (%0) \n" - "movaps %%xmm0, 16(%0) \n" - "movaps %%xmm0, 32(%0) \n" - "movaps %%xmm0, 48(%0) \n" - "movaps %%xmm0, 64(%0) \n" - "movaps %%xmm0, 80(%0) \n" - "movaps %%xmm0, 96(%0) \n" - "movaps %%xmm0, 112(%0) \n" - :: "r"(block) - : "memory" - ); -} - -static void clear_blocks_sse(DCTELEM *blocks) -{\ - __asm__ volatile( - "xorps %%xmm0, %%xmm0 \n" - "mov %1, %%"REG_a" \n" - "1: \n" - "movaps %%xmm0, (%0, %%"REG_a") \n" - "movaps %%xmm0, 16(%0, %%"REG_a") \n" - "movaps %%xmm0, 32(%0, %%"REG_a") \n" - "movaps %%xmm0, 48(%0, %%"REG_a") \n" - "movaps %%xmm0, 64(%0, %%"REG_a") \n" - "movaps %%xmm0, 80(%0, %%"REG_a") \n" - "movaps %%xmm0, 96(%0, %%"REG_a") \n" - "movaps %%xmm0, 112(%0, %%"REG_a") \n" - "add $128, %%"REG_a" \n" - " js 1b \n" - : : "r" (((uint8_t *)blocks)+128*6), - "i" (-128*6) - : "%"REG_a - ); -} - -static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ - __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... 
- "movd %4, %%mm0 \n\t" - "movd %5, %%mm1 \n\t" - "movd %6, %%mm2 \n\t" - "movd %7, %%mm3 \n\t" - "punpcklbw %%mm1, %%mm0 \n\t" - "punpcklbw %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "punpcklwd %%mm2, %%mm0 \n\t" - "punpckhwd %%mm2, %%mm1 \n\t" - "movd %%mm0, %0 \n\t" - "punpckhdq %%mm0, %%mm0 \n\t" - "movd %%mm0, %1 \n\t" - "movd %%mm1, %2 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movd %%mm1, %3 \n\t" - - : "=m" (*(uint32_t*)(dst + 0*dst_stride)), - "=m" (*(uint32_t*)(dst + 1*dst_stride)), - "=m" (*(uint32_t*)(dst + 2*dst_stride)), - "=m" (*(uint32_t*)(dst + 3*dst_stride)) - : "m" (*(uint32_t*)(src + 0*src_stride)), - "m" (*(uint32_t*)(src + 1*src_stride)), - "m" (*(uint32_t*)(src + 2*src_stride)), - "m" (*(uint32_t*)(src + 3*src_stride)) - ); -} - -#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ -\ -static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ -}\ -\ -static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[8];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ -}\ -\ -static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ -}\ -\ -static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[8];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ -}\ -\ -static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[8];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ - OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ 
-}\ -\ -static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[8];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ - OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ -}\ -static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, 
int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ -}\ -static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ -}\ 
-static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[9];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ -}\ -static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\ -}\ -\ -static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[32];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ -}\ -\ -static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ -}\ -\ -static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[32];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ -}\ -\ -static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[32];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ - OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ -}\ -\ -static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[32];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ - OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ -}\ -static void OPNAME ## 
qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, 
stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[17*2];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ -}\ -static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[17*2];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ -}\ -static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[17*2];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ -} - -#define PUT_OP(a,b,temp, size) "mov" #size " " 
#a ", " #b " \n\t" -#define AVG_3DNOW_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgusb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" -#define AVG_MMX2_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" - -#define PREFETCH(name, op) \ -static void name(void *mem, int stride, int h){\ - const uint8_t *p= mem;\ - do{\ - __asm__ volatile(#op" %0" :: "m"(*p));\ - p+= stride;\ - }while(--h);\ -} -PREFETCH(prefetch_mmx2, prefetcht0) -#undef PREFETCH - -#include "h264dsp_mmx.c" - -void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); -void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); -void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); -void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); -void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); - -void dsputil_init_mmx(DSPContext* c) -{ - mm_flags = mm_support(); - - if (mm_flags & FF_MM_MMX) { - c->clear_block = clear_block_sse; - c->clear_blocks = clear_blocks_sse; - c->prefetch = prefetch_mmx2; - - -#define H264_QPEL_FUNCS(x, y, CPU)\ - c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\ - c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\ - c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\ - c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; - - if((mm_flags & FF_MM_SSE2)){ - c->put_pixels_tab[0][0] = put_pixels16_sse2; - c->avg_pixels_tab[0][0] = avg_pixels16_sse2; - - } - if(mm_flags & FF_MM_SSE2){ - H264_QPEL_FUNCS(0, 1, sse2); - H264_QPEL_FUNCS(0, 2, sse2); - H264_QPEL_FUNCS(0, 3, sse2); - H264_QPEL_FUNCS(1, 1, sse2); - H264_QPEL_FUNCS(1, 2, sse2); - H264_QPEL_FUNCS(1, 3, sse2); - H264_QPEL_FUNCS(2, 1, sse2); - 
H264_QPEL_FUNCS(2, 2, sse2); - H264_QPEL_FUNCS(2, 3, sse2); - H264_QPEL_FUNCS(3, 1, sse2); - H264_QPEL_FUNCS(3, 2, sse2); - H264_QPEL_FUNCS(3, 3, sse2); - } -#if HAVE_SSSE3 - if(mm_flags & FF_MM_SSSE3){ - H264_QPEL_FUNCS(1, 0, ssse3); - H264_QPEL_FUNCS(1, 1, ssse3); - H264_QPEL_FUNCS(1, 2, ssse3); - H264_QPEL_FUNCS(1, 3, ssse3); - H264_QPEL_FUNCS(2, 0, ssse3); - H264_QPEL_FUNCS(2, 1, ssse3); - H264_QPEL_FUNCS(2, 2, ssse3); - H264_QPEL_FUNCS(2, 3, ssse3); - H264_QPEL_FUNCS(3, 0, ssse3); - H264_QPEL_FUNCS(3, 1, ssse3); - H264_QPEL_FUNCS(3, 2, ssse3); - H264_QPEL_FUNCS(3, 3, ssse3); - - c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd; - c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd; - c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; - c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; - } -#endif - - - } -} - -void ff_h264dsp_init_x86(H264DSPContext *c) -{ - mm_flags = mm_support(); - - if (mm_flags & FF_MM_MMX) { - c->h264_idct_dc_add= - c->h264_idct_add= ff_h264_idct_add_mmx; - c->h264_idct8_dc_add= - c->h264_idct8_add= ff_h264_idct8_add_mmx; - - if (mm_flags & FF_MM_MMX2) { - c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; - c->h264_idct_add8 = ff_h264_idct_add8_mmx2; - c->h264_idct_add16 = ff_h264_idct_add16_mmx2; - c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; - - c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; - c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; - - c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; - c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; - c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; - c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; - c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; - c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; - c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; - - c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; - 
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; - c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; - c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; - c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; - c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; - c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; - c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; - - c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; - c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; - c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; - c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; - c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; - c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; - c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; - } - if(mm_flags & FF_MM_SSE2){ - c->h264_idct8_add = ff_h264_idct8_add_sse2; - c->h264_idct8_add4= ff_h264_idct8_add4_sse2; - } - - } -} - diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.h --- a/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,170 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2007 Aurelien Jacobs - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_DSPUTIL_MMX_H -#define AVCODEC_X86_DSPUTIL_MMX_H - -#include -#include "libavcodec/dsputil.h" - -typedef struct { uint64_t a, b; } xmm_reg; - -extern const uint64_t ff_bone; -extern const uint64_t ff_wtwo; - -extern const uint64_t ff_pdw_80000000[2]; - -extern const uint64_t ff_pw_3; -extern const uint64_t ff_pw_4; -extern const xmm_reg ff_pw_5; -extern const xmm_reg ff_pw_8; -extern const uint64_t ff_pw_15; -extern const xmm_reg ff_pw_16; -extern const uint64_t ff_pw_20; -extern const xmm_reg ff_pw_28; -extern const xmm_reg ff_pw_32; -extern const uint64_t ff_pw_42; -extern const xmm_reg ff_pw_64; -extern const uint64_t ff_pw_96; -extern const uint64_t ff_pw_128; -extern const uint64_t ff_pw_255; - -extern const uint64_t ff_pb_1; -extern const uint64_t ff_pb_3; -extern const uint64_t ff_pb_7; -extern const uint64_t ff_pb_1F; -extern const uint64_t ff_pb_3F; -extern const uint64_t ff_pb_81; -extern const uint64_t ff_pb_A1; -extern const uint64_t ff_pb_FC; - -extern const double ff_pd_1[2]; -extern const double ff_pd_2[2]; - -#define LOAD4(stride,in,a,b,c,d)\ - "movq 0*"#stride"+"#in", "#a"\n\t"\ - "movq 1*"#stride"+"#in", "#b"\n\t"\ - "movq 2*"#stride"+"#in", "#c"\n\t"\ - "movq 3*"#stride"+"#in", "#d"\n\t" - -#define STORE4(stride,out,a,b,c,d)\ - "movq "#a", 0*"#stride"+"#out"\n\t"\ - "movq "#b", 1*"#stride"+"#out"\n\t"\ - "movq "#c", 2*"#stride"+"#out"\n\t"\ - "movq "#d", 3*"#stride"+"#out"\n\t" - -/* in/out: mma=mma+mmb, mmb=mmb-mma */ -#define SUMSUB_BA( a, b ) \ - "paddw "#b", "#a" \n\t"\ - "paddw "#b", "#b" \n\t"\ - "psubw "#a", "#b" \n\t" - -#define SBUTTERFLY(a,b,t,n,m)\ - "mov" #m " " #a ", " #t " \n\t" /* abcd */\ - "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ - "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ 
- -#define TRANSPOSE4(a,b,c,d,t)\ - SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\ - SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\ - SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ - SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ - -// e,f,g,h can be memory -// out: a,d,t,c -#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\ - "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\ - "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\ - "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 d3 g3 */\ - "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\ - SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\ - /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\ - SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\ - /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\ - SBUTTERFLY(a, c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\ - /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\ - SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\ - /* c= a3 b3 c3 d3 e3 f3 g3 h3 */ - -#if ARCH_X86_64 -// permutes 01234567 -> 05736421 -#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ - SBUTTERFLY(a,b,%%xmm8,wd,dqa)\ - SBUTTERFLY(c,d,b,wd,dqa)\ - SBUTTERFLY(e,f,d,wd,dqa)\ - SBUTTERFLY(g,h,f,wd,dqa)\ - SBUTTERFLY(a,c,h,dq,dqa)\ - SBUTTERFLY(%%xmm8,b,c,dq,dqa)\ - SBUTTERFLY(e,g,b,dq,dqa)\ - SBUTTERFLY(d,f,g,dq,dqa)\ - SBUTTERFLY(a,e,f,qdq,dqa)\ - SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\ - SBUTTERFLY(h,b,d,qdq,dqa)\ - SBUTTERFLY(c,g,b,qdq,dqa)\ - "movdqa %%xmm8, "#g" \n\t" -#else -#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ - "movdqa "#h", "#t" \n\t"\ - SBUTTERFLY(a,b,h,wd,dqa)\ - "movdqa "#h", 16"#t" \n\t"\ - "movdqa "#t", "#h" \n\t"\ - SBUTTERFLY(c,d,b,wd,dqa)\ - SBUTTERFLY(e,f,d,wd,dqa)\ - SBUTTERFLY(g,h,f,wd,dqa)\ - SBUTTERFLY(a,c,h,dq,dqa)\ - "movdqa "#h", "#t" \n\t"\ - "movdqa 16"#t", "#h" \n\t"\ - SBUTTERFLY(h,b,c,dq,dqa)\ - SBUTTERFLY(e,g,b,dq,dqa)\ - SBUTTERFLY(d,f,g,dq,dqa)\ - SBUTTERFLY(a,e,f,qdq,dqa)\ - SBUTTERFLY(h,d,e,qdq,dqa)\ - "movdqa "#h", 16"#t" \n\t"\ - "movdqa "#t", "#h" \n\t"\ - SBUTTERFLY(h,b,d,qdq,dqa)\ - 
SBUTTERFLY(c,g,b,qdq,dqa)\ - "movdqa 16"#t", "#g" \n\t" -#endif - -#define MOVQ_WONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd ::) - -void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); -void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); -void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); - -void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); -void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); -void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); -void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); - -void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd); -void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd); - -void ff_lpc_compute_autocorr_sse2(const int32_t *data, int len, int lag, - double *autoc); - -void ff_mmx_idct(DCTELEM *block); -void ff_mmxext_idct(DCTELEM *block); - -#endif /* AVCODEC_X86_DSPUTIL_MMX_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx_avg_template.c --- a/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx_avg_template.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,250 +0,0 @@ -/* - * DSP utils : average functions are compiled twice for 3dnow/mmx2 - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * MMX optimization by Nick Kurshev - * mostly rewritten by Michael Niedermayer - * and improved by Zdenek Kabelac - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - - -static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" - - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) - - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" - - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%2), %%mm0 \n\t" - "movq (%2, %3), %%mm1 \n\t" - PAVGB" (%1), %%mm0 \n\t" - PAVGB" (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%2), %%mm0 \n\t" - "movq (%2, %3), %%mm1 \n\t" - PAVGB" (%1), %%mm0 \n\t" - PAVGB" (%1, %3), %%mm1 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/h264dsp_mmx.c --- a/ffmpeg_smp/h264dec/libavcodec/x86/h264dsp_mmx.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1741 +0,0 @@ -/* - * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil_mmx.h" - -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; -DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; - -/***********************************/ -/* IDCT */ - -#define SUMSUB_BADC( a, b, c, d ) \ - "paddw "#b", "#a" \n\t"\ - "paddw "#d", "#c" \n\t"\ - "paddw "#b", "#b" \n\t"\ - "paddw "#d", "#d" \n\t"\ - "psubw "#a", "#b" \n\t"\ - "psubw "#c", "#d" \n\t" - -#define SUMSUBD2_AB( a, b, t ) \ - "movq "#b", "#t" \n\t"\ - "psraw $1 , "#b" \n\t"\ - "paddw "#a", "#b" \n\t"\ - "psraw $1 , "#a" \n\t"\ - "psubw "#t", "#a" \n\t" - -#define IDCT4_1D( s02, s13, d02, d13, t ) \ - SUMSUB_BA ( s02, d02 )\ - SUMSUBD2_AB( s13, d13, t )\ - SUMSUB_BADC( d13, s02, s13, d02 ) - -#define STORE_DIFF_4P( p, t, z ) \ - "psraw $6, "#p" \n\t"\ - "movd (%0), "#t" \n\t"\ - "punpcklbw "#z", "#t" \n\t"\ - "paddsw "#t", "#p" \n\t"\ - "packuswb "#z", "#p" \n\t"\ - "movd "#p", (%0) \n\t" - -static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) -{ - /* Load dct coeffs */ - __asm__ volatile( - "movq (%0), %%mm0 \n\t" - "movq 8(%0), %%mm1 \n\t" - "movq 16(%0), %%mm2 \n\t" - "movq 24(%0), %%mm3 \n\t" - :: "r"(block) ); - - __asm__ volatile( - /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ - IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) - - "movq %0, %%mm6 \n\t" - /* in: 1,4,0,2 out: 1,2,3,0 */ - TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) - - "paddw %%mm6, %%mm3 \n\t" - - /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ - IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) - - "pxor %%mm7, %%mm7 \n\t" - :: "m"(ff_pw_32)); - - __asm__ volatile( - STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) - "add %1, %0 \n\t" - STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) - "add %1, %0 \n\t" - 
STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) - "add %1, %0 \n\t" - STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) - : "+r"(dst) - : "r" ((x86_reg)stride) - ); -} - -static inline void h264_idct8_1d(int16_t *block) -{ - __asm__ volatile( - "movq 112(%0), %%mm7 \n\t" - "movq 80(%0), %%mm0 \n\t" - "movq 48(%0), %%mm3 \n\t" - "movq 16(%0), %%mm5 \n\t" - - "movq %%mm0, %%mm4 \n\t" - "movq %%mm5, %%mm1 \n\t" - "psraw $1, %%mm4 \n\t" - "psraw $1, %%mm1 \n\t" - "paddw %%mm0, %%mm4 \n\t" - "paddw %%mm5, %%mm1 \n\t" - "paddw %%mm7, %%mm4 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "psubw %%mm5, %%mm4 \n\t" - "paddw %%mm3, %%mm1 \n\t" - - "psubw %%mm3, %%mm5 \n\t" - "psubw %%mm3, %%mm0 \n\t" - "paddw %%mm7, %%mm5 \n\t" - "psubw %%mm7, %%mm0 \n\t" - "psraw $1, %%mm3 \n\t" - "psraw $1, %%mm7 \n\t" - "psubw %%mm3, %%mm5 \n\t" - "psubw %%mm7, %%mm0 \n\t" - - "movq %%mm4, %%mm3 \n\t" - "movq %%mm1, %%mm7 \n\t" - "psraw $2, %%mm1 \n\t" - "psraw $2, %%mm3 \n\t" - "paddw %%mm5, %%mm3 \n\t" - "psraw $2, %%mm5 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "psraw $2, %%mm0 \n\t" - "psubw %%mm4, %%mm5 \n\t" - "psubw %%mm0, %%mm7 \n\t" - - "movq 32(%0), %%mm2 \n\t" - "movq 96(%0), %%mm6 \n\t" - "movq %%mm2, %%mm4 \n\t" - "movq %%mm6, %%mm0 \n\t" - "psraw $1, %%mm4 \n\t" - "psraw $1, %%mm6 \n\t" - "psubw %%mm0, %%mm4 \n\t" - "paddw %%mm2, %%mm6 \n\t" - - "movq (%0), %%mm2 \n\t" - "movq 64(%0), %%mm0 \n\t" - SUMSUB_BA( %%mm0, %%mm2 ) - SUMSUB_BA( %%mm6, %%mm0 ) - SUMSUB_BA( %%mm4, %%mm2 ) - SUMSUB_BA( %%mm7, %%mm6 ) - SUMSUB_BA( %%mm5, %%mm4 ) - SUMSUB_BA( %%mm3, %%mm2 ) - SUMSUB_BA( %%mm1, %%mm0 ) - :: "r"(block) - ); -} - -static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) -{ - int i; - DECLARE_ALIGNED(8, int16_t, b2)[64]; - - block[0] += 32; - - for(i=0; i<2; i++){ - DECLARE_ALIGNED(8, uint64_t, tmp); - - h264_idct8_1d(block+4*i); - - __asm__ volatile( - "movq %%mm7, %0 \n\t" - TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) - "movq %%mm0, 8(%1) \n\t" - "movq %%mm6, 24(%1) \n\t" - "movq %%mm7, 40(%1) 
\n\t" - "movq %%mm4, 56(%1) \n\t" - "movq %0, %%mm7 \n\t" - TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) - "movq %%mm7, (%1) \n\t" - "movq %%mm1, 16(%1) \n\t" - "movq %%mm0, 32(%1) \n\t" - "movq %%mm3, 48(%1) \n\t" - : "=m"(tmp) - : "r"(b2+32*i) - : "memory" - ); - } - - for(i=0; i<2; i++){ - h264_idct8_1d(b2+4*i); - - __asm__ volatile( - "psraw $6, %%mm7 \n\t" - "psraw $6, %%mm6 \n\t" - "psraw $6, %%mm5 \n\t" - "psraw $6, %%mm4 \n\t" - "psraw $6, %%mm3 \n\t" - "psraw $6, %%mm2 \n\t" - "psraw $6, %%mm1 \n\t" - "psraw $6, %%mm0 \n\t" - - "movq %%mm7, (%0) \n\t" - "movq %%mm5, 16(%0) \n\t" - "movq %%mm3, 32(%0) \n\t" - "movq %%mm1, 48(%0) \n\t" - "movq %%mm0, 64(%0) \n\t" - "movq %%mm2, 80(%0) \n\t" - "movq %%mm4, 96(%0) \n\t" - "movq %%mm6, 112(%0) \n\t" - :: "r"(b2+4*i) - : "memory" - ); - } - - add_pixels_clamped_mmx(b2, dst, stride); -} - -#define STORE_DIFF_8P( p, d, t, z )\ - "movq "#d", "#t" \n"\ - "psraw $6, "#p" \n"\ - "punpcklbw "#z", "#t" \n"\ - "paddsw "#t", "#p" \n"\ - "packuswb "#p", "#p" \n"\ - "movq "#p", "#d" \n" - -#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\ - "movdqa "#c", "#a" \n"\ - "movdqa "#g", "#e" \n"\ - "psraw $1, "#c" \n"\ - "psraw $1, "#g" \n"\ - "psubw "#e", "#c" \n"\ - "paddw "#a", "#g" \n"\ - "movdqa "#b", "#e" \n"\ - "psraw $1, "#e" \n"\ - "paddw "#b", "#e" \n"\ - "paddw "#d", "#e" \n"\ - "paddw "#f", "#e" \n"\ - "movdqa "#f", "#a" \n"\ - "psraw $1, "#a" \n"\ - "paddw "#f", "#a" \n"\ - "paddw "#h", "#a" \n"\ - "psubw "#b", "#a" \n"\ - "psubw "#d", "#b" \n"\ - "psubw "#d", "#f" \n"\ - "paddw "#h", "#b" \n"\ - "psubw "#h", "#f" \n"\ - "psraw $1, "#d" \n"\ - "psraw $1, "#h" \n"\ - "psubw "#d", "#b" \n"\ - "psubw "#h", "#f" \n"\ - "movdqa "#e", "#d" \n"\ - "movdqa "#a", "#h" \n"\ - "psraw $2, "#d" \n"\ - "psraw $2, "#h" \n"\ - "paddw "#f", "#d" \n"\ - "paddw "#b", "#h" \n"\ - "psraw $2, "#f" \n"\ - "psraw $2, "#b" \n"\ - "psubw "#f", "#e" \n"\ - "psubw "#a", "#b" \n"\ - "movdqa 0x00(%1), "#a" \n"\ - "movdqa 0x40(%1), "#f" \n"\ - 
SUMSUB_BA(f, a)\ - SUMSUB_BA(g, f)\ - SUMSUB_BA(c, a)\ - SUMSUB_BA(e, g)\ - SUMSUB_BA(b, c)\ - SUMSUB_BA(h, a)\ - SUMSUB_BA(d, f) - -static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) -{ - __asm__ volatile( - "movdqa 0x10(%1), %%xmm1 \n" - "movdqa 0x20(%1), %%xmm2 \n" - "movdqa 0x30(%1), %%xmm3 \n" - "movdqa 0x50(%1), %%xmm5 \n" - "movdqa 0x60(%1), %%xmm6 \n" - "movdqa 0x70(%1), %%xmm7 \n" - H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) - TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1)) - "paddw %4, %%xmm4 \n" - "movdqa %%xmm4, 0x00(%1) \n" - "movdqa %%xmm2, 0x40(%1) \n" - H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1) - "movdqa %%xmm6, 0x60(%1) \n" - "movdqa %%xmm7, 0x70(%1) \n" - "pxor %%xmm7, %%xmm7 \n" - STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7) - "lea (%0,%2,4), %0 \n" - STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7) - "movdqa 0x60(%1), %%xmm0 \n" - "movdqa 0x70(%1), %%xmm1 \n" - STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7) - :"+r"(dst) - :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32) - ); -} - -static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) -{ - int dc = (block[0] + 32) >> 6; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - __asm__ volatile( - "movd %0, %%mm2 \n\t" - "movd %1, %%mm3 \n\t" - "movd %2, %%mm4 \n\t" - "movd %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb 
%%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movd %%mm2, %0 \n\t" - "movd %%mm3, %1 \n\t" - "movd %%mm4, %2 \n\t" - "movd %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dst+0*stride)), - "+m"(*(uint32_t*)(dst+1*stride)), - "+m"(*(uint32_t*)(dst+2*stride)), - "+m"(*(uint32_t*)(dst+3*stride)) - ); -} - -static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) -{ - int dc = (block[0] + 32) >> 6; - int y; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - for(y=2; y--; dst += 4*stride){ - __asm__ volatile( - "movq %0, %%mm2 \n\t" - "movq %1, %%mm3 \n\t" - "movq %2, %%mm4 \n\t" - "movq %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movq %%mm2, %0 \n\t" - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %3 \n\t" - :"+m"(*(uint64_t*)(dst+0*stride)), - "+m"(*(uint64_t*)(dst+1*stride)), - "+m"(*(uint64_t*)(dst+2*stride)), - "+m"(*(uint64_t*)(dst+3*stride)) - ); - } -} - -//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split -static const uint8_t scan8[16 + 2*4]={ - 4+1*8, 5+1*8, 4+2*8, 5+2*8, - 6+1*8, 7+1*8, 6+2*8, 7+2*8, - 4+3*8, 5+3*8, 4+4*8, 5+4*8, - 6+3*8, 7+3*8, 6+4*8, 7+4*8, - 1+1*8, 2+1*8, - 1+2*8, 2+2*8, - 1+4*8, 2+4*8, - 1+5*8, 2+5*8, -}; - -static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - else 
ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); - else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - } -} - -static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=4){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=4){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=16; i<16+8; i++){ - if(nnzc[ scan8[i] ]) - ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - else if(block[i*16]) - ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - } -} - -/***********************************/ -/* deblocking */ - -// out: o = |x-y|>a -// clobbers: t -#define DIFF_GT_MMX(x,y,a,o,t)\ - "movq "#y", "#t" \n\t"\ - "movq "#x", "#o" \n\t"\ - "psubusb "#x", "#t" \n\t"\ - "psubusb "#y", "#o" \n\t"\ - "por "#t", "#o" \n\t"\ - "psubusb "#a", "#o" \n\t" - -// out: o = |x-y|>a -// clobbers: t -#define DIFF_GT2_MMX(x,y,a,o,t)\ - "movq "#y", "#t" 
\n\t"\ - "movq "#x", "#o" \n\t"\ - "psubusb "#x", "#t" \n\t"\ - "psubusb "#y", "#o" \n\t"\ - "psubusb "#a", "#t" \n\t"\ - "psubusb "#a", "#o" \n\t"\ - "pcmpeqb "#t", "#o" \n\t"\ - -// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 -// out: mm5=beta-1, mm7=mask -// clobbers: mm4,mm6 -#define H264_DEBLOCK_MASK(alpha1, beta1) \ - "pshufw $0, "#alpha1", %%mm4 \n\t"\ - "pshufw $0, "#beta1 ", %%mm5 \n\t"\ - "packuswb %%mm4, %%mm4 \n\t"\ - "packuswb %%mm5, %%mm5 \n\t"\ - DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\ - DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\ - "por %%mm4, %%mm7 \n\t"\ - DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\ - "por %%mm4, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pcmpeqb %%mm6, %%mm7 \n\t" - -// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) -// out: mm1=p0' mm2=q0' -// clobbers: mm0,3-6 -#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ - "movq %%mm1 , %%mm5 \n\t"\ - "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\ - "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\ - "pcmpeqb %%mm4 , %%mm4 \n\t"\ - "pxor %%mm4 , %%mm3 \n\t"\ - "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\ - "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ - "pxor %%mm1 , %%mm4 \n\t"\ - "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\ - "pavgb %%mm5 , %%mm3 \n\t"\ - "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\ - "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\ - "psubusb %%mm3 , %%mm6 \n\t"\ - "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\ - "pminub %%mm7 , %%mm6 \n\t"\ - "pminub %%mm7 , %%mm3 \n\t"\ - "psubusb %%mm6 , %%mm1 \n\t"\ - "psubusb %%mm3 , %%mm2 \n\t"\ - "paddusb %%mm3 , %%mm1 \n\t"\ - "paddusb %%mm6 , %%mm2 \n\t" - -// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone -// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) -// clobbers: q2, tmp, tc0 -#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ - "movq %%mm1, "#tmp" \n\t"\ - "pavgb %%mm2, "#tmp" \n\t"\ - 
"pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ - "pxor "q2addr", "#tmp" \n\t"\ - "pand %9, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ - "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ - "movq "#p1", "#tmp" \n\t"\ - "psubusb "#tc0", "#tmp" \n\t"\ - "paddusb "#p1", "#tc0" \n\t"\ - "pmaxub "#tmp", "#q2" \n\t"\ - "pminub "#tc0", "#q2" \n\t"\ - "movq "#q2", "q1addr" \n\t" - -static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) -{ - DECLARE_ALIGNED(8, uint64_t, tmp0)[2]; - - __asm__ volatile( - "movq (%2,%4), %%mm0 \n\t" //p1 - "movq (%2,%4,2), %%mm1 \n\t" //p0 - "movq (%3), %%mm2 \n\t" //q0 - "movq (%3,%4), %%mm3 \n\t" //q1 - H264_DEBLOCK_MASK(%7, %8) - - "movd %6, %%mm4 \n\t" - "punpcklbw %%mm4, %%mm4 \n\t" - "punpcklwd %%mm4, %%mm4 \n\t" - "pcmpeqb %%mm3, %%mm3 \n\t" - "movq %%mm4, %%mm6 \n\t" - "pcmpgtb %%mm3, %%mm4 \n\t" - "movq %%mm6, %1 \n\t" - "pand %%mm4, %%mm7 \n\t" - "movq %%mm7, %0 \n\t" - - /* filter p1 */ - "movq (%2), %%mm3 \n\t" //p2 - DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 - "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|beta-1 - "pand %0, %%mm6 \n\t" - "movq %1, %%mm5 \n\t" // can be merged with the and below but is slower then - "pand %%mm6, %%mm5 \n\t" - "psubb %%mm6, %%mm7 \n\t" - "movq (%3,%4), %%mm3 \n\t" - H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6) - - /* filter p0, q0 */ - H264_DEBLOCK_P0_Q0(%9, unused) - "movq %%mm1, (%2,%4,2) \n\t" - "movq %%mm2, (%3) \n\t" - - : "=m"(tmp0[0]), "=m"(tmp0[1]) - : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), - "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), - "m"(ff_bone) - ); -} - -static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - if((tc0[0] & tc0[1]) >= 0) - h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); - if((tc0[2] & tc0[3]) >= 0) - h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); 
-} -static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - //FIXME: could cut some load/stores by merging transpose with filter - // also, it only needs to transpose 6x8 - DECLARE_ALIGNED(8, uint8_t, trans)[8*8]; - int i; - for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { - if((tc0[0] & tc0[1]) < 0) - continue; - transpose4x4(trans, pix-4, 8, stride); - transpose4x4(trans +4*8, pix, 8, stride); - transpose4x4(trans+4, pix-4+4*stride, 8, stride); - transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); - h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); - transpose4x4(pix-2, trans +2*8, stride, 8); - transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8); - } -} - -static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) -{ - __asm__ volatile( - "movq (%0), %%mm0 \n\t" //p1 - "movq (%0,%2), %%mm1 \n\t" //p0 - "movq (%1), %%mm2 \n\t" //q0 - "movq (%1,%2), %%mm3 \n\t" //q1 - H264_DEBLOCK_MASK(%4, %5) - "movd %3, %%mm6 \n\t" - "punpcklbw %%mm6, %%mm6 \n\t" - "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask - H264_DEBLOCK_P0_Q0(%6, %7) - "movq %%mm1, (%0,%2) \n\t" - "movq %%mm2, (%1) \n\t" - - :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), - "r"(*(uint32_t*)tc0), - "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F) - ); -} - -static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); -} - -static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - //FIXME: could cut some load/stores by merging transpose with filter - DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; - transpose4x4(trans, pix-2, 8, stride); - transpose4x4(trans+4, pix-2+4*stride, 8, stride); - h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); - transpose4x4(pix-2, trans, stride, 8); - transpose4x4(pix-2+4*stride, trans+4, 
stride, 8); -} - -// p0 = (p0 + q1 + 2*p1 + 2) >> 2 -#define H264_FILTER_CHROMA4(p0, p1, q1, one) \ - "movq "#p0", %%mm4 \n\t"\ - "pxor "#q1", %%mm4 \n\t"\ - "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\ - "pavgb "#q1", "#p0" \n\t"\ - "psubusb %%mm4, "#p0" \n\t"\ - "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\ - -static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) -{ - __asm__ volatile( - "movq (%0), %%mm0 \n\t" - "movq (%0,%2), %%mm1 \n\t" - "movq (%1), %%mm2 \n\t" - "movq (%1,%2), %%mm3 \n\t" - H264_DEBLOCK_MASK(%3, %4) - "movq %%mm1, %%mm5 \n\t" - "movq %%mm2, %%mm6 \n\t" - H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0' - H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0' - "psubb %%mm5, %%mm1 \n\t" - "psubb %%mm6, %%mm2 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm7, %%mm2 \n\t" - "paddb %%mm5, %%mm1 \n\t" - "paddb %%mm6, %%mm2 \n\t" - "movq %%mm1, (%0,%2) \n\t" - "movq %%mm2, (%1) \n\t" - :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), - "m"(alpha1), "m"(beta1), "m"(ff_bone) - ); -} - -static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) -{ - h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); -} - -static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) -{ - //FIXME: could cut some load/stores by merging transpose with filter - DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; - transpose4x4(trans, pix-2, 8, stride); - transpose4x4(trans+4, pix-2+4*stride, 8, stride); - h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); - transpose4x4(pix-2, trans, stride, 8); - transpose4x4(pix-2+4*stride, trans+4, stride, 8); -} - -static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], - int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { - int dir; - __asm__ volatile( - "movq %0, %%mm7 \n" - 
"movq %1, %%mm6 \n" - ::"m"(ff_pb_1), "m"(ff_pb_3) - ); - if(field) - __asm__ volatile( - "movq %0, %%mm6 \n" - ::"m"(ff_pb_3_1) - ); - __asm__ volatile( - "movq %%mm6, %%mm5 \n" - "paddb %%mm5, %%mm5 \n" - :); - - // could do a special case for dir==0 && edges==1, but it only reduces the - // average filter time by 1.2% - for( dir=1; dir>=0; dir-- ) { - const x86_reg d_idx = dir ? -8 : -1; - const int mask_mv = dir ? mask_mv1 : mask_mv0; - DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; - int b_idx, edge; - for( b_idx=12, edge=0; edge= limit - "psubusb %%mm5, %%mm3 \n" - "packsswb %%mm3, %%mm1 \n" - "add $40, %0 \n" - "cmp $40, %0 \n" - "jl 1b \n" - "sub $80, %0 \n" - "pshufw $0x4E, %%mm1, %%mm1 \n" - "por %%mm1, %%mm0 \n" - "pshufw $0x4E, %%mm0, %%mm1 \n" - "pminub %%mm1, %%mm0 \n" - ::"r"(d_idx), - "r"(ref[0]+b_idx), - "r"(mv[0]+b_idx) - ); - } else { - __asm__ volatile( - "movd (%1), %%mm0 \n" - "psubb (%1,%0), %%mm0 \n" // ref[b] != ref[bn] - "movq (%2), %%mm1 \n" - "movq 8(%2), %%mm2 \n" - "psubw (%2,%0,4), %%mm1 \n" - "psubw 8(%2,%0,4), %%mm2 \n" - "packsswb %%mm2, %%mm1 \n" - "paddb %%mm6, %%mm1 \n" - "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit - "packsswb %%mm1, %%mm1 \n" - "por %%mm1, %%mm0 \n" - ::"r"(d_idx), - "r"(ref[0]+b_idx), - "r"(mv[0]+b_idx) - ); - } - } - __asm__ volatile( - "movd %0, %%mm1 \n" - "por %1, %%mm1 \n" // nnz[b] || nnz[bn] - ::"m"(nnz[b_idx]), - "m"(nnz[b_idx+d_idx]) - ); - __asm__ volatile( - "pminub %%mm7, %%mm1 \n" - "pminub %%mm7, %%mm0 \n" - "psllw $1, %%mm1 \n" - "pxor %%mm2, %%mm2 \n" - "pmaxub %%mm0, %%mm1 \n" - "punpcklbw %%mm2, %%mm1 \n" - "movq %%mm1, %0 \n" - :"=m"(*bS[dir][edge]) - ::"memory" - ); - } - edges = 4; - step = 1; - } - __asm__ volatile( - "movq (%0), %%mm0 \n\t" - "movq 8(%0), %%mm1 \n\t" - "movq 16(%0), %%mm2 \n\t" - "movq 24(%0), %%mm3 \n\t" - TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) - "movq %%mm0, (%0) \n\t" - "movq %%mm3, 8(%0) \n\t" - "movq %%mm4, 
16(%0) \n\t" - "movq %%mm2, 24(%0) \n\t" - ::"r"(bS[0]) - :"memory" - ); -} - -/***********************************/ -/* motion compensation */ - -#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ - "mov"#q" "#C", "#T" \n\t"\ - "mov"#d" (%0), "#F" \n\t"\ - "paddw "#D", "#T" \n\t"\ - "psllw $2, "#T" \n\t"\ - "psubw "#B", "#T" \n\t"\ - "psubw "#E", "#T" \n\t"\ - "punpcklbw "#Z", "#F" \n\t"\ - "pmullw %4, "#T" \n\t"\ - "paddw %5, "#A" \n\t"\ - "add %2, %0 \n\t"\ - "paddw "#F", "#A" \n\t"\ - "paddw "#A", "#T" \n\t"\ - "psraw $5, "#T" \n\t"\ - "packuswb "#T", "#T" \n\t"\ - OP(T, (%1), A, d)\ - "add %3, %1 \n\t" - -#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ - "mov"#q" "#C", "#T" \n\t"\ - "mov"#d" (%0), "#F" \n\t"\ - "paddw "#D", "#T" \n\t"\ - "psllw $2, "#T" \n\t"\ - "paddw %4, "#A" \n\t"\ - "psubw "#B", "#T" \n\t"\ - "psubw "#E", "#T" \n\t"\ - "punpcklbw "#Z", "#F" \n\t"\ - "pmullw %3, "#T" \n\t"\ - "paddw "#F", "#A" \n\t"\ - "add %2, %0 \n\t"\ - "paddw "#A", "#T" \n\t"\ - "mov"#q" "#T", "#OF"(%1) \n\t" - -#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) -#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) -#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) -#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) - - -#define QPEL_H264(OPNAME, OP, MMX)\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %0, %%mm6 \n\t"\ - :: "m"(ff_pw_5)\ - );\ - do{\ - __asm__ volatile(\ - "movq (%0), %%mm0 \n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - 
"psllw $2, %%mm0 \n\t"\ - "psllw $2, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movd -2(%0), %%mm2 \n\t"\ - "movd 7(%0), %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "movq %5, %%mm5 \n\t"\ - "paddw %%mm5, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm4, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "movq (%2), %%mm4 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - PAVGB" %%mm4, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q)\ - "add %4, %0 \n\t"\ - "add %4, %1 \n\t"\ - "add %3, %2 \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_16)\ - : "memory"\ - );\ - }while(--h);\ -}\ -\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ - int w = size>>4;\ - do{\ - int h = size;\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm3 \n\t"\ - "movq 2(%0), %%mm1 \n\t"\ - "movq 10(%0), %%mm4 \n\t"\ - "paddw %%mm4, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "paddw 18(%0), %%mm3 \n\t"\ - "paddw 16(%0), %%mm4 \n\t"\ - "movq 4(%0), %%mm2 \n\t"\ - "movq 12(%0), %%mm5 \n\t"\ - "paddw 6(%0), %%mm2 \n\t"\ - "paddw 14(%0), %%mm5 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"\ - "psubw %%mm4, %%mm3 \n\t"\ - "psraw $2, %%mm0 \n\t"\ - "psraw $2, %%mm3 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"\ - "psubw %%mm4, %%mm3 \n\t"\ - "paddsw %%mm2, %%mm0 \n\t"\ - "paddsw %%mm5, %%mm3 \n\t"\ - "psraw $2, %%mm0 
\n\t"\ - "psraw $2, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm5, %%mm3 \n\t"\ - "psraw $6, %%mm0 \n\t"\ - "psraw $6, %%mm3 \n\t"\ - "packuswb %%mm3, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm7, q)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - tmp += 8 - size*24;\ - dst += 8 - size*dstStride;\ - }while(w--);\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ - src += 8*dstStride;\ - dst += 8*dstStride;\ - src2 += 8*src2Stride;\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ -}\ -static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - do{\ - __asm__ volatile(\ - "movq (%1), %%mm0 \n\t"\ - "movq 8(%1), %%mm1 \n\t"\ - "movq 48(%1), %%mm2 \n\t"\ - "movq 8+48(%1), %%mm3 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "psraw $5, %%mm2 \n\t"\ - "psraw $5, %%mm3 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - "packuswb %%mm3, %%mm2 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm2 \n\t"\ - OP(%%mm0, (%2), %%mm5, q)\ - OP(%%mm2, (%2,%4), %%mm5, q)\ - ::"a"(src8), "c"(src16), "d"(dst),\ - "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ - :"memory");\ - src8 += 2L*src8Stride;\ - src16 += 48;\ - dst += 2L*dstStride;\ - }while(h-=2);\ -}\ -static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, 
src8Stride, h);\ - OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ -}\ - - -#if ARCH_X86_64 -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=16;\ - __asm__ volatile(\ - "pxor %%xmm15, %%xmm15 \n\t"\ - "movdqa %6, %%xmm14 \n\t"\ - "movdqa %7, %%xmm13 \n\t"\ - "1: \n\t"\ - "lddqu 6(%0), %%xmm1 \n\t"\ - "lddqu -2(%0), %%xmm7 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm15, %%xmm1 \n\t"\ - "punpcklbw %%xmm15, %%xmm0 \n\t"\ - "punpcklbw %%xmm15, %%xmm7 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm0, %%xmm6 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm0, %%xmm8 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm0, %%xmm9 \n\t"\ - "movdqa %%xmm0, %%xmm12 \n\t"\ - "movdqa %%xmm1, %%xmm11 \n\t"\ - "palignr $10,%%xmm0, %%xmm11\n\t"\ - "palignr $10,%%xmm7, %%xmm12\n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $2, %%xmm7, %%xmm9 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $4, %%xmm7, %%xmm8 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $6, %%xmm7, %%xmm6 \n\t"\ - "paddw %%xmm0 ,%%xmm11 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $8, %%xmm7, %%xmm0 \n\t"\ - "paddw %%xmm12,%%xmm7 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm8, %%xmm6 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm9, %%xmm0 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "psllw $2, %%xmm6 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "psubw %%xmm0, %%xmm6 \n\t"\ - "paddw %%xmm13,%%xmm11 \n\t"\ - "paddw %%xmm13,%%xmm7 \n\t"\ - "pmullw %%xmm14,%%xmm2 \n\t"\ - "pmullw %%xmm14,%%xmm6 \n\t"\ - "lddqu (%2), %%xmm3 \n\t"\ - "paddw %%xmm11,%%xmm2 \n\t"\ - "paddw %%xmm7, %%xmm6 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "psraw $5, %%xmm6 \n\t"\ - "packuswb %%xmm2,%%xmm6 \n\t"\ - "pavgb %%xmm3, %%xmm6 \n\t"\ - OP(%%xmm6, (%1), %%xmm4, dqa)\ - "add %5, %0 \n\t"\ - "add %5, %1 
\n\t"\ - "add %4, %2 \n\t"\ - "decl %3 \n\t"\ - "jg 1b \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ -} -#else // ARCH_X86_64 -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ - src += 8*dstStride;\ - dst += 8*dstStride;\ - src2 += 8*src2Stride;\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ -} -#endif // ARCH_X86_64 - -#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa %0, %%xmm6 \n\t"\ - :: "m"(ff_pw_5)\ - );\ - do{\ - __asm__ volatile(\ - "lddqu -2(%0), %%xmm1 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $10,%%xmm0, %%xmm5 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "movq (%2), %%xmm3 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw %5, %%xmm0 \n\t"\ - "pmullw %%xmm6, %%xmm2 \n\t"\ - "paddw %%xmm0, %%xmm2 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "packuswb %%xmm2, %%xmm2 \n\t"\ - 
"pavgb %%xmm3, %%xmm2 \n\t"\ - OP(%%xmm2, (%1), %%xmm4, q)\ - "add %4, %0 \n\t"\ - "add %4, %1 \n\t"\ - "add %3, %2 \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_16)\ - : "memory"\ - );\ - }while(--h);\ -}\ -QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa %5, %%xmm6 \n\t"\ - "1: \n\t"\ - "lddqu -2(%0), %%xmm1 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $10,%%xmm0, %%xmm5 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw %6, %%xmm0 \n\t"\ - "pmullw %%xmm6, %%xmm2 \n\t"\ - "paddw %%xmm0, %%xmm2 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "packuswb %%xmm2, %%xmm2 \n\t"\ - OP(%%xmm2, (%1), %%xmm4, q)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ -}\ -static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, 
dstStride, srcStride);\ -}\ - -#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - src -= 2*srcStride;\ - \ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movq (%0), %%xmm0 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm1 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm2 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm3 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "punpcklbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm2 \n\t"\ - "punpcklbw %%xmm7, %%xmm3 \n\t"\ - "punpcklbw %%xmm7, %%xmm4 \n\t"\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - if(h==16){\ - __asm__ volatile(\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - \ - 
: "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - }\ -}\ -static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -} - -static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ - int w = (size+8)>>3; - src -= 2*srcStride+2; - while(w--){ - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "movq (%0), %%xmm0 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm1 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm2 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm3 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm4 \n\t" - "add %2, %0 \n\t" - "punpcklbw %%xmm7, %%xmm0 \n\t" - "punpcklbw %%xmm7, %%xmm1 \n\t" - "punpcklbw %%xmm7, %%xmm2 \n\t" - "punpcklbw %%xmm7, %%xmm3 \n\t" - "punpcklbw %%xmm7, %%xmm4 \n\t" - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) - : "+a"(src) - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) - : "memory" - ); - if(size==16){ - __asm__ volatile( 
- QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) - : "+a"(src) - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) - : "memory" - ); - } - tmp += 8; - src += 8 - (size+5)*srcStride; - } -} - -#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ - int h = size;\ - if(size == 16){\ - __asm__ volatile(\ - "1: \n\t"\ - "movdqa 32(%0), %%xmm4 \n\t"\ - "movdqa 16(%0), %%xmm5 \n\t"\ - "movdqa (%0), %%xmm7 \n\t"\ - "movdqa %%xmm4, %%xmm3 \n\t"\ - "movdqa %%xmm4, %%xmm2 \n\t"\ - "movdqa %%xmm4, %%xmm1 \n\t"\ - "movdqa %%xmm4, %%xmm0 \n\t"\ - "palignr $10, %%xmm5, %%xmm0 \n\t"\ - "palignr $8, %%xmm5, %%xmm1 \n\t"\ - "palignr $6, %%xmm5, %%xmm2 \n\t"\ - "palignr $4, %%xmm5, %%xmm3 \n\t"\ - "palignr $2, %%xmm5, %%xmm4 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "movdqa %%xmm5, %%xmm6 \n\t"\ - "movdqa %%xmm5, %%xmm4 \n\t"\ - "movdqa %%xmm5, %%xmm3 \n\t"\ - "palignr $8, %%xmm7, %%xmm4 \n\t"\ - "palignr $2, %%xmm7, %%xmm6 \n\t"\ - "palignr $10, %%xmm7, %%xmm3 \n\t"\ - "paddw %%xmm6, %%xmm4 \n\t"\ - "movdqa %%xmm5, %%xmm6 \n\t"\ - "palignr $6, %%xmm7, %%xmm5 \n\t"\ - "palignr $4, %%xmm7, %%xmm6 \n\t"\ - "paddw %%xmm7, %%xmm3 \n\t"\ - "paddw %%xmm6, %%xmm5 \n\t"\ - \ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psubw %%xmm4, %%xmm3 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ 
- "psraw $2, %%xmm3 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psubw %%xmm4, %%xmm3 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "paddw %%xmm5, %%xmm3 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psraw $2, %%xmm3 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "paddw %%xmm5, %%xmm3 \n\t"\ - "psraw $6, %%xmm0 \n\t"\ - "psraw $6, %%xmm3 \n\t"\ - "packuswb %%xmm0, %%xmm3 \n\t"\ - OP(%%xmm3, (%1), %%xmm7, dqa)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - }else{\ - __asm__ volatile(\ - "1: \n\t"\ - "movdqa 16(%0), %%xmm1 \n\t"\ - "movdqa (%0), %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $10, %%xmm0, %%xmm5 \n\t"\ - "palignr $8, %%xmm0, %%xmm4 \n\t"\ - "palignr $6, %%xmm0, %%xmm3 \n\t"\ - "palignr $4, %%xmm0, %%xmm2 \n\t"\ - "palignr $2, %%xmm0, %%xmm1 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "psraw $6, %%xmm0 \n\t"\ - "packuswb %%xmm0, %%xmm0 \n\t"\ - OP(%%xmm0, (%1), %%xmm7, q)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - }\ -} - -#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ - OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ -}\ -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, 
int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ -}\ -static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ -}\ - -#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 -#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 -#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 -#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 -#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 -#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 -#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 -#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 - -#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 -#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 -#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 -#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2 -#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 -#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 -#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 -#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 - -#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 -#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 -#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2 -#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 - -#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 -#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 -#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 -#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 - -#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 -#define 
avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 - -#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ -H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ - -// static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ -// put_pixels16_sse2(dst, src, stride, 16); -// } -// static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ -// avg_pixels16_sse2(dst, src, stride, 16); -// } -#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 -#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 - -#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ -}\ - -#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ -}\ - -#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE 
## _v_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ -}\ - -#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ - OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ -}\ -\ -static 
void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ -}\ - -#define H264_MC_4816(MMX)\ -H264_MC(put_, 4, MMX, 8)\ 
-H264_MC(put_, 8, MMX, 8)\ -H264_MC(put_, 16,MMX, 8)\ -H264_MC(avg_, 4, MMX, 8)\ -H264_MC(avg_, 8, MMX, 8)\ -H264_MC(avg_, 16,MMX, 8)\ - -#define H264_MC_816(QPEL, XMM)\ -QPEL(put_, 8, XMM, 16)\ -QPEL(put_, 16,XMM, 16)\ -QPEL(avg_, 8, XMM, 16)\ -QPEL(avg_, 16,XMM, 16)\ - - -#define AVG_3DNOW_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgusb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" -#define AVG_MMX2_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" - -///this does not get detected correctly, uncomment on AMD machine -#ifdef HAVE_AMD3DNOW -#define PAVGB "pavgusb" -//QPEL_H264(put_, PUT_OP, 3dnow) -//QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) -#undef PAVGB -#endif - -#define PAVGB "pavgb" -QPEL_H264(put_, PUT_OP, mmx2) -QPEL_H264(avg_, AVG_MMX2_OP, mmx2) -QPEL_H264_V_XMM(put_, PUT_OP, sse2) -QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) -QPEL_H264_HV_XMM(put_, PUT_OP, sse2) -QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) -#if HAVE_SSSE3 -QPEL_H264_H_XMM(put_, PUT_OP, ssse3) -QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) -QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) -QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) -QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) -QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) -#endif -#undef PAVGB - -H264_MC_816(H264_MC_V, sse2) -H264_MC_816(H264_MC_HV, sse2) -#if HAVE_SSSE3 -H264_MC_816(H264_MC_H, ssse3) -H264_MC_816(H264_MC_HV, ssse3) -#endif - -/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */ -DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = { - 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL -}; - -#if HAVE_SSSE3 -#define AVG_OP(X) -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3 -#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3 -#define H264_CHROMA_MC8_MV0 put_pixels8_mmx -#include "dsputil_h264_template_ssse3.c" 
-static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); -} - -#undef AVG_OP -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC8_MV0 -#define AVG_OP(X) X -#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3 -#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3 -#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 -#include "dsputil_h264_template_ssse3.c" -static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); -} -#undef AVG_OP -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC8_MV0 -#endif - -/***********************************/ -/* weighted prediction */ - -static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) -{ - int x, y; - offset <<= log2_denom; - offset += (1 << log2_denom) >> 1; - __asm__ volatile( - "movd %0, %%mm4 \n\t" - "movd %1, %%mm5 \n\t" - "movd %2, %%mm6 \n\t" - "pshufw $0, %%mm4, %%mm4 \n\t" - "pshufw $0, %%mm5, %%mm5 \n\t" - "pxor %%mm7, %%mm7 \n\t" - :: "g"(weight), "g"(offset), "g"(log2_denom) - ); - for(y=0; y et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_MATHOPS_H -#define AVCODEC_X86_MATHOPS_H - -#include "config.h" -#include "libavutil/common.h" - -#if ARCH_X86_32 -#define MULL(ra, rb, shift) \ - ({ int rt, dummy; __asm__ (\ - "imull %3 \n\t"\ - "shrdl %4, %%edx, %%eax \n\t"\ - : "=a"(rt), "=d"(dummy)\ - : "a" ((int)ra), "rm" ((int)rb), "i"(shift));\ - rt; }) - -#define MULH(ra, rb) \ - ({ int rt, dummy;\ - __asm__ ("imull %3\n\t" : "=d"(rt), "=a"(dummy): "a" ((int)ra), "rm" ((int)rb));\ - rt; }) - -#define MUL64(ra, rb) \ - ({ int64_t rt;\ - __asm__ ("imull %2\n\t" : "=A"(rt) : "a" ((int)ra), "g" ((int)rb));\ - rt; }) -#endif - -// avoid +32 for shift optimization (gcc should do that ...) -#define NEG_SSR32 NEG_SSR32 -static inline int32_t NEG_SSR32( int32_t a, int8_t s){ - __asm__ ("sarl %1, %0\n\t" - : "+r" (a) - : "ic" ((uint8_t)(-s)) - ); - return a; -} - -#define NEG_USR32 NEG_USR32 -static inline uint32_t NEG_USR32(uint32_t a, int8_t s){ - __asm__ ("shrl %1, %0\n\t" - : "+r" (a) - : "ic" ((uint8_t)(-s)) - ); - return a; -} - -#endif /* AVCODEC_X86_MATHOPS_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavcodec/x86/mmx.h --- a/ffmpeg_smp/h264dec/libavcodec/x86/mmx.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,267 +0,0 @@ -/* - * mmx.h - * Copyright (C) 1997-2001 H. Dietz and R. Fisher - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef AVCODEC_X86_MMX_H -#define AVCODEC_X86_MMX_H - -#warning Everything in this header is deprecated, use plain __asm__()! New code using this header will be rejected. - - -#define mmx_i2r(op,imm,reg) \ - __asm__ volatile (#op " %0, %%" #reg \ - : /* nothing */ \ - : "i" (imm) ) - -#define mmx_m2r(op,mem,reg) \ - __asm__ volatile (#op " %0, %%" #reg \ - : /* nothing */ \ - : "m" (mem)) - -#define mmx_r2m(op,reg,mem) \ - __asm__ volatile (#op " %%" #reg ", %0" \ - : "=m" (mem) \ - : /* nothing */ ) - -#define mmx_r2r(op,regs,regd) \ - __asm__ volatile (#op " %" #regs ", %" #regd) - - -#define emms() __asm__ volatile ("emms") - -#define movd_m2r(var,reg) mmx_m2r (movd, var, reg) -#define movd_r2m(reg,var) mmx_r2m (movd, reg, var) -#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd) - -#define movq_m2r(var,reg) mmx_m2r (movq, var, reg) -#define movq_r2m(reg,var) mmx_r2m (movq, reg, var) -#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd) - -#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg) -#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd) -#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg) -#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd) - -#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg) -#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd) - -#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg) -#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd) -#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg) -#define 
paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd) -#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg) -#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd) - -#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg) -#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd) -#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg) -#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd) - -#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg) -#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd) -#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg) -#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd) - -#define pand_m2r(var,reg) mmx_m2r (pand, var, reg) -#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd) - -#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg) -#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd) - -#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg) -#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd) -#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg) -#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd) -#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg) -#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd) - -#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg) -#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd) -#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg) -#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd) -#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg) -#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd) - -#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg) -#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd) - -#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg) -#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd) - -#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg) -#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd) - -#define por_m2r(var,reg) mmx_m2r (por, var, reg) 
-#define por_r2r(regs,regd) mmx_r2r (por, regs, regd) - -#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg) -#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg) -#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd) -#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg) -#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg) -#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd) -#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg) -#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg) -#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd) - -#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg) -#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg) -#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd) -#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg) -#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg) -#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd) - -#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg) -#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg) -#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd) -#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg) -#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg) -#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd) -#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg) -#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg) -#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd) - -#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg) -#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd) -#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg) -#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd) -#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg) -#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd) - -#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg) -#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd) -#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg) -#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd) - -#define psubusb_m2r(var,reg) mmx_m2r 
(psubusb, var, reg) -#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd) -#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg) -#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd) - -#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg) -#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd) -#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg) -#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd) -#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg) -#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd) - -#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg) -#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd) -#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg) -#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd) -#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg) -#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd) - -#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg) -#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd) - - -/* 3DNOW extensions */ - -#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg) -#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd) - - -/* AMD MMX extensions - also available in intel SSE */ - - -#define mmx_m2ri(op,mem,reg,imm) \ - __asm__ volatile (#op " %1, %0, %%" #reg \ - : /* nothing */ \ - : "m" (mem), "i" (imm)) -#define mmx_r2ri(op,regs,regd,imm) \ - __asm__ volatile (#op " %0, %%" #regs ", %%" #regd \ - : /* nothing */ \ - : "i" (imm) ) - -#define mmx_fetch(mem,hint) \ - __asm__ volatile ("prefetch" #hint " %0" \ - : /* nothing */ \ - : "m" (mem)) - - -#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg) - -#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var) - -#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg) -#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd) -#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg) -#define pavgw_r2r(regs,regd) 
mmx_r2r (pavgw, regs, regd) - -#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm) - -#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm) - -#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg) -#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd) - -#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg) -#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd) - -#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg) -#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd) - -#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg) -#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd) - -#define pmovmskb(mmreg,reg) \ - __asm__ volatile ("movmskps %" #mmreg ", %" #reg) - -#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg) -#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd) - -#define prefetcht0(mem) mmx_fetch (mem, t0) -#define prefetcht1(mem) mmx_fetch (mem, t1) -#define prefetcht2(mem) mmx_fetch (mem, t2) -#define prefetchnta(mem) mmx_fetch (mem, nta) - -#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg) -#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd) - -#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm) -#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm) - -#define sfence() __asm__ volatile ("sfence\n\t") - -/* SSE2 */ -#define pshufhw_m2r(var,reg,imm) mmx_m2ri(pshufhw, var, reg, imm) -#define pshufhw_r2r(regs,regd,imm) mmx_r2ri(pshufhw, regs, regd, imm) -#define pshuflw_m2r(var,reg,imm) mmx_m2ri(pshuflw, var, reg, imm) -#define pshuflw_r2r(regs,regd,imm) mmx_r2ri(pshuflw, regs, regd, imm) - -#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm) - -#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg) -#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var) -#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd) -#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg) -#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var) -#define 
movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd) - -#define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var) - -#define pslldq_i2r(imm,reg) mmx_i2r (pslldq, imm, reg) -#define psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg) - -#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd) -#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd) - - -#endif /* AVCODEC_X86_MMX_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/arm/bswap.h --- a/ffmpeg_smp/h264dec/libavutil/arm/bswap.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_ARM_BSWAP_H -#define AVUTIL_ARM_BSWAP_H - -#include -#include "config.h" -#include "libavutil/attributes.h" - -#ifdef __ARMCC_VERSION - -#if HAVE_ARMV6 -#define bswap_16 bswap_16 -static av_always_inline av_const unsigned bswap_16(unsigned x) -{ - __asm { rev16 x, x } - return x; -} - -#define bswap_32 bswap_32 -static av_always_inline av_const uint32_t bswap_32(uint32_t x) -{ - return __rev(x); -} -#endif /* HAVE_ARMV6 */ - -#elif HAVE_INLINE_ASM - -#if HAVE_ARMV6 -#define bswap_16 bswap_16 -static av_always_inline av_const unsigned bswap_16(unsigned x) -{ - __asm__("rev16 %0, %0" : "+r"(x)); - return x; -} -#endif - -#define bswap_32 bswap_32 -static av_always_inline av_const uint32_t bswap_32(uint32_t x) -{ -#if HAVE_ARMV6 - __asm__("rev %0, %0" : "+r"(x)); -#else - uint32_t t; - __asm__ ("eor %1, %0, %0, ror #16 \n\t" - "bic %1, %1, #0xFF0000 \n\t" - "mov %0, %0, ror #8 \n\t" - "eor %0, %0, %1, lsr #8 \n\t" - : "+r"(x), "=&r"(t)); -#endif /* HAVE_ARMV6 */ - return x; -} - -#endif /* __ARMCC_VERSION */ - -#endif /* AVUTIL_ARM_BSWAP_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/arm/intreadwrite.h --- a/ffmpeg_smp/h264dec/libavutil/arm/intreadwrite.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_ARM_INTREADWRITE_H -#define AVUTIL_ARM_INTREADWRITE_H - -#include -#include "config.h" - -#if HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM - -#define AV_RN16 AV_RN16 -static av_always_inline uint16_t AV_RN16(const void *p) -{ - uint16_t v; - __asm__ ("ldrh %0, %1" : "=r"(v) : "m"(*(const uint16_t *)p)); - return v; -} - -#define AV_WN16 AV_WN16 -static av_always_inline void AV_WN16(void *p, uint16_t v) -{ - __asm__ ("strh %1, %0" : "=m"(*(uint16_t *)p) : "r"(v)); -} - -#define AV_RN32 AV_RN32 -static av_always_inline uint32_t AV_RN32(const void *p) -{ - uint32_t v; - __asm__ ("ldr %0, %1" : "=r"(v) : "m"(*(const uint32_t *)p)); - return v; -} - -#define AV_WN32 AV_WN32 -static av_always_inline void AV_WN32(void *p, uint32_t v) -{ - __asm__ ("str %1, %0" : "=m"(*(uint32_t *)p) : "r"(v)); -} - -#define AV_RN64 AV_RN64 -static av_always_inline uint64_t AV_RN64(const void *p) -{ - union { uint64_t v; uint32_t hl[2]; } v; - __asm__ ("ldr %0, %2 \n\t" - "ldr %1, %3 \n\t" - : "=&r"(v.hl[0]), "=r"(v.hl[1]) - : "m"(*(const uint32_t*)p), "m"(*((const uint32_t*)p+1))); - return v.v; -} - -#define AV_WN64 AV_WN64 -static av_always_inline void AV_WN64(void *p, uint64_t v) -{ - union { uint64_t v; uint32_t hl[2]; } vv = { v }; - __asm__ ("str %2, %0 \n\t" - "str %3, %1 \n\t" - : "=m"(*(uint32_t*)p), "=m"(*((uint32_t*)p+1)) - : "r"(vv.hl[0]), "r"(vv.hl[1])); -} - -#endif /* HAVE_INLINE_ASM */ - -#endif /* AVUTIL_ARM_INTREADWRITE_H */ diff -r 11d15c47beaf -r 897f711a7157 
ffmpeg_smp/h264dec/libavutil/arm/timer.h --- a/ffmpeg_smp/h264dec/libavutil/arm/timer.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_ARM_TIMER_H -#define AVUTIL_ARM_TIMER_H - -#include -#include "config.h" - -#if HAVE_INLINE_ASM && defined(__ARM_ARCH_7A__) - -#define AV_READ_TIME read_time - -static inline uint64_t read_time(void) -{ - unsigned cc; - __asm__ volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r"(cc)); - return cc; -} - -#endif /* HAVE_INLINE_ASM && __ARM_ARCH_7A__ */ - -#endif /* AVUTIL_ARM_TIMER_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/attributes.h --- a/ffmpeg_smp/h264dec/libavutil/attributes.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ -/* - * copyright (c) 2006 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * Macro definitions for various function/variable attributes - */ - -#ifndef AVUTIL_ATTRIBUTES_H -#define AVUTIL_ATTRIBUTES_H - -#ifdef __GNUC__ -# define AV_GCC_VERSION_AT_LEAST(x,y) (__GNUC__ > x || __GNUC__ == x && __GNUC_MINOR__ >= y) -#else -# define AV_GCC_VERSION_AT_LEAST(x,y) 0 -#endif - -#ifndef av_always_inline -#if AV_GCC_VERSION_AT_LEAST(3,1) -# define av_always_inline __attribute__((always_inline)) inline -#else -# define av_always_inline inline -#endif -#endif - -#ifndef av_noinline -#if AV_GCC_VERSION_AT_LEAST(3,1) -# define av_noinline __attribute__((noinline)) -#else -# define av_noinline -#endif -#endif - -#ifndef av_pure -#if AV_GCC_VERSION_AT_LEAST(3,1) -# define av_pure __attribute__((pure)) -#else -# define av_pure -#endif -#endif - -#ifndef av_const -#if AV_GCC_VERSION_AT_LEAST(2,6) -# define av_const __attribute__((const)) -#else -# define av_const -#endif -#endif - -#ifndef av_cold -#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,3) -# define av_cold __attribute__((cold)) -#else -# define av_cold -#endif -#endif - -#ifndef av_flatten -#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,1) -# define av_flatten __attribute__((flatten)) -#else -# define av_flatten -#endif -#endif - -#ifndef attribute_deprecated -#if AV_GCC_VERSION_AT_LEAST(3,1) -# define attribute_deprecated __attribute__((deprecated)) -#else -# define attribute_deprecated -#endif -#endif - -#ifndef av_unused -#if defined(__GNUC__) -# define av_unused 
__attribute__((unused)) -#else -# define av_unused -#endif -#endif - -#ifndef av_uninit -#if defined(__GNUC__) && !defined(__ICC) -# define av_uninit(x) x=x -#else -# define av_uninit(x) x -#endif -#endif - -#ifdef __GNUC__ -# define av_builtin_constant_p __builtin_constant_p -#else -# define av_builtin_constant_p(x) 0 -#endif - -#endif /* AVUTIL_ATTRIBUTES_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/bswap.h --- a/ffmpeg_smp/h264dec/libavutil/bswap.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,95 +0,0 @@ -/* - * copyright (c) 2006 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * byte swapping routines - */ - -#ifndef AVUTIL_BSWAP_H -#define AVUTIL_BSWAP_H - -#include -#include "config.h" -#include "attributes.h" - -#if ARCH_ARM -# include "arm/bswap.h" -#elif ARCH_X86 -# include "x86/bswap.h" -#endif - -#ifndef bswap_16 -static av_always_inline av_const uint16_t bswap_16(uint16_t x) -{ - x= (x>>8) | (x<<8); - return x; -} -#endif - -#ifndef bswap_32 -static av_always_inline av_const uint32_t bswap_32(uint32_t x) -{ - x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF); - x= (x>>16) | (x<<16); - return x; -} -#endif - -#ifndef bswap_64 -static inline uint64_t av_const bswap_64(uint64_t x) -{ -#if 0 - x= ((x<< 8)&0xFF00FF00FF00FF00ULL) | ((x>> 8)&0x00FF00FF00FF00FFULL); - x= ((x<<16)&0xFFFF0000FFFF0000ULL) | ((x>>16)&0x0000FFFF0000FFFFULL); - return (x>>32) | (x<<32); -#else - union { - uint64_t ll; - uint32_t l[2]; - } w, r; - w.ll = x; - r.l[0] = bswap_32 (w.l[1]); - r.l[1] = bswap_32 (w.l[0]); - return r.ll; -#endif -} -#endif - -// be2me ... big-endian to machine-endian -// le2me ... 
little-endian to machine-endian - -#if HAVE_BIGENDIAN -#define be2me_16(x) (x) -#define be2me_32(x) (x) -#define be2me_64(x) (x) -#define le2me_16(x) bswap_16(x) -#define le2me_32(x) bswap_32(x) -#define le2me_64(x) bswap_64(x) -#else -#define be2me_16(x) bswap_16(x) -#define be2me_32(x) bswap_32(x) -#define be2me_64(x) bswap_64(x) -#define le2me_16(x) (x) -#define le2me_32(x) (x) -#define le2me_64(x) (x) -#endif - -#endif /* AVUTIL_BSWAP_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/common.h --- a/ffmpeg_smp/h264dec/libavutil/common.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,298 +0,0 @@ -/* - * copyright (c) 2006 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * common internal and external API header - */ - -#ifndef AVUTIL_COMMON_H -#define AVUTIL_COMMON_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include "attributes.h" - -//rounded division & shift -#define RSHIFT(a,b) ((a) > 0 ? ((a) + ((1<<(b))>>1))>>(b) : ((a) + ((1<<(b))>>1)-1)>>(b)) -/* assume b>0 */ -#define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b)) -#define FFABS(a) ((a) >= 0 ? (a) : (-(a))) -#define FFSIGN(a) ((a) > 0 ? 
1 : -1) - -#define FFMAX(a,b) ((a) > (b) ? (a) : (b)) -#define FFMAX3(a,b,c) FFMAX(FFMAX(a,b),c) -#define FFMIN(a,b) ((a) > (b) ? (b) : (a)) -#define FFMIN3(a,b,c) FFMIN(FFMIN(a,b),c) - -#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0) -#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0])) -#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1)) - -/* misc math functions */ -extern const uint8_t ff_log2_tab[256]; - -static inline av_const int av_log2_c(unsigned int v) -{ - int n = 0; - if (v & 0xffff0000) { - v >>= 16; - n += 16; - } - if (v & 0xff00) { - v >>= 8; - n += 8; - } - n += ff_log2_tab[v]; - - return n; -} - -static inline av_const int av_log2_16bit_c(unsigned int v) -{ - int n = 0; - if (v & 0xff00) { - v >>= 8; - n += 8; - } - n += ff_log2_tab[v]; - - return n; -} - -#ifdef HAVE_AV_CONFIG_H -# include "config.h" -#endif - -/** - * Clips a signed integer value into the amin-amax range. - * @param a value to clip - * @param amin minimum value of the clip range - * @param amax maximum value of the clip range - * @return clipped value - */ -static inline av_const int av_clip(int a, int amin, int amax) -{ - if (a < amin) return amin; - else if (a > amax) return amax; - else return a; -} - -/** - * Clips a signed integer value into the 0-255 range. - * @param a value to clip - * @return clipped value - */ -static inline av_const uint8_t av_clip_uint8(int a) -{ - if (a&(~0xFF)) return (-a)>>31; - else return a; -} - -/** - * Clips a signed integer value into the 0-65535 range. - * @param a value to clip - * @return clipped value - */ -static inline av_const uint16_t av_clip_uint16(int a) -{ - if (a&(~0xFFFF)) return (-a)>>31; - else return a; -} - -/** - * Clips a signed integer value into the -32768,32767 range. 
- * @param a value to clip - * @return clipped value - */ -static inline av_const int16_t av_clip_int16(int a) -{ - if ((a+0x8000) & ~0xFFFF) return (a>>31) ^ 0x7FFF; - else return a; -} - -/** - * Clips a signed 64-bit integer value into the -2147483648,2147483647 range. - * @param a value to clip - * @return clipped value - */ -static inline av_const int32_t av_clipl_int32(int64_t a) -{ - if ((a+0x80000000u) & ~UINT64_C(0xFFFFFFFF)) return (a>>63) ^ 0x7FFFFFFF; - else return a; -} - -/** - * Clips a float value into the amin-amax range. - * @param a value to clip - * @param amin minimum value of the clip range - * @param amax maximum value of the clip range - * @return clipped value - */ -static inline av_const float av_clipf(float a, float amin, float amax) -{ - if (a < amin) return amin; - else if (a > amax) return amax; - else return a; -} - -/** Computes ceil(log2(x)). - * @param x value used to compute ceil(log2(x)) - * @return computed ceiling of log2(x) - */ -static inline av_const int av_ceil_log2(int x) -{ - return av_log2_c((x - 1) << 1); -} - -#define MKTAG(a,b,c,d) (a | (b << 8) | (c << 16) | (d << 24)) -#define MKBETAG(a,b,c,d) (d | (c << 8) | (b << 16) | (a << 24)) - -/*! - * \def GET_UTF8(val, GET_BYTE, ERROR) - * Converts a UTF-8 character (up to 4 bytes long) to its 32-bit UCS-4 encoded form - * \param val is the output and should be of type uint32_t. It holds the converted - * UCS-4 character and should be a left value. - * \param GET_BYTE gets UTF-8 encoded bytes from any proper source. It can be - * a function or a statement whose return value or evaluated value is of type - * uint8_t. It will be executed up to 4 times for values in the valid UTF-8 range, - * and up to 7 times in the general case. - * \param ERROR action that should be taken when an invalid UTF-8 byte is returned - * from GET_BYTE. It should be a statement that jumps out of the macro, - * like exit(), goto, return, break, or continue. 
- */ -#define GET_UTF8(val, GET_BYTE, ERROR)\ - val= GET_BYTE;\ - {\ - int ones= 7 - av_log2(val ^ 255);\ - if(ones==1)\ - ERROR\ - val&= 127>>ones;\ - while(--ones > 0){\ - int tmp= GET_BYTE - 128;\ - if(tmp>>6)\ - ERROR\ - val= (val<<6) + tmp;\ - }\ - } - -/*! - * \def GET_UTF16(val, GET_16BIT, ERROR) - * Converts a UTF-16 character (2 or 4 bytes) to its 32-bit UCS-4 encoded form - * \param val is the output and should be of type uint32_t. It holds the converted - * UCS-4 character and should be a left value. - * \param GET_16BIT gets two bytes of UTF-16 encoded data converted to native endianness. - * It can be a function or a statement whose return value or evaluated value is of type - * uint16_t. It will be executed up to 2 times. - * \param ERROR action that should be taken when an invalid UTF-16 surrogate is - * returned from GET_BYTE. It should be a statement that jumps out of the macro, - * like exit(), goto, return, break, or continue. - */ -#define GET_UTF16(val, GET_16BIT, ERROR)\ - val = GET_16BIT;\ - {\ - unsigned int hi = val - 0xD800;\ - if (hi < 0x800) {\ - val = GET_16BIT - 0xDC00;\ - if (val > 0x3FFU || hi > 0x3FFU)\ - ERROR\ - val += (hi<<10) + 0x10000;\ - }\ - }\ - -/*! - * \def PUT_UTF8(val, tmp, PUT_BYTE) - * Converts a 32-bit Unicode character to its UTF-8 encoded form (up to 4 bytes long). - * \param val is an input-only argument and should be of type uint32_t. It holds - * a UCS-4 encoded Unicode character that is to be converted to UTF-8. If - * val is given as a function it is executed only once. - * \param tmp is a temporary variable and should be of type uint8_t. It - * represents an intermediate value during conversion that is to be - * output by PUT_BYTE. - * \param PUT_BYTE writes the converted UTF-8 bytes to any proper destination. - * It could be a function or a statement, and uses tmp as the input byte. 
- * For example, PUT_BYTE could be "*output++ = tmp;" PUT_BYTE will be - * executed up to 4 times for values in the valid UTF-8 range and up to - * 7 times in the general case, depending on the length of the converted - * Unicode character. - */ -#define PUT_UTF8(val, tmp, PUT_BYTE)\ - {\ - int bytes, shift;\ - uint32_t in = val;\ - if (in < 0x80) {\ - tmp = in;\ - PUT_BYTE\ - } else {\ - bytes = (av_log2(in) + 4) / 5;\ - shift = (bytes - 1) * 6;\ - tmp = (256 - (256 >> bytes)) | (in >> shift);\ - PUT_BYTE\ - while (shift >= 6) {\ - shift -= 6;\ - tmp = 0x80 | ((in >> shift) & 0x3f);\ - PUT_BYTE\ - }\ - }\ - } - -/*! - * \def PUT_UTF16(val, tmp, PUT_16BIT) - * Converts a 32-bit Unicode character to its UTF-16 encoded form (2 or 4 bytes). - * \param val is an input-only argument and should be of type uint32_t. It holds - * a UCS-4 encoded Unicode character that is to be converted to UTF-16. If - * val is given as a function it is executed only once. - * \param tmp is a temporary variable and should be of type uint16_t. It - * represents an intermediate value during conversion that is to be - * output by PUT_16BIT. - * \param PUT_16BIT writes the converted UTF-16 data to any proper destination - * in desired endianness. It could be a function or a statement, and uses tmp - * as the input byte. For example, PUT_BYTE could be "*output++ = tmp;" - * PUT_BYTE will be executed 1 or 2 times depending on input character. 
- */ -#define PUT_UTF16(val, tmp, PUT_16BIT)\ - {\ - uint32_t in = val;\ - if (in < 0x10000) {\ - tmp = in;\ - PUT_16BIT\ - } else {\ - tmp = 0xD800 | ((in - 0x10000) >> 10);\ - PUT_16BIT\ - tmp = 0xDC00 | ((in - 0x10000) & 0x3FF);\ - PUT_16BIT\ - }\ - }\ - - - -#include "mem.h" - -#ifdef HAVE_AV_CONFIG_H -# include "internal.h" -#endif /* HAVE_AV_CONFIG_H */ - -#endif /* AVUTIL_COMMON_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/error.h --- a/ffmpeg_smp/h264dec/libavutil/error.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * error code definitions - */ - -#ifndef AVUTIL_ERROR_H -#define AVUTIL_ERROR_H - -#include -#include "common.h" - -/* error handling */ -#if EDOM > 0 -#define AVERROR(e) (-(e)) ///< Returns a negative error code from a POSIX error code, to return from library functions. -#define AVUNERROR(e) (-(e)) ///< Returns a POSIX error code from a library function error return value. -#else -/* Some platforms have E* and errno already negated. 
*/ -#define AVERROR(e) (e) -#define AVUNERROR(e) (e) -#endif - -#define AVERROR_EOF AVERROR(EPIPE) ///< End of file - - -/** - * Puts a description of the AVERROR code errnum in errbuf. - * In case of failure the global variable errno is set to indicate the - * error. - * - * @param errbuf_size the size in bytes of errbuf - * @return 0 on success, a negative value otherwise - */ -int av_strerror(int errnum, char *errbuf, size_t errbuf_size); - -#endif /* AVUTIL_ERROR_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/internal.h --- a/ffmpeg_smp/h264dec/libavutil/internal.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,168 +0,0 @@ -/* - * copyright (c) 2006 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * common internal API header - */ - -#ifndef AVUTIL_INTERNAL_H -#define AVUTIL_INTERNAL_H - -#if !defined(DEBUG) && !defined(NDEBUG) -# define NDEBUG -#endif - -#include -#include -#include -#include -#include "config.h" -#include "attributes.h" -#include "timer.h" - - - -#ifndef INT16_MIN -#define INT16_MIN (-0x7fff - 1) -#endif - -#ifndef INT16_MAX -#define INT16_MAX 0x7fff -#endif - -#ifndef INT32_MIN -#define INT32_MIN (-0x7fffffff - 1) -#endif - -#ifndef INT32_MAX -#define INT32_MAX 0x7fffffff -#endif - -#ifndef UINT32_MAX -#define UINT32_MAX 0xffffffff -#endif - -#ifndef INT64_MIN -#define INT64_MIN (-0x7fffffffffffffffLL - 1) -#endif - -#ifndef INT64_MAX -#define INT64_MAX INT64_C(9223372036854775807) -#endif - -#ifndef UINT64_MAX -#define UINT64_MAX UINT64_C(0xFFFFFFFFFFFFFFFF) -#endif - -#ifndef INT_BIT -# define INT_BIT (CHAR_BIT * sizeof(int)) -#endif - -#ifndef offsetof -# define offsetof(T, F) ((unsigned int)((char *)&((T *)0)->F)) -#endif - -/* Use to export labels from asm. */ -#define LABEL_MANGLE(a) #a -#define LOCAL_MANGLE(a) #a -#define MANGLE(a) #a - -// Use rip-relative addressing if compiling PIC code on x86-64. -// #if ARCH_X86_64 && defined(PIC) -// # define LOCAL_MANGLE(a) #a "(%%rip)" -// #else -// # define LOCAL_MANGLE(a) #a -// #endif -// -// #define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a) - -/* debug stuff */ - -/* dprintf macros */ -#ifdef DEBUG -# define dprintf(pctx, ...) av_log(pctx, AV_LOG_DEBUG, __VA_ARGS__) -#else -# define dprintf(pctx, ...) 
-#endif - -#define av_abort() do { av_log(NULL, AV_LOG_ERROR, "Abort at %s:%d\n", __FILE__, __LINE__); abort(); } while (0) - -/* math */ - - -/* avoid usage of dangerous/inappropriate system functions */ -// #undef malloc -// #define malloc please_use_av_malloc -// #undef free -// #define free please_use_av_free -#undef realloc -#define realloc please_use_av_realloc -#undef time -#define time time_is_forbidden_due_to_security_issues -#undef rand -#define rand rand_is_forbidden_due_to_state_trashing_use_av_lfg_get -#undef srand -#define srand srand_is_forbidden_due_to_state_trashing_use_av_lfg_init -#undef random -#define random random_is_forbidden_due_to_state_trashing_use_av_lfg_get -#undef sprintf -#define sprintf sprintf_is_forbidden_due_to_security_issues_use_snprintf -//#undef exit -//#define exit exit_is_forbidden -#ifndef LIBAVFORMAT_BUILD - -#undef puts -#define puts please_use_av_log_instead_of_puts -#undef perror -#define perror please_use_av_log_instead_of_perror -#endif - -#define FF_ALLOC_OR_GOTO(p, size, label)\ -{\ - p = av_malloc(size);\ - if (p == NULL && (size) != 0) {\ - av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\ - goto label;\ - }\ -} - -#define FF_ALLOCZ_OR_GOTO(p, size, label)\ -{\ - p = av_mallocz(size);\ - if (p == NULL && (size) != 0) {\ - av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\ - goto label;\ - }\ -} - - -/** - * Returns NULL if CONFIG_SMALL is true, otherwise the argument - * without modification. Used to disable the definition of strings - * (for example AVCodec long_names). - */ -#if CONFIG_SMALL -# define NULL_IF_CONFIG_SMALL(x) NULL -#else -# define NULL_IF_CONFIG_SMALL(x) x -#endif - -#endif /* AVUTIL_INTERNAL_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/intreadwrite.h --- a/ffmpeg_smp/h264dec/libavutil/intreadwrite.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,498 +0,0 @@ -/* - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_INTREADWRITE_H -#define AVUTIL_INTREADWRITE_H - -#include -#include "config.h" -#include "bswap.h" -#include "common.h" - -typedef union { - uint64_t u64; - uint32_t u32[2]; - uint16_t u16[4]; - uint8_t u8 [8]; - double f64; - float f32[2]; -} __attribute__((__may_alias__)) av_alias64; - -typedef union { - uint32_t u32; - uint16_t u16[2]; - uint8_t u8 [4]; - float f32; -} __attribute__((__may_alias__)) av_alias32; - -typedef union { - uint16_t u16; - uint8_t u8 [2]; -} __attribute__((__may_alias__)) av_alias16 ; - -/* - * Arch-specific headers can provide any combination of - * AV_[RW][BLN](16|24|32|64) and AV_(COPY|SWAP|ZERO)(64|128) macros. - * Preprocessor symbols must be defined, even if these are implemented - * as inline functions. - */ - -#if ARCH_ARM -# include "arm/intreadwrite.h" -#elif ARCH_PPC -# include "ppc/intreadwrite.h" -#elif ARCH_X86 -# include "x86/intreadwrite.h" -#endif - -/* - * Map AV_RNXX <-> AV_R[BL]XX for all variants provided by per-arch headers. 
- */ - -#if HAVE_BIGENDIAN - -# if defined(AV_RN16) && !defined(AV_RB16) -# define AV_RB16(p) AV_RN16(p) -# elif !defined(AV_RN16) && defined(AV_RB16) -# define AV_RN16(p) AV_RB16(p) -# endif - -# if defined(AV_WN16) && !defined(AV_WB16) -# define AV_WB16(p, v) AV_WN16(p, v) -# elif !defined(AV_WN16) && defined(AV_WB16) -# define AV_WN16(p, v) AV_WB16(p, v) -# endif - -# if defined(AV_RN24) && !defined(AV_RB24) -# define AV_RB24(p) AV_RN24(p) -# elif !defined(AV_RN24) && defined(AV_RB24) -# define AV_RN24(p) AV_RB24(p) -# endif - -# if defined(AV_WN24) && !defined(AV_WB24) -# define AV_WB24(p, v) AV_WN24(p, v) -# elif !defined(AV_WN24) && defined(AV_WB24) -# define AV_WN24(p, v) AV_WB24(p, v) -# endif - -# if defined(AV_RN32) && !defined(AV_RB32) -# define AV_RB32(p) AV_RN32(p) -# elif !defined(AV_RN32) && defined(AV_RB32) -# define AV_RN32(p) AV_RB32(p) -# endif - -# if defined(AV_WN32) && !defined(AV_WB32) -# define AV_WB32(p, v) AV_WN32(p, v) -# elif !defined(AV_WN32) && defined(AV_WB32) -# define AV_WN32(p, v) AV_WB32(p, v) -# endif - -# if defined(AV_RN64) && !defined(AV_RB64) -# define AV_RB64(p) AV_RN64(p) -# elif !defined(AV_RN64) && defined(AV_RB64) -# define AV_RN64(p) AV_RB64(p) -# endif - -# if defined(AV_WN64) && !defined(AV_WB64) -# define AV_WB64(p, v) AV_WN64(p, v) -# elif !defined(AV_WN64) && defined(AV_WB64) -# define AV_WN64(p, v) AV_WB64(p, v) -# endif - -#else /* HAVE_BIGENDIAN */ - -# if defined(AV_RN16) && !defined(AV_RL16) -# define AV_RL16(p) AV_RN16(p) -# elif !defined(AV_RN16) && defined(AV_RL16) -# define AV_RN16(p) AV_RL16(p) -# endif - -# if defined(AV_WN16) && !defined(AV_WL16) -# define AV_WL16(p, v) AV_WN16(p, v) -# elif !defined(AV_WN16) && defined(AV_WL16) -# define AV_WN16(p, v) AV_WL16(p, v) -# endif - -# if defined(AV_RN24) && !defined(AV_RL24) -# define AV_RL24(p) AV_RN24(p) -# elif !defined(AV_RN24) && defined(AV_RL24) -# define AV_RN24(p) AV_RL24(p) -# endif - -# if defined(AV_WN24) && !defined(AV_WL24) -# define AV_WL24(p, 
v) AV_WN24(p, v) -# elif !defined(AV_WN24) && defined(AV_WL24) -# define AV_WN24(p, v) AV_WL24(p, v) -# endif - -# if defined(AV_RN32) && !defined(AV_RL32) -# define AV_RL32(p) AV_RN32(p) -# elif !defined(AV_RN32) && defined(AV_RL32) -# define AV_RN32(p) AV_RL32(p) -# endif - -# if defined(AV_WN32) && !defined(AV_WL32) -# define AV_WL32(p, v) AV_WN32(p, v) -# elif !defined(AV_WN32) && defined(AV_WL32) -# define AV_WN32(p, v) AV_WL32(p, v) -# endif - -# if defined(AV_RN64) && !defined(AV_RL64) -# define AV_RL64(p) AV_RN64(p) -# elif !defined(AV_RN64) && defined(AV_RL64) -# define AV_RN64(p) AV_RL64(p) -# endif - -# if defined(AV_WN64) && !defined(AV_WL64) -# define AV_WL64(p, v) AV_WN64(p, v) -# elif !defined(AV_WN64) && defined(AV_WL64) -# define AV_WN64(p, v) AV_WL64(p, v) -# endif - -#endif /* !HAVE_BIGENDIAN */ - -/* - * Define AV_[RW]N helper macros to simplify definitions not provided - * by per-arch headers. - */ - - - -#if defined(__DECC) - -# define AV_RN(s, p) (*((const __unaligned uint##s##_t*)(p))) -# define AV_WN(s, p, v) (*((__unaligned uint##s##_t*)(p)) = (v)) - -#else - -#ifndef AV_RB16 -# define AV_RB16(x) \ - ((((const uint8_t*)(x))[0] << 8) | \ - ((const uint8_t*)(x))[1]) -#endif -#ifndef AV_WB16 -# define AV_WB16(p, d) do { \ - ((uint8_t*)(p))[1] = (d); \ - ((uint8_t*)(p))[0] = (d)>>8; \ - } while(0) -#endif - -#ifndef AV_RL16 -# define AV_RL16(x) \ - ((((const uint8_t*)(x))[1] << 8) | \ - ((const uint8_t*)(x))[0]) -#endif -#ifndef AV_WL16 -# define AV_WL16(p, d) do { \ - ((uint8_t*)(p))[0] = (d); \ - ((uint8_t*)(p))[1] = (d)>>8; \ - } while(0) -#endif - -#ifndef AV_RB32 -# define AV_RB32(x) \ - ((((const uint8_t*)(x))[0] << 24) | \ - (((const uint8_t*)(x))[1] << 16) | \ - (((const uint8_t*)(x))[2] << 8) | \ - ((const uint8_t*)(x))[3]) -#endif -#ifndef AV_WB32 -# define AV_WB32(p, d) do { \ - ((uint8_t*)(p))[3] = (d); \ - ((uint8_t*)(p))[2] = (d)>>8; \ - ((uint8_t*)(p))[1] = (d)>>16; \ - ((uint8_t*)(p))[0] = (d)>>24; \ - } while(0) -#endif - 
-#ifndef AV_RL32 -# define AV_RL32(x) \ - ((((const uint8_t*)(x))[3] << 24) | \ - (((const uint8_t*)(x))[2] << 16) | \ - (((const uint8_t*)(x))[1] << 8) | \ - ((const uint8_t*)(x))[0]) -#endif -#ifndef AV_WL32 -# define AV_WL32(p, d) do { \ - ((uint8_t*)(p))[0] = (d); \ - ((uint8_t*)(p))[1] = (d)>>8; \ - ((uint8_t*)(p))[2] = (d)>>16; \ - ((uint8_t*)(p))[3] = (d)>>24; \ - } while(0) -#endif - -#ifndef AV_RB64 -# define AV_RB64(x) \ - (((uint64_t)((const uint8_t*)(x))[0] << 56) | \ - ((uint64_t)((const uint8_t*)(x))[1] << 48) | \ - ((uint64_t)((const uint8_t*)(x))[2] << 40) | \ - ((uint64_t)((const uint8_t*)(x))[3] << 32) | \ - ((uint64_t)((const uint8_t*)(x))[4] << 24) | \ - ((uint64_t)((const uint8_t*)(x))[5] << 16) | \ - ((uint64_t)((const uint8_t*)(x))[6] << 8) | \ - (uint64_t)((const uint8_t*)(x))[7]) -#endif -#ifndef AV_WB64 -# define AV_WB64(p, d) do { \ - ((uint8_t*)(p))[7] = (d); \ - ((uint8_t*)(p))[6] = (d)>>8; \ - ((uint8_t*)(p))[5] = (d)>>16; \ - ((uint8_t*)(p))[4] = (d)>>24; \ - ((uint8_t*)(p))[3] = (d)>>32; \ - ((uint8_t*)(p))[2] = (d)>>40; \ - ((uint8_t*)(p))[1] = (d)>>48; \ - ((uint8_t*)(p))[0] = (d)>>56; \ - } while(0) -#endif - -#ifndef AV_RL64 -# define AV_RL64(x) \ - (((uint64_t)((const uint8_t*)(x))[7] << 56) | \ - ((uint64_t)((const uint8_t*)(x))[6] << 48) | \ - ((uint64_t)((const uint8_t*)(x))[5] << 40) | \ - ((uint64_t)((const uint8_t*)(x))[4] << 32) | \ - ((uint64_t)((const uint8_t*)(x))[3] << 24) | \ - ((uint64_t)((const uint8_t*)(x))[2] << 16) | \ - ((uint64_t)((const uint8_t*)(x))[1] << 8) | \ - (uint64_t)((const uint8_t*)(x))[0]) -#endif -#ifndef AV_WL64 -# define AV_WL64(p, d) do { \ - ((uint8_t*)(p))[0] = (d); \ - ((uint8_t*)(p))[1] = (d)>>8; \ - ((uint8_t*)(p))[2] = (d)>>16; \ - ((uint8_t*)(p))[3] = (d)>>24; \ - ((uint8_t*)(p))[4] = (d)>>32; \ - ((uint8_t*)(p))[5] = (d)>>40; \ - ((uint8_t*)(p))[6] = (d)>>48; \ - ((uint8_t*)(p))[7] = (d)>>56; \ - } while(0) -#endif - -#if HAVE_BIGENDIAN -# define AV_RN(s, p) AV_RB##s(p) -# define 
AV_WN(s, p, v) AV_WB##s(p, v) -#else -# define AV_RN(s, p) AV_RL##s(p) -# define AV_WN(s, p, v) AV_WL##s(p, v) -#endif - -#endif /* HAVE_FAST_UNALIGNED */ - -#ifndef AV_RN16 -# define AV_RN16(p) AV_RN(16, p) -#endif - -#ifndef AV_RN32 -# define AV_RN32(p) AV_RN(32, p) -#endif - -#ifndef AV_RN64 -# define AV_RN64(p) AV_RN(64, p) -#endif - -#ifndef AV_WN16 -# define AV_WN16(p, v) AV_WN(16, p, v) -#endif - -#ifndef AV_WN32 -# define AV_WN32(p, v) AV_WN(32, p, v) -#endif - -#ifndef AV_WN64 -# define AV_WN64(p, v) AV_WN(64, p, v) -#endif - -#if HAVE_BIGENDIAN -# define AV_RB(s, p) AV_RN##s(p) -# define AV_WB(s, p, v) AV_WN##s(p, v) -# define AV_RL(s, p) bswap_##s(AV_RN##s(p)) -# define AV_WL(s, p, v) AV_WN##s(p, bswap_##s(v)) -#else -# define AV_RB(s, p) bswap_##s(AV_RN##s(p)) -# define AV_WB(s, p, v) AV_WN##s(p, bswap_##s(v)) -# define AV_RL(s, p) AV_RN##s(p) -# define AV_WL(s, p, v) AV_WN##s(p, v) -#endif - -#define AV_RB8(x) (((const uint8_t*)(x))[0]) -#define AV_WB8(p, d) do { ((uint8_t*)(p))[0] = (d); } while(0) - -#define AV_RL8(x) AV_RB8(x) -#define AV_WL8(p, d) AV_WB8(p, d) - -#ifndef AV_RB16 -# define AV_RB16(p) AV_RB(16, p) -#endif -#ifndef AV_WB16 -# define AV_WB16(p, v) AV_WB(16, p, v) -#endif - -#ifndef AV_RL16 -# define AV_RL16(p) AV_RL(16, p) -#endif -#ifndef AV_WL16 -# define AV_WL16(p, v) AV_WL(16, p, v) -#endif - -#ifndef AV_RB32 -# define AV_RB32(p) AV_RB(32, p) -#endif -#ifndef AV_WB32 -# define AV_WB32(p, v) AV_WB(32, p, v) -#endif - -#ifndef AV_RL32 -# define AV_RL32(p) AV_RL(32, p) -#endif -#ifndef AV_WL32 -# define AV_WL32(p, v) AV_WL(32, p, v) -#endif - -#ifndef AV_RB64 -# define AV_RB64(p) AV_RB(64, p) -#endif -#ifndef AV_WB64 -# define AV_WB64(p, v) AV_WB(64, p, v) -#endif - -#ifndef AV_RL64 -# define AV_RL64(p) AV_RL(64, p) -#endif -#ifndef AV_WL64 -# define AV_WL64(p, v) AV_WL(64, p, v) -#endif - -#ifndef AV_RB24 -# define AV_RB24(x) \ - ((((const uint8_t*)(x))[0] << 16) | \ - (((const uint8_t*)(x))[1] << 8) | \ - ((const uint8_t*)(x))[2]) 
-#endif -#ifndef AV_WB24 -# define AV_WB24(p, d) do { \ - ((uint8_t*)(p))[2] = (d); \ - ((uint8_t*)(p))[1] = (d)>>8; \ - ((uint8_t*)(p))[0] = (d)>>16; \ - } while(0) -#endif - -#ifndef AV_RL24 -# define AV_RL24(x) \ - ((((const uint8_t*)(x))[2] << 16) | \ - (((const uint8_t*)(x))[1] << 8) | \ - ((const uint8_t*)(x))[0]) -#endif -#ifndef AV_WL24 -# define AV_WL24(p, d) do { \ - ((uint8_t*)(p))[0] = (d); \ - ((uint8_t*)(p))[1] = (d)>>8; \ - ((uint8_t*)(p))[2] = (d)>>16; \ - } while(0) -#endif - -/* - * The AV_[RW]NA macros access naturally aligned data - * in a type-safe way. - */ - -#define AV_RNA(s, p) (((const av_alias##s*)(p))->u##s) -#define AV_WNA(s, p, v) (((av_alias##s*)(p))->u##s = (v)) - -#ifndef AV_RN16A -# define AV_RN16A(p) AV_RNA(16, p) -#endif - -#ifndef AV_RN32A -# define AV_RN32A(p) AV_RNA(32, p) -#endif - -#ifndef AV_RN64A -# define AV_RN64A(p) AV_RNA(64, p) -#endif - -#ifndef AV_WN16A -# define AV_WN16A(p, v) AV_WNA(16, p, v) -#endif - -#ifndef AV_WN32A -# define AV_WN32A(p, v) AV_WNA(32, p, v) -#endif - -#ifndef AV_WN64A -# define AV_WN64A(p, v) AV_WNA(64, p, v) -#endif - -/* Parameters for AV_COPY*, AV_SWAP*, AV_ZERO* must be - * naturally aligned. They may be implemented using MMX, - * so emms_c() must be called before using any float code - * afterwards. 
- */ - -#define AV_COPY(n, d, s) \ - (((av_alias##n*)(d))->u##n = ((const av_alias##n*)(s))->u##n) - -#ifndef AV_COPY16 -# define AV_COPY16(d, s) AV_COPY(16, d, s) -#endif - -#ifndef AV_COPY32 -# define AV_COPY32(d, s) AV_COPY(32, d, s) -#endif - -#ifndef AV_COPY64 -# define AV_COPY64(d, s) AV_COPY(64, d, s) -#endif - -#ifndef AV_COPY128 -# define AV_COPY128(d, s) \ - do { \ - AV_COPY64(d, s); \ - AV_COPY64((char*)(d)+8, (char*)(s)+8); \ - } while(0) -#endif - -#define AV_SWAP(n, a, b) FFSWAP(av_alias##n, *(av_alias##n*)(a), *(av_alias##n*)(b)) - -#ifndef AV_SWAP64 -# define AV_SWAP64(a, b) AV_SWAP(64, a, b) -#endif - -#define AV_ZERO(n, d) (((av_alias##n*)(d))->u##n = 0) - -#ifndef AV_ZERO16 -# define AV_ZERO16(d) AV_ZERO(16, d) -#endif - -#ifndef AV_ZERO32 -# define AV_ZERO32(d) AV_ZERO(32, d) -#endif - -#ifndef AV_ZERO64 -# define AV_ZERO64(d) AV_ZERO(64, d) -#endif - -#ifndef AV_ZERO128 -# define AV_ZERO128(d) \ - do { \ - AV_ZERO64(d); \ - AV_ZERO64((char*)(d)+8); \ - } while(0) -#endif - -#endif /* AVUTIL_INTREADWRITE_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/log.c --- a/ffmpeg_smp/h264dec/libavutil/log.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,111 +0,0 @@ -/* - * log functions - * Copyright (c) 2003 Michel Bardiaux - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * logging functions - */ -#include "error.h" -#include -#include -#include "log.h" - - -static int av_log_level = AV_LOG_INFO; - -static int use_ansi_color=-1; - -#undef fprintf -static void colored_fputs(int color, const char *str){ - if(use_ansi_color<0){ -#if HAVE_ISATTY && !defined(_WIN32) - use_ansi_color= getenv("TERM") && !getenv("NO_COLOR") && isatty(2); -#else - use_ansi_color= 0; -#endif - } - - if(use_ansi_color){ - fprintf(stderr, "\033[%d;3%dm", color>>4, color&15); - } - fputs(str, stderr); - if(use_ansi_color){ - fprintf(stderr, "\033[0m"); - } -} - -void av_log_default_callback(int level, const char* fmt, va_list vl) -{ - static int print_prefix=1; - static int count; - static char line[1024], prev[1024]; - static const uint8_t color[]={0x41,0x41,0x11,0x03,9,9,9}; - - if(level>av_log_level) - return; -#undef fprintf - - line[0]=0; - - vsnprintf(line + strlen(line), sizeof(line) - strlen(line), fmt, vl); - - print_prefix= line[strlen(line)-1] == '\n'; - if(print_prefix && !strcmp(line, prev)){ - count++; - return; - } - if(count>0){ - fprintf(stderr, " Last message repeated %d times\n", count); - count=0; - } - colored_fputs(color[av_clip(level>>3, 0, 6)], line); - strcpy(prev, line); -} - -static void (*av_log_callback)(int, const char*, va_list) = av_log_default_callback; - -void av_log(int level, const char *fmt, ...) 
-{ - va_list vl; - va_start(vl, fmt); - av_vlog(level, fmt, vl); - va_end(vl); -} - -void av_vlog(int level, const char *fmt, va_list vl) -{ - av_log_callback(level, fmt, vl); -} - -int av_log_get_level(void) -{ - return av_log_level; -} - -void av_log_set_level(int level) -{ - av_log_level = level; -} - -void av_log_set_callback(void (*callback)(int, const char*, va_list)) -{ - av_log_callback = callback; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/log.h --- a/ffmpeg_smp/h264dec/libavutil/log.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ -/* - * copyright (c) 2006 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_LOG_H -#define AVUTIL_LOG_H - -#include -//#include "avutil.h" - -/** - * Describes the class of an AVClass context structure. That is an - * arbitrary struct of which the first field is a pointer to an - * AVClass struct (e.g. AVCodecContext, AVFormatContext etc.). - */ -typedef struct { - /** - * The name of the class; usually it is the same name as the - * context structure type to which the AVClass is associated. 
- */ - const char* class_name; - - /** - * A pointer to a function which returns the name of a context - * instance ctx associated with the class. - */ - const char* (*item_name)(void* ctx); - - /** - * a pointer to the first option specified in the class if any or NULL - * - * @see av_set_default_options() - */ - const struct AVOption *option; - - /** - * LIBAVUTIL_VERSION with which this structure was created. - * This is used to allow fields to be added without requiring major - * version bumps everywhere. - */ - - int version; -} AVClass; - -/* av_log API */ - -#define AV_LOG_QUIET -8 - -/** - * Something went really wrong and we will crash now. - */ -#define AV_LOG_PANIC 0 - -/** - * Something went wrong and recovery is not possible. - * For example, no header was found for a format which depends - * on headers or an illegal combination of parameters is used. - */ -#define AV_LOG_FATAL 8 - -/** - * Something went wrong and cannot losslessly be recovered. - * However, not all future data is affected. - */ -#define AV_LOG_ERROR 16 - -/** - * Something somehow does not look correct. This may or may not - * lead to problems. An example would be the use of '-vstrict -2'. - */ -#define AV_LOG_WARNING 24 - -#define AV_LOG_INFO 32 -#define AV_LOG_VERBOSE 40 - -/** - * Stuff which is only useful for libav* developers. - */ -#define AV_LOG_DEBUG 48 - -/** - * Sends the specified message to the log if the level is less than or equal - * to the current av_log_level. By default, all logging messages are sent to - * stderr. This behavior can be altered by setting a different av_vlog callback - * function. - * - * @param avcl A pointer to an arbitrary struct of which the first field is a - * pointer to an AVClass struct. - * @param level The importance level of the message, lower values signifying - * higher importance. - * @param fmt The format string (printf-compatible) that specifies how - * subsequent arguments are converted to output. 
- * @see av_vlog - */ - -void av_log(int level, const char *fmt, ...); - -void av_vlog(int level, const char *fmt, va_list); -int av_log_get_level(void); -void av_log_set_level(int); -void av_log_set_callback(void (*)(int, const char*, va_list)); -void av_log_default_callback(int level, const char* fmt, va_list vl); - -#endif /* AVUTIL_LOG_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/mem.c --- a/ffmpeg_smp/h264dec/libavutil/mem.c Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,127 +0,0 @@ -/* - * default memory allocator for libavutil - * Copyright (c) 2002 Fabrice Bellard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * default memory allocator for libavutil - */ - -#include "config.h" - -#include -#include -#include -#include -#if HAVE_MALLOC_H -#include -#endif - -#include "mem.h" - -/* here we can use OS-dependent allocation functions */ -#undef free -#undef malloc -#undef realloc - -#ifdef MALLOC_PREFIX - -#define malloc AV_JOIN(MALLOC_PREFIX, malloc) -#define memalign AV_JOIN(MALLOC_PREFIX, memalign) -#define posix_memalign AV_JOIN(MALLOC_PREFIX, posix_memalign) -#define realloc AV_JOIN(MALLOC_PREFIX, realloc) -#define free AV_JOIN(MALLOC_PREFIX, free) - -void *malloc(size_t size); -void *memalign(size_t align, size_t size); -int posix_memalign(void **ptr, size_t align, size_t size); -void *realloc(void *ptr, size_t size); -void free(void *ptr); - -#endif /* MALLOC_PREFIX */ - - -/* You can redefine av_malloc and av_free in your project to use your - memory allocator. You do not need to suppress this file because the - linker will do it automatically. */ - -void *av_malloc(unsigned int size) -{ - void *ptr = NULL; - /* let's disallow possible ambiguous cases */ - if(size > (INT_MAX-16) ) - return NULL; - -//FIXME: when no aligned mallocs vector code should be disabled. 
-#if HAVE_POSIX_MEMALIGN - if (posix_memalign(&ptr,16,size)) - ptr = NULL; -#elif HAVE_MEMALIGN - ptr = memalign(16,size); -#else - ptr = malloc(size); -#endif - return ptr; -} - -void *av_realloc(void *ptr, unsigned int size) -{ - /* let's disallow possible ambiguous cases */ - if(size > (INT_MAX-16) ) - return NULL; - - return realloc(ptr, size); - -} - -void av_free(void *ptr) -{ - /* XXX: this test should not be needed on most libcs */ - if (ptr) - free(ptr); - -} - -void av_freep(void *arg) -{ - void **ptr= (void**)arg; - av_free(*ptr); - *ptr = NULL; -} - -void *av_mallocz(unsigned int size) -{ - void *ptr = av_malloc(size); - if (ptr) - memset(ptr, 0, size); - return ptr; -} - -char *av_strdup(const char *s) -{ - char *ptr= NULL; - if(s){ - int len = strlen(s) + 1; - ptr = av_malloc(len); - if (ptr) - memcpy(ptr, s, len); - } - return ptr; -} diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/mem.h --- a/ffmpeg_smp/h264dec/libavutil/mem.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,143 +0,0 @@ -/* - * copyright (c) 2006 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * memory handling functions - */ - -#ifndef AVUTIL_MEM_H -#define AVUTIL_MEM_H - -#include "attributes.h" -#include "config.h" - -#define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v -#define DECLARE_ALIGNED_16(t,v) t __attribute__ ((aligned (16))) v -#define DECLARE_ASM_CONST(n,t,v) static const t __attribute__((used)) __attribute__ ((aligned (n))) v - -#if AV_GCC_VERSION_AT_LEAST(3,1) - #define av_malloc_attrib __attribute__((__malloc__)) -#else - #define av_malloc_attrib -#endif - -/** - * Allocates a block of size bytes with alignment suitable for all - * memory accesses (including vectors if available on the CPU). - * @param size Size in bytes for the memory block to be allocated. - * @return Pointer to the allocated block, NULL if the block cannot - * be allocated. - * @see av_mallocz() - */ -void *av_malloc(unsigned int size) av_malloc_attrib; - -/** - * Allocates or reallocates a block of memory. - * If ptr is NULL and size > 0, allocates a new block. If - * size is zero, frees the memory block pointed to by ptr. - * @param size Size in bytes for the memory block to be allocated or - * reallocated. - * @param ptr Pointer to a memory block already allocated with - * av_malloc(z)() or av_realloc() or NULL. - * @return Pointer to a newly reallocated block or NULL if the block - * cannot be reallocated or the function is used to free the memory block. - * @see av_fast_realloc() - */ -void *av_realloc(void *ptr, unsigned int size); - -/** - * Reallocates the given block if it is not large enough, otherwise it - * does nothing. - * - * @see av_realloc - */ -void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size); - -/** - * Allocates a buffer, reusing the given one if large enough. 
- * - * Contrary to av_fast_realloc the current buffer contents might not be - * preserved and on error the old buffer is freed, thus no special - * handling to avoid memleaks is necessary. - * - * @param ptr pointer to pointer to already allocated buffer, overwritten with pointer to new buffer - * @param size size of the buffer *ptr points to - * @param min_size minimum size of *ptr buffer after returning, *ptr will be NULL and - * *size 0 if an error occurred. - */ -void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size); - -/** - * Frees a memory block which has been allocated with av_malloc(z)() or - * av_realloc(). - * @param ptr Pointer to the memory block which should be freed. - * @note ptr = NULL is explicitly allowed. - * @note It is recommended that you use av_freep() instead. - * @see av_freep() - */ - -void av_free(void *ptr); - -/** - * Allocates a block of size bytes with alignment suitable for all - * memory accesses (including vectors if available on the CPU) and - * zeroes all the bytes of the block. - * @param size Size in bytes for the memory block to be allocated. - * @return Pointer to the allocated block, NULL if it cannot be allocated. - * @see av_malloc() - */ -void *av_mallocz(unsigned int size) av_malloc_attrib; - -/** - * Duplicates the string s. - * @param s string to be duplicated - * @return Pointer to a newly allocated string containing a - * copy of s or NULL if the string cannot be allocated. - */ -char *av_strdup(const char *s) av_malloc_attrib; - -/** - * Frees a memory block which has been allocated with av_malloc(z)() or - * av_realloc() and set the pointer pointing to it to NULL. - * @param ptr Pointer to the pointer to the memory block which should - * be freed. 
- * @see av_free() - */ -void av_freep(void *ptr); - - -static av_always_inline uint32_t pack16to32(int a, int b){ -#if HAVE_BIGENDIAN - return (b&0xFFFF) + (a<<16); -#else - return (a&0xFFFF) + (b<<16); -#endif -} - -static av_always_inline uint16_t pack8to16(int a, int b){ -#if HAVE_BIGENDIAN - return (b&0xFF) + (a<<8); -#else - return (a&0xFF) + (b<<8); -#endif -} - -#endif /* AVUTIL_MEM_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/pixfmt.h --- a/ffmpeg_smp/h264dec/libavutil/pixfmt.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,161 +0,0 @@ -/* - * copyright (c) 2006 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_PIXFMT_H -#define AVUTIL_PIXFMT_H - -/** - * @file - * pixel format definitions - * - * @warning This file has to be considered an internal but installed - * header, so it should not be directly included in your projects. - */ - -/** - * Pixel format. Notes: - * - * PIX_FMT_RGB32 is handled in an endian-specific manner. An RGBA - * color is put together as: - * (A << 24) | (R << 16) | (G << 8) | B - * This is stored as BGRA on little-endian CPU architectures and ARGB on - * big-endian CPUs. 
- * - * When the pixel format is palettized RGB (PIX_FMT_PAL8), the palettized - * image data is stored in AVFrame.data[0]. The palette is transported in - * AVFrame.data[1], is 1024 bytes long (256 4-byte entries) and is - * formatted the same as in PIX_FMT_RGB32 described above (i.e., it is - * also endian-specific). Note also that the individual RGB palette - * components stored in AVFrame.data[1] should be in the range 0..255. - * This is important as many custom PAL8 video codecs that were designed - * to run on the IBM VGA graphics adapter use 6-bit palette components. - * - * For all the 8bit per pixel formats, an RGB32 palette is in data[1] like - * for pal8. This palette is filled in automatically by the function - * allocating the picture. - * - * Note, make sure that all newly added big endian formats have pix_fmt&1==1 - * and that all newly added little endian formats have pix_fmt&1==0 - * this allows simpler detection of big vs little endian. - */ -enum PixelFormat { - PIX_FMT_NONE= -1, - PIX_FMT_YUV420P, ///< planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples) - PIX_FMT_YUYV422, ///< packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr - PIX_FMT_RGB24, ///< packed RGB 8:8:8, 24bpp, RGBRGB... - PIX_FMT_BGR24, ///< packed RGB 8:8:8, 24bpp, BGRBGR... 
- PIX_FMT_YUV422P, ///< planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples) - PIX_FMT_YUV444P, ///< planar YUV 4:4:4, 24bpp, (1 Cr & Cb sample per 1x1 Y samples) - PIX_FMT_YUV410P, ///< planar YUV 4:1:0, 9bpp, (1 Cr & Cb sample per 4x4 Y samples) - PIX_FMT_YUV411P, ///< planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples) - PIX_FMT_GRAY8, ///< Y , 8bpp - PIX_FMT_MONOWHITE, ///< Y , 1bpp, 0 is white, 1 is black - PIX_FMT_MONOBLACK, ///< Y , 1bpp, 0 is black, 1 is white - PIX_FMT_PAL8, ///< 8 bit with PIX_FMT_RGB32 palette - PIX_FMT_YUVJ420P, ///< planar YUV 4:2:0, 12bpp, full scale (JPEG) - PIX_FMT_YUVJ422P, ///< planar YUV 4:2:2, 16bpp, full scale (JPEG) - PIX_FMT_YUVJ444P, ///< planar YUV 4:4:4, 24bpp, full scale (JPEG) - PIX_FMT_XVMC_MPEG2_MC,///< XVideo Motion Acceleration via common packet passing - PIX_FMT_XVMC_MPEG2_IDCT, - PIX_FMT_UYVY422, ///< packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1 - PIX_FMT_UYYVYY411, ///< packed YUV 4:1:1, 12bpp, Cb Y0 Y1 Cr Y2 Y3 - PIX_FMT_BGR8, ///< packed RGB 3:3:2, 8bpp, (msb)2B 3G 3R(lsb) - PIX_FMT_BGR4, ///< packed RGB 1:2:1, 4bpp, (msb)1B 2G 1R(lsb) - PIX_FMT_BGR4_BYTE, ///< packed RGB 1:2:1, 8bpp, (msb)1B 2G 1R(lsb) - PIX_FMT_RGB8, ///< packed RGB 3:3:2, 8bpp, (msb)2R 3G 3B(lsb) - PIX_FMT_RGB4, ///< packed RGB 1:2:1, 4bpp, (msb)1R 2G 1B(lsb) - PIX_FMT_RGB4_BYTE, ///< packed RGB 1:2:1, 8bpp, (msb)1R 2G 1B(lsb) - PIX_FMT_NV12, ///< planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 for UV - PIX_FMT_NV21, ///< as above, but U and V bytes are swapped - - PIX_FMT_ARGB, ///< packed ARGB 8:8:8:8, 32bpp, ARGBARGB... - PIX_FMT_RGBA, ///< packed RGBA 8:8:8:8, 32bpp, RGBARGBA... - PIX_FMT_ABGR, ///< packed ABGR 8:8:8:8, 32bpp, ABGRABGR... - PIX_FMT_BGRA, ///< packed BGRA 8:8:8:8, 32bpp, BGRABGRA... 
- - PIX_FMT_GRAY16BE, ///< Y , 16bpp, big-endian - PIX_FMT_GRAY16LE, ///< Y , 16bpp, little-endian - PIX_FMT_YUV440P, ///< planar YUV 4:4:0 (1 Cr & Cb sample per 1x2 Y samples) - PIX_FMT_YUVJ440P, ///< planar YUV 4:4:0 full scale (JPEG) - PIX_FMT_YUVA420P, ///< planar YUV 4:2:0, 20bpp, (1 Cr & Cb sample per 2x2 Y & A samples) - PIX_FMT_VDPAU_H264,///< H.264 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers - PIX_FMT_VDPAU_MPEG1,///< MPEG-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers - PIX_FMT_VDPAU_MPEG2,///< MPEG-2 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers - PIX_FMT_VDPAU_WMV3,///< WMV3 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers - PIX_FMT_VDPAU_VC1, ///< VC-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers - PIX_FMT_RGB48BE, ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, big-endian - PIX_FMT_RGB48LE, ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, little-endian - - PIX_FMT_RGB565BE, ///< packed RGB 5:6:5, 16bpp, (msb) 5R 6G 5B(lsb), big-endian - PIX_FMT_RGB565LE, ///< packed RGB 5:6:5, 16bpp, (msb) 5R 6G 5B(lsb), little-endian - PIX_FMT_RGB555BE, ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), big-endian, most significant bit to 0 - PIX_FMT_RGB555LE, ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), little-endian, most significant bit to 0 - - PIX_FMT_BGR565BE, ///< packed BGR 5:6:5, 16bpp, (msb) 5B 6G 5R(lsb), big-endian - PIX_FMT_BGR565LE, ///< packed BGR 5:6:5, 16bpp, (msb) 5B 6G 
5R(lsb), little-endian - PIX_FMT_BGR555BE, ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), big-endian, most significant bit to 1 - PIX_FMT_BGR555LE, ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), little-endian, most significant bit to 1 - - PIX_FMT_VAAPI_MOCO, ///< HW acceleration through VA API at motion compensation entry-point, Picture.data[3] contains a vaapi_render_state struct which contains macroblocks as well as various fields extracted from headers - PIX_FMT_VAAPI_IDCT, ///< HW acceleration through VA API at IDCT entry-point, Picture.data[3] contains a vaapi_render_state struct which contains fields extracted from headers - PIX_FMT_VAAPI_VLD, ///< HW decoding through VA API, Picture.data[3] contains a vaapi_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers - - PIX_FMT_YUV420P16LE, ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian - PIX_FMT_YUV420P16BE, ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian - PIX_FMT_YUV422P16LE, ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian - PIX_FMT_YUV422P16BE, ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian - PIX_FMT_YUV444P16LE, ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian - PIX_FMT_YUV444P16BE, ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian - PIX_FMT_VDPAU_MPEG4, ///< MPEG4 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers - PIX_FMT_DXVA2_VLD, ///< HW decoding through DXVA2, Picture.data[3] contains a LPDIRECT3DSURFACE9 pointer - - PIX_FMT_RGB444BE, ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), big-endian, most significant bits to 0 - PIX_FMT_RGB444LE, ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), little-endian, most significant bits 
to 0 - PIX_FMT_BGR444BE, ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), big-endian, most significant bits to 1 - PIX_FMT_BGR444LE, ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), little-endian, most significant bits to 1 - PIX_FMT_Y400A, ///< 8bit gray, 8bit alpha - PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions -}; - -#if HAVE_BIGENDIAN -# define PIX_FMT_NE(be, le) PIX_FMT_##be -#else -# define PIX_FMT_NE(be, le) PIX_FMT_##le -#endif - -#define PIX_FMT_RGB32 PIX_FMT_NE(ARGB, BGRA) -#define PIX_FMT_RGB32_1 PIX_FMT_NE(RGBA, ABGR) -#define PIX_FMT_BGR32 PIX_FMT_NE(ABGR, RGBA) -#define PIX_FMT_BGR32_1 PIX_FMT_NE(BGRA, ARGB) - -#define PIX_FMT_GRAY16 PIX_FMT_NE(GRAY16BE, GRAY16LE) -#define PIX_FMT_RGB48 PIX_FMT_NE(RGB48BE, RGB48LE) -#define PIX_FMT_RGB565 PIX_FMT_NE(RGB565BE, RGB565LE) -#define PIX_FMT_RGB555 PIX_FMT_NE(RGB555BE, RGB555LE) -#define PIX_FMT_RGB444 PIX_FMT_NE(RGB444BE, RGB444LE) -#define PIX_FMT_BGR565 PIX_FMT_NE(BGR565BE, BGR565LE) -#define PIX_FMT_BGR555 PIX_FMT_NE(BGR555BE, BGR555LE) -#define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE) - -#define PIX_FMT_YUV420P16 PIX_FMT_NE(YUV420P16BE, YUV420P16LE) -#define PIX_FMT_YUV422P16 PIX_FMT_NE(YUV422P16BE, YUV422P16LE) -#define PIX_FMT_YUV444P16 PIX_FMT_NE(YUV444P16BE, YUV444P16LE) - -#endif /* AVUTIL_PIXFMT_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/ppc/intreadwrite.h --- a/ffmpeg_smp/h264dec/libavutil/ppc/intreadwrite.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_PPC_INTREADWRITE_H -#define AVUTIL_PPC_INTREADWRITE_H - -#include -#include "config.h" - -#if HAVE_XFORM_ASM - -#define AV_RL16 AV_RL16 -static av_always_inline uint16_t AV_RL16(const void *p) -{ - uint16_t v; - __asm__ ("lhbrx %0, %y1" : "=r"(v) : "Z"(*(const uint16_t*)p)); - return v; -} - -#define AV_WL16 AV_WL16 -static av_always_inline void AV_WL16(void *p, uint16_t v) -{ - __asm__ ("sthbrx %1, %y0" : "=Z"(*(uint16_t*)p) : "r"(v)); -} - -#define AV_RL32 AV_RL32 -static av_always_inline uint32_t AV_RL32(const void *p) -{ - uint32_t v; - __asm__ ("lwbrx %0, %y1" : "=r"(v) : "Z"(*(const uint32_t*)p)); - return v; -} - -#define AV_WL32 AV_WL32 -static av_always_inline void AV_WL32(void *p, uint32_t v) -{ - __asm__ ("stwbrx %1, %y0" : "=Z"(*(uint32_t*)p) : "r"(v)); -} - -#if HAVE_LDBRX - -#define AV_RL64 AV_RL64 -static av_always_inline uint64_t AV_RL64(const void *p) -{ - uint64_t v; - __asm__ ("ldbrx %0, %y1" : "=r"(v) : "Z"(*(const uint64_t*)p)); - return v; -} - -#define AV_WL64 AV_WL64 -static av_always_inline void AV_WL64(void *p, uint64_t v) -{ - __asm__ ("stdbrx %1, %y0" : "=Z"(*(uint64_t*)p) : "r"(v)); -} - -#else - -#define AV_RL64 AV_RL64 -static av_always_inline uint64_t AV_RL64(const void *p) -{ - union { uint64_t v; uint32_t hl[2]; } v; - __asm__ ("lwbrx %0, %y2 \n\t" - "lwbrx %1, %y3 \n\t" - : "=&r"(v.hl[1]), "=r"(v.hl[0]) - : "Z"(*(const uint32_t*)p), "Z"(*((const uint32_t*)p+1))); - return v.v; -} - -#define AV_WL64 AV_WL64 -static av_always_inline void 
AV_WL64(void *p, uint64_t v) -{ - union { uint64_t v; uint32_t hl[2]; } vv = { v }; - __asm__ ("stwbrx %2, %y0 \n\t" - "stwbrx %3, %y1 \n\t" - : "=Z"(*(uint32_t*)p), "=Z"(*((uint32_t*)p+1)) - : "r"(vv.hl[1]), "r"(vv.hl[0])); -} - -#endif /* HAVE_LDBRX */ - -#endif /* HAVE_XFORM_ASM */ - -/* - * GCC fails miserably on the packed struct version which is used by - * default, so we override it here. - */ - -#define AV_RB64(p) (*(const uint64_t *)(p)) -#define AV_WB64(p, v) (*(uint64_t *)(p) = (v)) - -#endif /* AVUTIL_PPC_INTREADWRITE_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/ppc/timer.h --- a/ffmpeg_smp/h264dec/libavutil/ppc/timer.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2005 Luca Barbato - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_PPC_TIMER_H -#define AVUTIL_PPC_TIMER_H - -#include - -#define AV_READ_TIME read_time - -static inline uint64_t read_time(void) -{ - uint32_t tbu, tbl, temp; - - /* from section 2.2.1 of the 32-bit PowerPC PEM */ - __asm__ volatile( - "1:\n" - "mftbu %2\n" - "mftb %0\n" - "mftbu %1\n" - "cmpw %2,%1\n" - "bne 1b\n" - : "=r"(tbl), "=r"(tbu), "=r"(temp) - : - : "cc"); - - return (((uint64_t)tbu)<<32) | (uint64_t)tbl; -} - -#endif /* AVUTIL_PPC_TIMER_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/timer.h --- a/ffmpeg_smp/h264dec/libavutil/timer.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ -/** - * @file - * high precision timer, useful to profile code - * - * copyright (c) 2006 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_TIMER_H -#define AVUTIL_TIMER_H - -#include -#include -#include "config.h" - -#if ARCH_ARM -# include "arm/timer.h" -#elif ARCH_PPC -# include "ppc/timer.h" -#elif ARCH_X86 -# include "x86/timer.h" -#endif - -#if !defined(AV_READ_TIME) && HAVE_GETHRTIME -# define AV_READ_TIME gethrtime -#endif - -#ifdef AV_READ_TIME -#define START_TIMER \ -uint64_t tend;\ -uint64_t tstart= AV_READ_TIME();\ - -#define STOP_TIMER(id) \ -tend= AV_READ_TIME();\ -{\ - static uint64_t tsum=0;\ - static int tcount=0;\ - static int tskip_count=0;\ - if(tcount<2 || tend - tstart < 8*tsum/tcount || tend - tstart < 2000){\ - tsum+= tend - tstart;\ - tcount++;\ - }else\ - tskip_count++;\ - if(((tcount+tskip_count)&(tcount+tskip_count-1))==0){\ - av_log(NULL, AV_LOG_ERROR, "%"PRIu64" dezicycles in %s, %d runs, %d skips\n",\ - tsum*10/tcount, id, tcount, tskip_count);\ - }\ -} -#else -#define START_TIMER -#define STOP_TIMER(id) {} -#endif - -#endif /* AVUTIL_TIMER_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/x86/bswap.h --- a/ffmpeg_smp/h264dec/libavutil/x86/bswap.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * byte swapping routines - */ - -#ifndef AVUTIL_X86_BSWAP_H -#define AVUTIL_X86_BSWAP_H - -#include -#include "config.h" -#include "libavutil/attributes.h" - -#define bswap_16 bswap_16 -static av_always_inline av_const uint16_t bswap_16(uint16_t x) -{ - __asm__("rorw $8, %0" : "+r"(x)); - return x; -} - -#define bswap_32 bswap_32 -static av_always_inline av_const uint32_t bswap_32(uint32_t x) -{ -// #if HAVE_BSWAP - __asm__("bswap %0" : "+r" (x)); -// #else -// __asm__("rorw $8, %w0 \n\t" -// "rorl $16, %0 \n\t" -// "rorw $8, %w0" -// : "+r"(x)); -// #endif - return x; -} - -#if ARCH_X86_64 -#define bswap_64 bswap_64 -static inline uint64_t av_const bswap_64(uint64_t x) -{ - __asm__("bswap %0": "=r" (x) : "0" (x)); - return x; -} -#endif - -#endif /* AVUTIL_X86_BSWAP_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/x86/intreadwrite.h --- a/ffmpeg_smp/h264dec/libavutil/x86/intreadwrite.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2010 Alexander Strange - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_X86_INTREADWRITE_H -#define AVUTIL_X86_INTREADWRITE_H - -#include -#include "config.h" -#include "libavutil/attributes.h" - -#if HAVE_MMX - -#if defined(__MMX__) - -#define AV_COPY64 AV_COPY64 -static av_always_inline void AV_COPY64(void *d, const void *s) -{ - __asm__("movq %1, %%mm0 \n\t" - "movq %%mm0, %0 \n\t" - : "=m"(*(uint64_t*)d) - : "m" (*(const uint64_t*)s) - : "mm0"); -} - -#define AV_SWAP64 AV_SWAP64 -static av_always_inline void AV_SWAP64(void *a, void *b) -{ - __asm__("movq %1, %%mm0 \n\t" - "movq %0, %%mm1 \n\t" - "movq %%mm0, %0 \n\t" - "movq %%mm1, %1 \n\t" - : "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b) - ::"mm0", "mm1"); -} - -#define AV_ZERO64 AV_ZERO64 -static av_always_inline void AV_ZERO64(void *d) -{ - __asm__("pxor %%mm0, %%mm0 \n\t" - "movq %%mm0, %0 \n\t" - : "=m"(*(uint64_t*)d) - :: "mm0"); -} - -#endif /* !HAVE_FAST_64BIT && defined(__MMX__) */ - -#ifdef __SSE__ - -#define AV_COPY128 AV_COPY128 -static av_always_inline void AV_COPY128(void *d, const void *s) -{ - struct v {uint64_t v[2];}; - - __asm__("movaps %1, %%xmm0 \n\t" - "movaps %%xmm0, %0 \n\t" - : "=m"(*(struct v*)d) - : "m" (*(const struct v*)s) - : "xmm0"); -} - -#endif /* __SSE__ */ - -#ifdef __SSE2__ - -#define AV_ZERO128 AV_ZERO128 -static av_always_inline void AV_ZERO128(void *d) -{ - struct v {uint64_t v[2];}; - - __asm__("pxor %%xmm0, %%xmm0 \n\t" - "movdqa %%xmm0, %0 \n\t" - : "=m"(*(struct v*)d) - :: "xmm0"); -} - -#endif /* __SSE2__ */ - -#endif /* HAVE_MMX */ - -#endif /* AVUTIL_X86_INTREADWRITE_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/x86/timer.h --- a/ffmpeg_smp/h264dec/libavutil/x86/timer.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -/* - * 
copyright (c) 2006 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_X86_TIMER_H -#define AVUTIL_X86_TIMER_H - -#include - -#define AV_READ_TIME read_time - -static inline uint64_t read_time(void) -{ - uint32_t a, d; - __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); - return ((uint64_t)d << 32) + a; -} - -#endif /* AVUTIL_X86_TIMER_H */ diff -r 11d15c47beaf -r 897f711a7157 ffmpeg_smp/h264dec/libavutil/x86_cpu.h --- a/ffmpeg_smp/h264dec/libavutil/x86_cpu.h Mon Aug 27 12:09:56 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -/* - * copyright (c) 2006 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVUTIL_X86_CPU_H -#define AVUTIL_X86_CPU_H - -#include -#include "config.h" - -#if ARCH_X86_64 -# define REG_a "rax" -# define REG_b "rbx" -# define REG_c "rcx" -# define REG_d "rdx" -# define REG_D "rdi" -# define REG_S "rsi" -# define PTR_SIZE "8" -typedef int64_t x86_reg; - -# define REG_SP "rsp" -# define REG_BP "rbp" -# define REGBP rbp -# define REGa rax -# define REGb rbx -# define REGc rcx -# define REGd rdx -# define REGSP rsp - -#elif ARCH_X86_32 - -# define REG_a "eax" -# define REG_b "ebx" -# define REG_c "ecx" -# define REG_d "edx" -# define REG_D "edi" -# define REG_S "esi" -# define PTR_SIZE "4" -typedef int32_t x86_reg; - -# define REG_SP "esp" -# define REG_BP "ebp" -# define REGBP ebp -# define REGa eax -# define REGb ebx -# define REGc ecx -# define REGd edx -# define REGSP esp -#else -typedef int x86_reg; -#endif - -// #if ARCH_X86_64 && defined(PIC) -// # define BROKEN_RELOCATIONS 1 -// #endif - -#endif /* AVUTIL_X86_CPU_H */ diff -r 11d15c47beaf -r 897f711a7157 h264dec.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/h264dec.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,296 @@ +/* +* H264 decoder main +*/ + +#include "config.h" +#include "libavcodec/h264.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + + +static const char program_name[] = "h264dec"; +static const int program_birth_year = 2010; + +static const char *file_name; +static int ifile, ofile; +static int no_arch =0; +static int parallel = 1; +static int frame_width = 0; +static int frame_height = 0; + +/* Terminate the process with status ret (single exit point; no cleanup is performed yet). */ +static void av_exit(int ret) +{ + //do some free calls +#undef exit + exit(ret); +} + +/* Open filename read-only into ifile, decode the first frame only to learn frame_width/frame_height, then rewind. */ +static void opt_input_file(const char *filename) +{ + /* open the input file */ + 
ifile = open(filename, O_RDONLY, 0666); + if (ifile < 0){ + fprintf(stderr, "Failed to open %s\n", filename); + av_exit(-1); + } + + //parse first frame to get resolution (other information available but not used) + H264Slice slice; + PictureInfo pi; + GetBitContext gb = {0,}; + ParserContext *pc; + NalContext *nc; + + pc = get_parse_context(ifile); + nc = get_nal_context(0, 0); + + memset(&slice, 0, sizeof(H264Slice)); + slice.current_picture_info=&pi; /* the slice borrows the local PictureInfo for this probe only */ + + av_read_frame_internal(pc, &gb); + decode_nal_units(nc, &slice, &gb); + + frame_width = nc->width; + frame_height= nc->height; + + //clean up + av_freep(&gb.raw); + if (gb.rbsp) + av_freep(&gb.rbsp); + free_parse_context(pc); + free_nal_context(nc); + + //rewind file + int offset; + if ( (offset=lseek(ifile, 0, SEEK_SET)) ){ + fprintf(stderr, "Rewind input file %s failed at offset %d\n", filename, offset); + } + +} + +/* Open (create or truncate) the output file into ofile and exit on failure; a NULL name leaves ofile at 0. */ +static void opt_output_file(const char *filename) +{ + if (filename){ + if (!strcmp(filename, "-")) + filename = "pipe:"; + + ofile = open(filename, O_CREAT | O_TRUNC | O_WRONLY, 0666); + if (ofile < 0){ /* report the failure here, like opt_input_file, instead of handing -1 to the decoder */ + fprintf(stderr, "Failed to open %s\n", filename); + av_exit(-1); + } + }else{ + ofile =0; + } +} + +/* Print the one-line command synopsis to stdout. */ +static void show_usage(void) +{ + printf("usage: %s [options] -i infile [-o outfile]\n", program_name); + printf("\n"); +} + +static struct option long_options[] = { /* entries 0-6 have val 0 and are dispatched by table index in parse_cmd: keep their order */ + {"static-sched", 0, 0, 0}, + {"static-mbd", 0, 0, 0}, + {"numamap", 0, 0, 0}, + {"no-mbd", 0, 0, 0}, + {"static-3d", 0, 0, 0}, + {"slice-bufs", 1, 0, 0}, + {"smt", 0, 0, 0}, + {"noarch", 0, 0, 'a'}, + {"display", 0, 0, 'd'}, + {"fullscreen", 0, 0, 'f'}, + {"numframes", 1, 0, 'n'}, + {"use-ppe-ed", 1, 0, 'p'}, + {"sequential", 0, 0, 's'}, + {"threads", 1, 0, 't'}, + {"verbose", 1, 0, 'v'}, + {"wave-order", 1, 0, 'w'}, + {"smb-size", 1, 0, 'z'}, + {"pipe-bufs", 1, 0, 'e'}, + {0, 0, 0, 0} +}; + +static h264_options cli_opts; +/* Reset cli_opts to its defaults, then parse argv with getopt_long; -i and -o open their files immediately and invalid values exit via av_exit. */ +static void parse_cmd(int argc, char **argv) +{ + int c; + int digit_optind = 0; + int option_index = 0; + extern char *optarg; + extern int optind, optopt; + + cli_opts.statsched =0; + 
cli_opts.numamap =0; + cli_opts.statmbd =0; + cli_opts.no_mbd= 0; + cli_opts.numframes = INT_MAX; + cli_opts.display=0; + cli_opts.fullscreen=0; + cli_opts.verbose=0; + cli_opts.ppe_ed=0; + cli_opts.profile=0; + cli_opts.threads = 1; + cli_opts.smb_size[0] = cli_opts.smb_size[1] = 1; + cli_opts.wave_order=0; + cli_opts.static_3d=0; + cli_opts.pipe_bufs=8; + cli_opts.slice_bufs=1; + cli_opts.smt= 0; + while ((c = getopt_long(argc, argv, "ade:fi:n:o:p:st:vwz:", long_options, &option_index)) != -1 ){ + int this_option_optind = optind ? optind : 1; + + switch (c){ + case 0: + if (option_index==0){ + cli_opts.statsched=1; + }else if (option_index==1){ + cli_opts.statmbd= 1; + }else if (option_index==2){ + cli_opts.numamap= 1; + }else if (option_index==3){ + cli_opts.no_mbd= 1; + }else if (option_index==4){ + cli_opts.static_3d= 1; + }else if (option_index==5){ + cli_opts.slice_bufs= (unsigned) atoi(optarg); + }else if (option_index==6){ + cli_opts.smt= 1; + } + break; + case '0': + case '1': + case '2': + if (digit_optind != 0 && digit_optind != this_option_optind) + printf("digits occur in two different argv-elements.\n"); + digit_optind = this_option_optind; + printf("option %c\n", c); + break; + case 'a': + no_arch=1; + break; + case 'd': + cli_opts.display=1; + break; + case 'f': + cli_opts.fullscreen=1; + break; + case 'i': + file_name = (const char *)optarg; + opt_input_file(file_name); + break; + case 'n': + cli_opts.numframes = (unsigned) atoi(optarg); + break; + case 'o': + opt_output_file(optarg); /* passed straight through: the callee neither modifies nor keeps the string, so no fixed-size copy (and no overflow) */ + break; + case 'p': + cli_opts.profile = (unsigned) atoi(optarg); + break; + case 's': + cli_opts.threads = 0; + parallel = 0; + break; + case 't': + cli_opts.threads = atoi(optarg); + if (cli_opts.threads<=0){ + fprintf(stderr, "Option -%c requires thread numbers > 0\n", c); + av_exit(-1); + } + break; + case 'v': + cli_opts.verbose = 1; + break; + case 'w': + cli_opts.wave_order = 1; + break; + case 'z': // only useful in ompss + 
if (argc < optind +1){ + fprintf(stderr, "Option -%c (--smb-size) requires 2 arguments\n", c); + av_exit(-1); + } + optind--; /* re-read the first dimension from argv so both values are parsed by the same loop */ + for (int i=0; i<2; i++){ + cli_opts.smb_size[i] = atoi(argv[optind++]); + if (!(cli_opts.smb_size[i] > 0)){ /* check the value just parsed, not the address of the array */ + fprintf(stderr, "Option -%c (--smb-size) requires dimensions > 0\n", c); + av_exit(-1); + } + } + break; + case 'e': + cli_opts.pipe_bufs = atoi(optarg); + break; + case ':': + fprintf(stderr, "Option -%c requires an operand\n", optopt); + av_exit(-1); + break; + case '?': + fprintf(stderr, "Unrecognized option: -%c\n", optopt); + av_exit(-1); + break; + } + } + +} + +/* Entry point: parse options, build the decoder context and run the decode variant selected by OMPSS, ARCH_CELL, no_arch and parallel. */ +int main(int argc, char **argv) +{ + /* parse options */ + parse_cmd(argc, argv); + + if(!ifile ) { + show_usage(); + av_exit(1); + } + + H264Context *h = get_h264dec_context(file_name, ifile, ofile, frame_width, frame_height, &cli_opts); +#if OMPSS + if (h264_decode_ompss( h ) < 0) + av_exit(-1); +#else + if (parallel){ + if (ARCH_CELL && !no_arch){ + if (h264_decode_cell( h ) < 0) + av_exit(-1); + }else{ + if (h264_decode_pthread( h ) < 0) + av_exit(1); + } + }else{ + if (ARCH_CELL && !no_arch){ + if (h264_decode_cell_seq( h ) < 0) + av_exit(1); + }else{ + if (h264_decode_seq( h ) < 0) + av_exit(1); + } + } +#endif + free_h264dec_context(h); + close(ifile); + close(ofile); + + return 0; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/aac.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/aac.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_AAC_H +#define AVCODEC_ARM_AAC_H + +#include "config.h" + +#if HAVE_NEON && HAVE_INLINE_ASM /* NEON inline-asm VMUL helpers. Each one looks up entries of v selected by bit fields of idx, multiplies them by *scale, stores them at dst and returns dst advanced past the stored floats. VMUL2S and VMUL4S also XOR sign bits into the looked-up values before the multiply: in VMUL2S bit 1 of sign goes to the first output and bit 0 to the second; in VMUL4S every output takes the current top bit of sign, and sign is shifted left after each of the first three outputs only when the matching flag in idx bits 12-14 is set. The define-to-itself lines presumably let including code detect these overrides - confirm against the includer. NOTE(review): VMUL4S uses flag-setting lsls and conditional lslcs but lists no cc clobber - confirm this is safe for the targeted compilers. */ + +#define VMUL2 VMUL2 +static inline float *VMUL2(float *dst, const float *v, unsigned idx, + const float *scale) +{ /* Two outputs: idx bits 0-3 and 4-7 index v as 4-byte entries; the :64 hint on the store requires dst to be 8-byte aligned. */ + unsigned v0, v1; + __asm__ volatile ("ubfx %0, %4, #0, #4 \n\t" + "ubfx %1, %4, #4, #4 \n\t" + "ldr %0, [%3, %0, lsl #2] \n\t" + "ldr %1, [%3, %1, lsl #2] \n\t" + "vld1.32 {d1[]}, [%5,:32] \n\t" + "vmov d0, %0, %1 \n\t" + "vmul.f32 d0, d0, d1 \n\t" + "vst1.32 {d0}, [%2,:64]! \n\t" + : "=&r"(v0), "=&r"(v1), "+r"(dst) + : "r"(v), "r"(idx), "r"(scale) + : "d0", "d1"); + return dst; +} + +#define VMUL4 VMUL4 +static inline float *VMUL4(float *dst, const float *v, unsigned idx, + const float *scale) +{ /* Four outputs: idx bits 0-1, 2-3, 4-5 and 6-7 index v as 4-byte entries; the :128 hint on the store requires dst to be 16-byte aligned. */ + unsigned v0, v1, v2, v3; + __asm__ volatile ("ubfx %0, %6, #0, #2 \n\t" + "ubfx %1, %6, #2, #2 \n\t" + "ldr %0, [%5, %0, lsl #2] \n\t" + "ubfx %2, %6, #4, #2 \n\t" + "ldr %1, [%5, %1, lsl #2] \n\t" + "ubfx %3, %6, #6, #2 \n\t" + "ldr %2, [%5, %2, lsl #2] \n\t" + "vmov d0, %0, %1 \n\t" + "ldr %3, [%5, %3, lsl #2] \n\t" + "vld1.32 {d2[],d3[]},[%7,:32] \n\t" + "vmov d1, %2, %3 \n\t" + "vmul.f32 q0, q0, q1 \n\t" + "vst1.32 {q0}, [%4,:128]! 
\n\t" + : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst) + : "r"(v), "r"(idx), "r"(scale) + : "d0", "d1", "d2", "d3"); + return dst; +} + +#define VMUL2S VMUL2S +static inline float *VMUL2S(float *dst, const float *v, unsigned idx, + unsigned sign, const float *scale) +{ + unsigned v0, v1, v2, v3; + __asm__ volatile ("ubfx %0, %6, #0, #4 \n\t" + "ubfx %1, %6, #4, #4 \n\t" + "ldr %0, [%5, %0, lsl #2] \n\t" + "lsl %2, %8, #30 \n\t" + "ldr %1, [%5, %1, lsl #2] \n\t" + "lsl %3, %8, #31 \n\t" + "vmov d0, %0, %1 \n\t" + "bic %2, %2, #1<<30 \n\t" + "vld1.32 {d1[]}, [%7,:32] \n\t" + "vmov d2, %2, %3 \n\t" + "veor d0, d0, d2 \n\t" + "vmul.f32 d0, d0, d1 \n\t" + "vst1.32 {d0}, [%4,:64]! \n\t" + : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst) + : "r"(v), "r"(idx), "r"(scale), "r"(sign) + : "d0", "d1", "d2"); + return dst; +} + +#define VMUL4S VMUL4S +static inline float *VMUL4S(float *dst, const float *v, unsigned idx, + unsigned sign, const float *scale) +{ + unsigned v0, v1, v2, v3, nz; + __asm__ volatile ("vld1.32 {d2[],d3[]},[%9,:32] \n\t" + "ubfx %0, %8, #0, #2 \n\t" + "ubfx %1, %8, #2, #2 \n\t" + "ldr %0, [%7, %0, lsl #2] \n\t" + "ubfx %2, %8, #4, #2 \n\t" + "ldr %1, [%7, %1, lsl #2] \n\t" + "ubfx %3, %8, #6, #2 \n\t" + "ldr %2, [%7, %2, lsl #2] \n\t" + "vmov d0, %0, %1 \n\t" + "ldr %3, [%7, %3, lsl #2] \n\t" + "lsr %6, %8, #12 \n\t" + "rbit %6, %6 \n\t" + "vmov d1, %2, %3 \n\t" + "lsls %6, %6, #1 \n\t" + "and %0, %5, #1<<31 \n\t" + "lslcs %5, %5, #1 \n\t" + "lsls %6, %6, #1 \n\t" + "and %1, %5, #1<<31 \n\t" + "lslcs %5, %5, #1 \n\t" + "lsls %6, %6, #1 \n\t" + "and %2, %5, #1<<31 \n\t" + "lslcs %5, %5, #1 \n\t" + "vmov d4, %0, %1 \n\t" + "and %3, %5, #1<<31 \n\t" + "vmov d5, %2, %3 \n\t" + "veor q0, q0, q2 \n\t" + "vmul.f32 q0, q0, q1 \n\t" + "vst1.32 {q0}, [%4,:128]! 
\n\t" + : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst), + "+r"(sign), "=r"(nz) + : "r"(v), "r"(idx), "r"(scale) + : "d0", "d1", "d2", "d3", "d4", "d5"); + return dst; +} + +#endif /* HAVE_NEON && HAVE_INLINE_ASM */ + +#endif /* AVCODEC_ARM_AAC_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/asm.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/asm.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#ifdef __ELF__ /* ELF is a line prefix: empty on ELF targets, otherwise @ so the rest of the line becomes an assembler comment. */ +# define ELF +#else +# define ELF @ +#endif + + .macro require8, val=1 /* Emit EABI build attribute 24 with value val (presumably Tag_ABI_align_needed - confirm against the ARM ABI addenda); ELF targets only. */ +ELF .eabi_attribute 24, \val + .endm + + .macro preserve8, val=1 /* Emit EABI build attribute 25 with value val (presumably Tag_ABI_align_preserved - confirm); ELF targets only. */ +ELF .eabi_attribute 25, \val + .endm + + .macro function name, export=0 /* Open function name: defines a one-shot endfunc macro that emits the symbol size and .endfunc and then purges itself, optionally exports the EXTERN_ASM-prefixed label, and marks the symbol as a function. */ + .macro endfunc +ELF .size \name, . 
- \name + .endfunc + .purgem endfunc + .endm +.if \export + .global EXTERN_ASM\name +EXTERN_ASM\name: +.endif +ELF .type \name, %function + .func \name +\name: + .endm + + .macro movrel rd, val /* Load the address of val into rd: a movw/movt pair on ARMv6T2 non-PIC builds, a literal-pool ldr otherwise. */ +#if HAVE_ARMV6T2 && !CONFIG_PIC + movw \rd, #:lower16:\val + movt \rd, #:upper16:\val +#else + ldr \rd, =\val +#endif + .endm + +#if HAVE_VFP_ARGS /* VFP and NOVFP are line prefixes: the inactive one expands to @, turning the rest of that line into an assembler comment. */ + .eabi_attribute 28, 1 +# define VFP +# define NOVFP @ +#else +# define VFP @ +# define NOVFP +#endif + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) /* Prepend the EXTERN_ASM prefix to symbol s after macro expansion. */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dcadsp_init_arm.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dcadsp_init_arm.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavcodec/dcadsp.h" + +void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs, + int decifactor, float scale, float bias); /* Defined in dcadsp_neon.S. */ + +void av_cold ff_dcadsp_init_arm(DCADSPContext *s) +{ /* Install the NEON routine into s->lfe_fir when HAVE_NEON is nonzero; otherwise s is left untouched. */ + if (HAVE_NEON) + s->lfe_fir = ff_dca_lfe_fir_neon; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dcadsp_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dcadsp_neon.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +function ff_dca_lfe_fir_neon, export=1 /* FIR kernel installed as DCADSPContext.lfe_fir. Register use assumes the AAPCS mapping of the C prototype (r0 = out, r1 = in, r2 = coefs, r3 = decifactor) - TODO confirm. Each outer iteration writes one float to each of two output runs and r3 counts the iterations. */ + push {r4-r6,lr} + + add r4, r0, r3, lsl #2 @ out2 /* r4 = r0 + 4*r3: the second output run starts r3 floats after the first */ + add r5, r2, #256*4-16 @ cf1 /* r5 = last four floats of a 256-float table at r2 (table size assumed from the constant) */ + sub r1, r1, #12 /* step back three floats so each 4-float input load ends at the original r1 element */ + cmp r3, #32 + moveq r6, #256/32 + movne r6, #256/64 /* r6 = taps per output: 8 when r3 is 32, otherwise 4 */ +NOVFP vldr d0, [sp, #16] @ scale, bias /* only without VFP argument passing: fetch both floats from the stack just above the four pushed registers */ + mov lr, #-16 /* post-index step that walks r5 and r1 backwards four floats at a time */ +1: + vmov.f32 q2, #0.0 @ v0 + vmov.f32 q3, #0.0 @ v1 + mov r12, r6 /* inner-loop tap counter */ +2: + vld1.32 {q8}, [r2,:128]! 
@ cf0 + vld1.32 {q9}, [r5,:128], lr @ cf1 + vld1.32 {q1}, [r1], lr @ in + subs r12, r12, #4 /* four taps consumed per pass; the flags feed the bne below */ + vrev64.32 q10, q8 + vmla.f32 q3, q1, q9 + vmla.f32 d4, d2, d21 + vmla.f32 d5, d3, d20 + bne 2b + + add r1, r1, r6, lsl #2 /* undo the inner-loop walk so every output pair reads the same input window */ + subs r3, r3, #1 /* outer-loop counter; the NEON instructions below leave these flags for the final bne */ + vadd.f32 d4, d4, d5 + vadd.f32 d6, d6, d7 + vpadd.f32 d4, d4, d6 /* d4 = the two horizontal sums, one per output run */ + vdup.32 d5, d0[1] /* d5 = bias in both lanes */ + vmla.f32 d5, d4, d0[0] /* d5 += sum * scale */ + vst1.32 {d5[0]}, [r0,:32]! + vst1.32 {d5[1]}, [r4,:32]! + bne 1b + + pop {r4-r6,pc} +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_arm.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_arm.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,712 @@ +@ +@ ARMv4 optimized DSP utils +@ Copyright (c) 2004 AGAWA Koji +@ +@ This file is part of FFmpeg. +@ +@ FFmpeg is free software; you can redistribute it and/or +@ modify it under the terms of the GNU Lesser General Public +@ License as published by the Free Software Foundation; either +@ version 2.1 of the License, or (at your option) any later version. +@ +@ FFmpeg is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +@ Lesser General Public License for more details. 
+@ +@ You should have received a copy of the GNU Lesser General Public +@ License along with FFmpeg; if not, write to the Free Software +@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +@ + +#include "config.h" +#include "asm.S" + + preserve8 + +#if !HAVE_PLD +.macro pld reg +.endm +#endif + +#if HAVE_ARMV5TE +function ff_prefetch_arm, export=1 + subs r2, r2, #1 + pld [r0] + add r0, r0, r1 + bne ff_prefetch_arm + bx lr +endfunc +#endif + +.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 + mov \Rd0, \Rn0, lsr #(\shift * 8) + mov \Rd1, \Rn1, lsr #(\shift * 8) + mov \Rd2, \Rn2, lsr #(\shift * 8) + mov \Rd3, \Rn3, lsr #(\shift * 8) + orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) + orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) + orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) + orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) +.endm +.macro ALIGN_DWORD shift, R0, R1, R2 + mov \R0, \R0, lsr #(\shift * 8) + orr \R0, \R0, \R1, lsl #(32 - \shift * 8) + mov \R1, \R1, lsr #(\shift * 8) + orr \R1, \R1, \R2, lsl #(32 - \shift * 8) +.endm +.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 + mov \Rdst0, \Rsrc0, lsr #(\shift * 8) + mov \Rdst1, \Rsrc1, lsr #(\shift * 8) + orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) + orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) +.endm + +.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask + @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) + @ Rmask = 0xFEFEFEFE + @ Rn = destroy + eor \Rd0, \Rn0, \Rm0 + eor \Rd1, \Rn1, \Rm1 + orr \Rn0, \Rn0, \Rm0 + orr \Rn1, \Rn1, \Rm1 + and \Rd0, \Rd0, \Rmask + and \Rd1, \Rd1, \Rmask + sub \Rd0, \Rn0, \Rd0, lsr #1 + sub \Rd1, \Rn1, \Rd1, lsr #1 +.endm + +.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask + @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) + @ Rmask = 0xFEFEFEFE + @ Rn = destroy + eor \Rd0, \Rn0, \Rm0 + eor \Rd1, \Rn1, \Rm1 + and \Rn0, \Rn0, \Rm0 + and \Rn1, \Rn1, \Rm1 + and \Rd0, \Rd0, \Rmask + and \Rd1, \Rd1, 
\Rmask + add \Rd0, \Rn0, \Rd0, lsr #1 + add \Rd1, \Rn1, \Rd1, lsr #1 +.endm + +.macro JMP_ALIGN tmp, reg + ands \tmp, \reg, #3 + bic \reg, \reg, #3 + beq 1f + subs \tmp, \tmp, #1 + beq 2f + subs \tmp, \tmp, #1 + beq 3f + b 4f +.endm + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels16_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11, lr} + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r7} + add r1, r1, r2 + stm r0, {r4-r7} + pld [r1] + subs r3, r3, #1 + add r0, r0, r2 + bne 1b + pop {r4-r11, pc} + .align 5 +2: + ldm r1, {r4-r8} + add r1, r1, r2 + ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stm r0, {r9-r12} + add r0, r0, r2 + bne 2b + pop {r4-r11, pc} + .align 5 +3: + ldm r1, {r4-r8} + add r1, r1, r2 + ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stm r0, {r9-r12} + add r0, r0, r2 + bne 3b + pop {r4-r11, pc} + .align 5 +4: + ldm r1, {r4-r8} + add r1, r1, r2 + ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 + pld [r1] + subs r3, r3, #1 + stm r0, {r9-r12} + add r0, r0, r2 + bne 4b + pop {r4-r11,pc} +endfunc + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels8_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r5,lr} + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5} + add r1, r1, r2 + subs r3, r3, #1 + pld [r1] + stm r0, {r4-r5} + add r0, r0, r2 + bne 1b + pop {r4-r5,pc} + .align 5 +2: + ldm r1, {r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 1, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r5,pc} + .align 5 +3: + ldm r1, {r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 2, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, 
r2 + bne 3b + pop {r4-r5,pc} + .align 5 +4: + ldm r1, {r4-r5, r12} + add r1, r1, r2 + ALIGN_DWORD 3, r4, r5, r12 + pld [r1] + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 4b + pop {r4-r5,pc} +endfunc + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels8_x2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r10,lr} + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 1b + pop {r4-r10,pc} + .align 5 +2: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r10,pc} + .align 5 +3: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 3b + pop {r4-r10,pc} + .align 5 +4: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 4b + pop {r4-r10,pc} +endfunc + + .align 5 +function ff_put_no_rnd_pixels8_x2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r10,lr} + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 1b + pop {r4-r10,pc} + .align 5 +2: + ldm r1, 
{r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 2b + pop {r4-r10,pc} + .align 5 +3: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 + ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 + subs r3, r3, #1 + stm r0, {r4-r5} + add r0, r0, r2 + bne 3b + pop {r4-r10,pc} + .align 5 +4: + ldm r1, {r4-r5, r10} + add r1, r1, r2 + ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 4b + pop {r4-r10,pc} +endfunc + + +@ ---------------------------------------------------------------- + .align 5 +function ff_put_pixels8_y2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + mov r3, r3, lsr #1 + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5} + add r1, r1, r2 +6: ldm r1, {r6-r7} + add r1, r1, r2 + pld [r1] + RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldm r1, {r4-r5} + add r1, r1, r2 + stm r0, {r8-r9} + add r0, r0, r2 + pld [r1] + RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +2: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +3: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + 
ALIGN_DWORD 2, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +4: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r7, r8, r9 + RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 + subs r3, r3, #1 + RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} +endfunc + + .align 5 +function ff_put_no_rnd_pixels8_y2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + mov r3, r3, lsr #1 + ldr r12, =0xfefefefe + JMP_ALIGN r5, r1 +1: + ldm r1, {r4-r5} + add r1, r1, r2 +6: ldm r1, {r6-r7} + add r1, r1, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 + ldm r1, {r4-r5} + add r1, r1, r2 + stm r0, {r8-r9} + add r0, r0, r2 + pld [r1] + NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 + subs r3, r3, #1 + stm r0, {r8-r9} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +2: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 1, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +3: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r7, r8, r9 + NO_RND_AVG32 r10, 
r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 2, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} + .align 5 +4: + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 +6: ldm r1, {r7-r9} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r7, r8, r9 + NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 + stm r0, {r10-r11} + add r0, r0, r2 + ldm r1, {r4-r6} + add r1, r1, r2 + pld [r1] + ALIGN_DWORD 3, r4, r5, r6 + subs r3, r3, #1 + NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 + stm r0, {r10-r11} + add r0, r0, r2 + bne 6b + pop {r4-r11,pc} +endfunc + + .ltorg + +@ ---------------------------------------------------------------- +.macro RND_XY2_IT align, rnd + @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) + @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) +.if \align == 0 + ldm r1, {r6-r8} +.elseif \align == 3 + ldm r1, {r5-r7} +.else + ldm r1, {r8-r10} +.endif + add r1, r1, r2 + pld [r1] +.if \align == 0 + ALIGN_DWORD_D 1, r4, r5, r6, r7, r8 +.elseif \align == 1 + ALIGN_DWORD_D 1, r4, r5, r8, r9, r10 + ALIGN_DWORD_D 2, r6, r7, r8, r9, r10 +.elseif \align == 2 + ALIGN_DWORD_D 2, r4, r5, r8, r9, r10 + ALIGN_DWORD_D 3, r6, r7, r8, r9, r10 +.elseif \align == 3 + ALIGN_DWORD_D 3, r4, r5, r5, r6, r7 +.endif + ldr r14, =0x03030303 + tst r3, #1 + and r8, r4, r14 + and r9, r5, r14 + and r10, r6, r14 + and r11, r7, r14 + andeq r14, r14, r14, \rnd #1 + add r8, r8, r10 + add r9, r9, r11 + ldr r12, =0xfcfcfcfc >> 2 + addeq r8, r8, r14 + addeq r9, r9, r14 + and r4, r12, r4, lsr #2 + and r5, r12, r5, lsr #2 + and r6, r12, r6, lsr #2 + and r7, r12, r7, lsr #2 + add r10, r4, r6 + add r11, r5, r7 + subs r3, r3, #1 +.endm + +.macro RND_XY2_EXPAND align, rnd + RND_XY2_IT \align, \rnd +6: push {r8-r11} + RND_XY2_IT \align, \rnd + pop {r4-r7} + add r4, r4, r8 + add r5, r5, r9 + ldr r14, =0x0f0f0f0f + 
add r6, r6, r10 + add r7, r7, r11 + and r4, r14, r4, lsr #2 + and r5, r14, r5, lsr #2 + add r4, r4, r6 + add r5, r5, r7 + stm r0, {r4-r5} + add r0, r0, r2 + bge 6b + pop {r4-r11,pc} +.endm + + .align 5 +function ff_put_pixels8_xy2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} @ R14 is also called LR + JMP_ALIGN r5, r1 +1: RND_XY2_EXPAND 0, lsl + .align 5 +2: RND_XY2_EXPAND 1, lsl + .align 5 +3: RND_XY2_EXPAND 2, lsl + .align 5 +4: RND_XY2_EXPAND 3, lsl +endfunc + + .align 5 +function ff_put_no_rnd_pixels8_xy2_arm, export=1 + @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) + @ block = word aligned, pixles = unaligned + pld [r1] + push {r4-r11,lr} + JMP_ALIGN r5, r1 +1: RND_XY2_EXPAND 0, lsr + .align 5 +2: RND_XY2_EXPAND 1, lsr + .align 5 +3: RND_XY2_EXPAND 2, lsr + .align 5 +4: RND_XY2_EXPAND 3, lsr +endfunc + + .align 5 +@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride) +function ff_add_pixels_clamped_arm, export=1 + push {r4-r10} + mov r10, #8 +1: + ldr r4, [r1] /* load dest */ + /* block[0] and block[1]*/ + ldrsh r5, [r0] + ldrsh r7, [r0, #2] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r5, r6 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #4] /* moved form [A] */ + orr r9, r9, r8, lsl #8 + /* block[2] and block[3] */ + /* [A] */ + ldrsh r7, [r0, #6] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + ldr r4, [r1, #4] /* moved form [B] */ + orr r9, r9, r8, lsl #24 + /* store dest */ + ldrsh r5, [r0, #8] /* moved form [C] */ + str r9, [r1] + + /* load dest */ + /* [B] */ + /* block[4] and 
block[5] */ + /* [C] */ + ldrsh r7, [r0, #10] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r5, r6 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #12] /* moved from [D] */ + orr r9, r9, r8, lsl #8 + /* block[6] and block[7] */ + /* [D] */ + ldrsh r7, [r0, #14] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + movne r6, r5, lsr #24 + tst r8, #0x100 + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + add r0, r0, #16 /* moved from [E] */ + orr r9, r9, r8, lsl #24 + subs r10, r10, #1 /* moved from [F] */ + /* store dest */ + str r9, [r1, #4] + + /* [E] */ + /* [F] */ + add r1, r1, r2 + bne 1b + + pop {r4-r10} + bx lr +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_arm.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_arm.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_DSPUTIL_H +#define AVCODEC_ARM_DSPUTIL_H + +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" + +void ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx); +void ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx); +void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx); +void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx); +void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_armv6.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_armv6.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,623 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + preserve8 + + .text + +.macro call_2x_pixels type, subp +function ff_\type\()_pixels16\subp\()_armv6, export=1 + push {r0-r3, lr} + bl ff_\type\()_pixels8\subp\()_armv6 + pop {r0-r3, lr} + add r0, r0, #8 + add r1, r1, #8 + b ff_\type\()_pixels8\subp\()_armv6 +endfunc +.endm + +call_2x_pixels avg +call_2x_pixels put, _x2 +call_2x_pixels put, _y2 +call_2x_pixels put, _x2_no_rnd +call_2x_pixels put, _y2_no_rnd + +function ff_put_pixels16_armv6, export=1 + push {r4-r11} +1: + ldr r5, [r1, #4] + ldr r6, [r1, #8] + ldr r7, [r1, #12] + ldr r4, [r1], r2 + strd r6, r7, [r0, #8] + ldr r9, [r1, #4] + strd r4, r5, [r0], r2 + ldr r10, [r1, #8] + ldr r11, [r1, #12] + ldr r8, [r1], r2 + strd r10, r11, [r0, #8] + subs r3, r3, #2 + strd r8, r9, [r0], r2 + bne 1b + + pop {r4-r11} + bx lr +endfunc + +function ff_put_pixels8_armv6, export=1 + push {r4-r7} +1: + ldr r5, [r1, #4] + ldr r4, [r1], r2 + ldr r7, [r1, #4] + strd r4, r5, [r0], r2 + ldr r6, [r1], r2 + subs r3, r3, #2 + strd r6, r7, [r0], r2 + bne 1b + + pop {r4-r7} + bx lr +endfunc + +function ff_put_pixels8_x2_armv6, export=1 + push {r4-r11, lr} + mov r12, #1 + orr r12, r12, r12, lsl #8 + orr r12, r12, r12, lsl #16 +1: + ldr r4, [r1] + subs r3, r3, #2 + ldr r5, [r1, #4] + ldr r7, [r1, #5] + lsr r6, r4, #8 + ldr r8, [r1, r2]! 
+ orr r6, r6, r5, lsl #24 + ldr r9, [r1, #4] + ldr r11, [r1, #5] + lsr r10, r8, #8 + add r1, r1, r2 + orr r10, r10, r9, lsl #24 + eor r14, r4, r6 + uhadd8 r4, r4, r6 + eor r6, r5, r7 + uhadd8 r5, r5, r7 + and r14, r14, r12 + and r6, r6, r12 + uadd8 r4, r4, r14 + eor r14, r8, r10 + uadd8 r5, r5, r6 + eor r6, r9, r11 + uhadd8 r8, r8, r10 + and r14, r14, r12 + uhadd8 r9, r9, r11 + and r6, r6, r12 + uadd8 r8, r8, r14 + strd r4, r5, [r0], r2 + uadd8 r9, r9, r6 + strd r8, r9, [r0], r2 + bne 1b + + pop {r4-r11, pc} +endfunc + +function ff_put_pixels8_y2_armv6, export=1 + push {r4-r11} + mov r12, #1 + orr r12, r12, r12, lsl #8 + orr r12, r12, r12, lsl #16 + ldr r4, [r1] + ldr r5, [r1, #4] + ldr r6, [r1, r2]! + ldr r7, [r1, #4] +1: + subs r3, r3, #2 + uhadd8 r8, r4, r6 + eor r10, r4, r6 + uhadd8 r9, r5, r7 + eor r11, r5, r7 + and r10, r10, r12 + ldr r4, [r1, r2]! + uadd8 r8, r8, r10 + and r11, r11, r12 + uadd8 r9, r9, r11 + ldr r5, [r1, #4] + uhadd8 r10, r4, r6 + eor r6, r4, r6 + uhadd8 r11, r5, r7 + and r6, r6, r12 + eor r7, r5, r7 + uadd8 r10, r10, r6 + and r7, r7, r12 + ldr r6, [r1, r2]! + uadd8 r11, r11, r7 + strd r8, r9, [r0], r2 + ldr r7, [r1, #4] + strd r10, r11, [r0], r2 + bne 1b + + pop {r4-r11} + bx lr +endfunc + +function ff_put_pixels8_x2_no_rnd_armv6, export=1 + push {r4-r9, lr} +1: + subs r3, r3, #2 + ldr r4, [r1] + ldr r5, [r1, #4] + ldr r7, [r1, #5] + ldr r8, [r1, r2]! + ldr r9, [r1, #4] + ldr r14, [r1, #5] + add r1, r1, r2 + lsr r6, r4, #8 + orr r6, r6, r5, lsl #24 + lsr r12, r8, #8 + orr r12, r12, r9, lsl #24 + uhadd8 r4, r4, r6 + uhadd8 r5, r5, r7 + uhadd8 r8, r8, r12 + uhadd8 r9, r9, r14 + stm r0, {r4,r5} + add r0, r0, r2 + stm r0, {r8,r9} + add r0, r0, r2 + bne 1b + + pop {r4-r9, pc} +endfunc + +function ff_put_pixels8_y2_no_rnd_armv6, export=1 + push {r4-r9, lr} + ldr r4, [r1] + ldr r5, [r1, #4] + ldr r6, [r1, r2]! + ldr r7, [r1, #4] +1: + subs r3, r3, #2 + uhadd8 r8, r4, r6 + ldr r4, [r1, r2]! 
+ uhadd8 r9, r5, r7 + ldr r5, [r1, #4] + uhadd8 r12, r4, r6 + ldr r6, [r1, r2]! + uhadd8 r14, r5, r7 + ldr r7, [r1, #4] + stm r0, {r8,r9} + add r0, r0, r2 + stm r0, {r12,r14} + add r0, r0, r2 + bne 1b + + pop {r4-r9, pc} +endfunc + +function ff_avg_pixels8_armv6, export=1 + pld [r1, r2] + push {r4-r10, lr} + mov lr, #1 + orr lr, lr, lr, lsl #8 + orr lr, lr, lr, lsl #16 + ldrd r4, r5, [r0] + ldr r10, [r1, #4] + ldr r9, [r1], r2 + subs r3, r3, #2 +1: + pld [r1, r2] + eor r8, r4, r9 + uhadd8 r4, r4, r9 + eor r12, r5, r10 + ldrd r6, r7, [r0, r2] + uhadd8 r5, r5, r10 + and r8, r8, lr + ldr r10, [r1, #4] + and r12, r12, lr + uadd8 r4, r4, r8 + ldr r9, [r1], r2 + eor r8, r6, r9 + uadd8 r5, r5, r12 + pld [r1, r2, lsl #1] + eor r12, r7, r10 + uhadd8 r6, r6, r9 + strd r4, r5, [r0], r2 + uhadd8 r7, r7, r10 + beq 2f + and r8, r8, lr + ldrd r4, r5, [r0, r2] + uadd8 r6, r6, r8 + ldr r10, [r1, #4] + and r12, r12, lr + subs r3, r3, #2 + uadd8 r7, r7, r12 + ldr r9, [r1], r2 + strd r6, r7, [r0], r2 + b 1b +2: + and r8, r8, lr + and r12, r12, lr + uadd8 r6, r6, r8 + uadd8 r7, r7, r12 + strd r6, r7, [r0], r2 + + pop {r4-r10, pc} +endfunc + +function ff_add_pixels_clamped_armv6, export=1 + push {r4-r8,lr} + mov r3, #8 +1: + ldm r0!, {r4,r5,r12,lr} + ldrd r6, r7, [r1] + pkhbt r8, r4, r5, lsl #16 + pkhtb r5, r5, r4, asr #16 + pkhbt r4, r12, lr, lsl #16 + pkhtb lr, lr, r12, asr #16 + pld [r1, r2] + uxtab16 r8, r8, r6 + uxtab16 r5, r5, r6, ror #8 + uxtab16 r4, r4, r7 + uxtab16 lr, lr, r7, ror #8 + usat16 r8, #8, r8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 lr, #8, lr + orr r6, r8, r5, lsl #8 + orr r7, r4, lr, lsl #8 + subs r3, r3, #1 + strd r6, r7, [r1], r2 + bgt 1b + pop {r4-r8,pc} +endfunc + +function ff_get_pixels_armv6, export=1 + pld [r1, r2] + push {r4-r8, lr} + mov lr, #8 +1: + ldrd r4, r5, [r1], r2 + subs lr, lr, #1 + uxtb16 r6, r4 + uxtb16 r4, r4, ror #8 + uxtb16 r12, r5 + uxtb16 r8, r5, ror #8 + pld [r1, r2] + pkhbt r5, r6, r4, lsl #16 + pkhtb r6, r4, r6, asr #16 + pkhbt 
r7, r12, r8, lsl #16 + pkhtb r12, r8, r12, asr #16 + stm r0!, {r5,r6,r7,r12} + bgt 1b + + pop {r4-r8, pc} +endfunc + +function ff_diff_pixels_armv6, export=1 + pld [r1, r3] + pld [r2, r3] + push {r4-r9, lr} + mov lr, #8 +1: + ldrd r4, r5, [r1], r3 + ldrd r6, r7, [r2], r3 + uxtb16 r8, r4 + uxtb16 r4, r4, ror #8 + uxtb16 r9, r6 + uxtb16 r6, r6, ror #8 + pld [r1, r3] + ssub16 r9, r8, r9 + ssub16 r6, r4, r6 + uxtb16 r8, r5 + uxtb16 r5, r5, ror #8 + pld [r2, r3] + pkhbt r4, r9, r6, lsl #16 + pkhtb r6, r6, r9, asr #16 + uxtb16 r9, r7 + uxtb16 r7, r7, ror #8 + ssub16 r9, r8, r9 + ssub16 r5, r5, r7 + subs lr, lr, #1 + pkhbt r8, r9, r5, lsl #16 + pkhtb r9, r5, r9, asr #16 + stm r0!, {r4,r6,r8,r9} + bgt 1b + + pop {r4-r9, pc} +endfunc + +function ff_pix_abs16_armv6, export=1 + ldr r0, [sp] + push {r4-r9, lr} + mov r12, #0 + mov lr, #0 + ldm r1, {r4-r7} + ldr r8, [r2] +1: + ldr r9, [r2, #4] + pld [r1, r3] + usada8 r12, r4, r8, r12 + ldr r8, [r2, #8] + pld [r2, r3] + usada8 lr, r5, r9, lr + ldr r9, [r2, #12] + usada8 r12, r6, r8, r12 + subs r0, r0, #1 + usada8 lr, r7, r9, lr + beq 2f + add r1, r1, r3 + ldm r1, {r4-r7} + add r2, r2, r3 + ldr r8, [r2] + b 1b +2: + add r0, r12, lr + pop {r4-r9, pc} +endfunc + +function ff_pix_abs16_x2_armv6, export=1 + ldr r12, [sp] + push {r4-r11, lr} + mov r0, #0 + mov lr, #1 + orr lr, lr, lr, lsl #8 + orr lr, lr, lr, lsl #16 +1: + ldr r8, [r2] + ldr r9, [r2, #4] + lsr r10, r8, #8 + ldr r4, [r1] + lsr r6, r9, #8 + orr r10, r10, r9, lsl #24 + ldr r5, [r2, #8] + eor r11, r8, r10 + uhadd8 r7, r8, r10 + orr r6, r6, r5, lsl #24 + and r11, r11, lr + uadd8 r7, r7, r11 + ldr r8, [r1, #4] + usada8 r0, r4, r7, r0 + eor r7, r9, r6 + lsr r10, r5, #8 + and r7, r7, lr + uhadd8 r4, r9, r6 + ldr r6, [r2, #12] + uadd8 r4, r4, r7 + pld [r1, r3] + orr r10, r10, r6, lsl #24 + usada8 r0, r8, r4, r0 + ldr r4, [r1, #8] + eor r11, r5, r10 + ldrb r7, [r2, #16] + and r11, r11, lr + uhadd8 r8, r5, r10 + ldr r5, [r1, #12] + uadd8 r8, r8, r11 + pld [r2, r3] + lsr r10, r6, 
#8 + usada8 r0, r4, r8, r0 + orr r10, r10, r7, lsl #24 + subs r12, r12, #1 + eor r11, r6, r10 + add r1, r1, r3 + uhadd8 r9, r6, r10 + and r11, r11, lr + uadd8 r9, r9, r11 + add r2, r2, r3 + usada8 r0, r5, r9, r0 + bgt 1b + + pop {r4-r11, pc} +endfunc + +.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3 + ldr \n0, [r2] + eor \n1, \p0, \n0 + uhadd8 \p0, \p0, \n0 + and \n1, \n1, lr + ldr \n2, [r1] + uadd8 \p0, \p0, \n1 + ldr \n1, [r2, #4] + usada8 r0, \p0, \n2, r0 + pld [r1, r3] + eor \n3, \p1, \n1 + uhadd8 \p1, \p1, \n1 + and \n3, \n3, lr + ldr \p0, [r1, #4] + uadd8 \p1, \p1, \n3 + ldr \n2, [r2, #8] + usada8 r0, \p1, \p0, r0 + pld [r2, r3] + eor \p0, \p2, \n2 + uhadd8 \p2, \p2, \n2 + and \p0, \p0, lr + ldr \p1, [r1, #8] + uadd8 \p2, \p2, \p0 + ldr \n3, [r2, #12] + usada8 r0, \p2, \p1, r0 + eor \p1, \p3, \n3 + uhadd8 \p3, \p3, \n3 + and \p1, \p1, lr + ldr \p0, [r1, #12] + uadd8 \p3, \p3, \p1 + add r1, r1, r3 + usada8 r0, \p3, \p0, r0 + add r2, r2, r3 +.endm + +function ff_pix_abs16_y2_armv6, export=1 + pld [r1] + pld [r2] + ldr r12, [sp] + push {r4-r11, lr} + mov r0, #0 + mov lr, #1 + orr lr, lr, lr, lsl #8 + orr lr, lr, lr, lsl #16 + ldr r4, [r2] + ldr r5, [r2, #4] + ldr r6, [r2, #8] + ldr r7, [r2, #12] + add r2, r2, r3 +1: + usad_y2 r4, r5, r6, r7, r8, r9, r10, r11 + subs r12, r12, #2 + usad_y2 r8, r9, r10, r11, r4, r5, r6, r7 + bgt 1b + + pop {r4-r11, pc} +endfunc + +function ff_pix_abs8_armv6, export=1 + pld [r2, r3] + ldr r12, [sp] + push {r4-r9, lr} + mov r0, #0 + mov lr, #0 + ldrd r4, r5, [r1], r3 +1: + subs r12, r12, #2 + ldr r7, [r2, #4] + ldr r6, [r2], r3 + ldrd r8, r9, [r1], r3 + usada8 r0, r4, r6, r0 + pld [r2, r3] + usada8 lr, r5, r7, lr + ldr r7, [r2, #4] + ldr r6, [r2], r3 + beq 2f + ldrd r4, r5, [r1], r3 + usada8 r0, r8, r6, r0 + pld [r2, r3] + usada8 lr, r9, r7, lr + b 1b +2: + usada8 r0, r8, r6, r0 + usada8 lr, r9, r7, lr + add r0, r0, lr + pop {r4-r9, pc} +endfunc + +function ff_sse16_armv6, export=1 + ldr r12, [sp] + push {r4-r9, lr} + mov r0, #0 +1: + 
ldrd r4, r5, [r1] + ldr r8, [r2] + uxtb16 lr, r4 + uxtb16 r4, r4, ror #8 + uxtb16 r9, r8 + uxtb16 r8, r8, ror #8 + ldr r7, [r2, #4] + usub16 lr, lr, r9 + usub16 r4, r4, r8 + smlad r0, lr, lr, r0 + uxtb16 r6, r5 + uxtb16 lr, r5, ror #8 + uxtb16 r8, r7 + uxtb16 r9, r7, ror #8 + smlad r0, r4, r4, r0 + ldrd r4, r5, [r1, #8] + usub16 r6, r6, r8 + usub16 r8, lr, r9 + ldr r7, [r2, #8] + smlad r0, r6, r6, r0 + uxtb16 lr, r4 + uxtb16 r4, r4, ror #8 + uxtb16 r9, r7 + uxtb16 r7, r7, ror #8 + smlad r0, r8, r8, r0 + ldr r8, [r2, #12] + usub16 lr, lr, r9 + usub16 r4, r4, r7 + smlad r0, lr, lr, r0 + uxtb16 r6, r5 + uxtb16 r5, r5, ror #8 + uxtb16 r9, r8 + uxtb16 r8, r8, ror #8 + smlad r0, r4, r4, r0 + usub16 r6, r6, r9 + usub16 r5, r5, r8 + smlad r0, r6, r6, r0 + add r1, r1, r3 + add r2, r2, r3 + subs r12, r12, #1 + smlad r0, r5, r5, r0 + bgt 1b + + pop {r4-r9, pc} +endfunc + +function ff_pix_norm1_armv6, export=1 + push {r4-r6, lr} + mov r12, #16 + mov lr, #0 +1: + ldm r0, {r2-r5} + uxtb16 r6, r2 + uxtb16 r2, r2, ror #8 + smlad lr, r6, r6, lr + uxtb16 r6, r3 + smlad lr, r2, r2, lr + uxtb16 r3, r3, ror #8 + smlad lr, r6, r6, lr + uxtb16 r6, r4 + smlad lr, r3, r3, lr + uxtb16 r4, r4, ror #8 + smlad lr, r6, r6, lr + uxtb16 r6, r5 + smlad lr, r4, r4, lr + uxtb16 r5, r5, ror #8 + smlad lr, r6, r6, lr + subs r12, r12, #1 + add r0, r0, r1 + smlad lr, r5, r5, lr + bgt 1b + + mov r0, lr + pop {r4-r6, pc} +endfunc + +function ff_pix_sum_armv6, export=1 + push {r4-r7, lr} + mov r12, #16 + mov r2, #0 + mov r3, #0 + mov lr, #0 + ldr r4, [r0] +1: + subs r12, r12, #1 + ldr r5, [r0, #4] + usada8 r2, r4, lr, r2 + ldr r6, [r0, #8] + usada8 r3, r5, lr, r3 + ldr r7, [r0, #12] + usada8 r2, r6, lr, r2 + beq 2f + ldr r4, [r0, r1]! 
+ usada8 r3, r7, lr, r3 + bgt 1b +2: + usada8 r3, r7, lr, r3 + add r0, r2, r3 + pop {r4-r7, pc} +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_init_arm.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_init_arm.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,112 @@ +/* + * ARM optimized DSP utils + * Copyright (c) 2001 Lionel Ulmer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/dsputil.h" +#include "dsputil_arm.h" + +void ff_j_rev_dct_arm(DCTELEM *data); +void ff_simple_idct_arm(DCTELEM *data); + +/* XXX: local hack */ +static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); +static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); + +void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); + +void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void 
ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); +void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); + +void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); + +CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8) +CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8) +CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8) +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8) +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8) +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8) + +void ff_add_pixels_clamped_arm(const DCTELEM *block, uint8_t *dest, + int line_size); + +/* XXX: those functions should be suppressed ASAP when all IDCTs are + converted */ +static void j_rev_dct_arm_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_j_rev_dct_arm (block); + ff_put_pixels_clamped(block, dest, line_size); +} +static void j_rev_dct_arm_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_j_rev_dct_arm (block); + ff_add_pixels_clamped(block, dest, line_size); +} +static void simple_idct_arm_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_simple_idct_arm (block); + ff_put_pixels_clamped(block, dest, line_size); +} +static void simple_idct_arm_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_simple_idct_arm (block); + ff_add_pixels_clamped(block, dest, line_size); +} + +int mm_support(void) +{ + return HAVE_IWMMXT * FF_MM_IWMMXT; +} + +void dsputil_init_arm(DSPContext* c) +{ + ff_put_pixels_clamped = c->put_pixels_clamped; + ff_add_pixels_clamped = c->add_pixels_clamped; + + c->idct_put = simple_idct_arm_put; + c->idct_add = simple_idct_arm_add; + c->idct = ff_simple_idct_arm; + c->idct_permutation_type = FF_NO_IDCT_PERM; + + c->add_pixels_clamped = ff_add_pixels_clamped_arm; + + c->put_pixels_tab[0][0] = 
ff_put_pixels16_arm; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm; + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm; + c->put_pixels_tab[1][0] = ff_put_pixels8_arm; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm; + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm; + + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm; + c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm; + c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm; + c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm; + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm; + c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm; + + if (HAVE_NEON) ff_dsputil_init_neon(c); +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_init_armv5te.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_init_armv5te.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/dsputil.h" +#include "dsputil_arm.h" + +void ff_simple_idct_armv5te(DCTELEM *data); +void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data); +void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data); + +void ff_prefetch_arm(void *mem, int stride, int h); + +void av_cold ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx) +{ + if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) { + c->idct_put = ff_simple_idct_put_armv5te; + c->idct_add = ff_simple_idct_add_armv5te; + c->idct = ff_simple_idct_armv5te; + c->idct_permutation_type = FF_NO_IDCT_PERM; + } + + c->prefetch = ff_prefetch_arm; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_init_armv6.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_init_armv6.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" +#include "dsputil_arm.h" + +void ff_simple_idct_armv6(DCTELEM *data); +void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); +void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); + +void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, int, int); + +void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); + +void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, int, int); + +void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, int, int); + +void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); + +void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, int, int); + +void ff_add_pixels_clamped_armv6(const DCTELEM *block, + uint8_t *restrict pixels, + int line_size); + +void ff_get_pixels_armv6(DCTELEM *block, const uint8_t *pixels, int stride); +void ff_diff_pixels_armv6(DCTELEM *block, const uint8_t *s1, + const uint8_t *s2, int stride); + +int ff_pix_abs16_armv6(void *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); +int ff_pix_abs16_x2_armv6(void *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); +int ff_pix_abs16_y2_armv6(void *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); + +int ff_pix_abs8_armv6(void *s, uint8_t *blk1,
uint8_t *blk2, + int line_size, int h); + +int ff_sse16_armv6(void *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); + +int ff_pix_norm1_armv6(uint8_t *pix, int line_size); +int ff_pix_sum_armv6(uint8_t *pix, int line_size); + +void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx) +{ + if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEARMV6)) { + c->idct_put = ff_simple_idct_put_armv6; + c->idct_add = ff_simple_idct_add_armv6; + c->idct = ff_simple_idct_armv6; + c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; + } + + c->put_pixels_tab[0][0] = ff_put_pixels16_armv6; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6; +/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */ + c->put_pixels_tab[1][0] = ff_put_pixels8_armv6; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6; +/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */ + + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6; + c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6; + c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6; +/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */ + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6; + c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6; + c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6; +/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */ + + c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6; + c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6; + + c->add_pixels_clamped = ff_add_pixels_clamped_armv6; + c->get_pixels = ff_get_pixels_armv6; + c->diff_pixels = ff_diff_pixels_armv6; + + c->pix_abs[0][0] = ff_pix_abs16_armv6; + c->pix_abs[0][1] = ff_pix_abs16_x2_armv6; + c->pix_abs[0][2] = ff_pix_abs16_y2_armv6; + + c->pix_abs[1][0] = ff_pix_abs8_armv6; + + c->sad[0] = 
ff_pix_abs16_armv6; + c->sad[1] = ff_pix_abs8_armv6; + + c->sse[0] = ff_sse16_armv6; + + c->pix_norm1 = ff_pix_norm1_armv6; + c->pix_sum = ff_pix_sum_armv6; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_init_neon.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_init_neon.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,308 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" +#include "dsputil_arm.h" + +void ff_simple_idct_neon(DCTELEM *data); +void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); +void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); + +void ff_vp3_idct_neon(DCTELEM *data); +void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); +void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); +void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data); + +void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int,
int); +void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); +void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); + +void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int); +void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, int, int); + +void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); +void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); +void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); + +void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int); +void 
ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int); + +void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int); +void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int); + +void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int); 
+void ff_avg_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int); + +void ff_avg_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int); +void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int); + +void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); + +void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); + +void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); +void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); + +void ff_vector_fmul_neon(float *dst, const float *src, int len); +void ff_vector_fmul_window_neon(float *dst, const float *src0, + const float *src1, const float *win, + float add_bias, int len); +void 
ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, + int len); +void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src, + const float **vp, float mul, int len); +void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src, + const float **vp, float mul, int len); +void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul, + int len); +void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, + int len); +void ff_butterflies_float_neon(float *v1, float *v2, int len); +float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); +void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, + float mul, int len); +void ff_vector_fmul_reverse_neon(float *dst, const float *src0, + const float *src1, int len); +void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, + const float *src2, int len); + +void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, + int len); +void ff_float_to_int16_neon(int16_t *, const float *, long); +void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); + +void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); + +int32_t ff_scalarproduct_int16_neon(int16_t *v1, int16_t *v2, int len, + int shift); +int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, int16_t *v2, + int16_t *v3, int len, int mul); + +void ff_dsputil_init_neon(DSPContext *c) +{ + + { + c->idct_put = ff_simple_idct_put_neon; + c->idct_add = ff_simple_idct_add_neon; + c->idct = ff_simple_idct_neon; + c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; + + } + + c->put_pixels_tab[0][0] = ff_put_pixels16_neon; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; + c->put_pixels_tab[1][0] = ff_put_pixels8_neon; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; + c->put_pixels_tab[1][2] = 
ff_put_pixels8_y2_neon; + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; + + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; + c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; + c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; + c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; + c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; + c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; + + c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; + c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon; + + c->add_pixels_clamped = ff_add_pixels_clamped_neon; + c->put_pixels_clamped = ff_put_pixels_clamped_neon; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; + + + c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; + c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; + c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon; + + c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; + c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; + c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon; + + c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; + c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon; + c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon; + c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon; + c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon; + c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon; + c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon; + c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon; + c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon; + c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon; + 
c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon; + c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon; + c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon; + c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon; + c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon; + c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon; + + c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; + c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; + c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; + c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; + c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; + c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; + c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon; + c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon; + c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon; + c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon; + c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon; + c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon; + c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon; + c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon; + c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon; + c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon; + + c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; + c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon; + c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon; + c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon; + c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon; + c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon; + c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon; + 
c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon; + c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon; + c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon; + c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon; + c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon; + c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon; + c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon; + c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon; + c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon; + + c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon; + c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon; + c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon; + c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon; + c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon; + c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon; + c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon; + c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon; + c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon; + c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon; + c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon; + c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon; + c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon; + c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon; + c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon; + c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon; + + c->vector_fmul = ff_vector_fmul_neon; + c->vector_fmul_window = ff_vector_fmul_window_neon; + c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; + c->butterflies_float = ff_butterflies_float_neon; + c->scalarproduct_float = ff_scalarproduct_float_neon; + c->int32_to_float_fmul_scalar 
= ff_int32_to_float_fmul_scalar_neon; + c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; + c->vector_fmul_add = ff_vector_fmul_add_neon; + c->vector_clipf = ff_vector_clipf_neon; + + c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon; + c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon; + + c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; + c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; + + + c->float_to_int16 = ff_float_to_int16_neon; + c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; + + c->scalarproduct_int16 = ff_scalarproduct_int16_neon; + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_init_vfp.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_init_vfp.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2008 Siarhei Siamashka + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/dsputil.h" +#include "dsputil_arm.h" + +void ff_vector_fmul_vfp(float *dst, const float *src, int len); +void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, + const float *src1, int len); +void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); + +void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) +{ + c->vector_fmul = ff_vector_fmul_vfp; + c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; +#if HAVE_ARMV6 + c->float_to_int16 = ff_float_to_int16_vfp; +#endif +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_iwmmxt.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_iwmmxt.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,205 @@ +/* + * iWMMXt optimized DSP utils + * Copyright (c) 2004 AGAWA Koji + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/dsputil.h" + +#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt +#define SET_RND(regd) __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12"); +#define WAVG2B "wavg2b" +#include "dsputil_iwmmxt_rnd_template.c" +#undef DEF +#undef SET_RND +#undef WAVG2B + +#define DEF(x, y) x ## _ ## y ##_iwmmxt +#define SET_RND(regd) __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12"); +#define WAVG2B "wavg2br" +#include "dsputil_iwmmxt_rnd_template.c" +#undef DEF +#undef SET_RND +#undef WAVG2B + +// need scheduling +#define OP(AVG) \ + __asm__ volatile ( \ + /* alignment */ \ + "and r12, %[pixels], #7 \n\t" \ + "bic %[pixels], %[pixels], #7 \n\t" \ + "tmcr wcgr1, r12 \n\t" \ + \ + "wldrd wr0, [%[pixels]] \n\t" \ + "wldrd wr1, [%[pixels], #8] \n\t" \ + "add %[pixels], %[pixels], %[line_size] \n\t" \ + "walignr1 wr4, wr0, wr1 \n\t" \ + \ + "1: \n\t" \ + \ + "wldrd wr2, [%[pixels]] \n\t" \ + "wldrd wr3, [%[pixels], #8] \n\t" \ + "add %[pixels], %[pixels], %[line_size] \n\t" \ + "pld [%[pixels]] \n\t" \ + "walignr1 wr5, wr2, wr3 \n\t" \ + AVG " wr6, wr4, wr5 \n\t" \ + "wstrd wr6, [%[block]] \n\t" \ + "add %[block], %[block], %[line_size] \n\t" \ + \ + "wldrd wr0, [%[pixels]] \n\t" \ + "wldrd wr1, [%[pixels], #8] \n\t" \ + "add %[pixels], %[pixels], %[line_size] \n\t" \ + "walignr1 wr4, wr0, wr1 \n\t" \ + "pld [%[pixels]] \n\t" \ + AVG " wr6, wr4, wr5 \n\t" \ + "wstrd wr6, [%[block]] \n\t" \ + "add %[block], %[block], %[line_size] \n\t" \ + \ + "subs %[h], %[h], #2 \n\t" \ + "bne 1b \n\t" \ + : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \ + : [line_size]"r"(line_size) \ + : "cc", "memory", "r12"); /* "cc": subs updates the condition flags */ +void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + OP("wavg2br");
+} +void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + OP("wavg2b"); +} +#undef OP + +void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size) +{ + uint8_t *pixels2 = pixels + line_size; + + __asm__ volatile ( + "mov r12, #4 \n\t" + "1: \n\t" + "pld [%[pixels], %[line_size2]] \n\t" + "pld [%[pixels2], %[line_size2]] \n\t" + "wldrd wr4, [%[pixels]] \n\t" + "wldrd wr5, [%[pixels2]] \n\t" + "pld [%[block], #32] \n\t" + "wunpckelub wr6, wr4 \n\t" + "wldrd wr0, [%[block]] \n\t" + "wunpckehub wr7, wr4 \n\t" + "wldrd wr1, [%[block], #8] \n\t" + "wunpckelub wr8, wr5 \n\t" + "wldrd wr2, [%[block], #16] \n\t" + "wunpckehub wr9, wr5 \n\t" + "wldrd wr3, [%[block], #24] \n\t" + "add %[block], %[block], #32 \n\t" + "waddhss wr10, wr0, wr6 \n\t" + "waddhss wr11, wr1, wr7 \n\t" + "waddhss wr12, wr2, wr8 \n\t" + "waddhss wr13, wr3, wr9 \n\t" + "wpackhus wr14, wr10, wr11 \n\t" + "wpackhus wr15, wr12, wr13 \n\t" + "wstrd wr14, [%[pixels]] \n\t" + "add %[pixels], %[pixels], %[line_size2] \n\t" + "subs r12, r12, #1 \n\t" + "wstrd wr15, [%[pixels2]] \n\t" + "add %[pixels2], %[pixels2], %[line_size2] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2) + : [line_size2]"r"(line_size << 1) + : "cc", "memory", "r12"); +} + +static void clear_blocks_iwmmxt(DCTELEM *blocks) +{ + __asm__ volatile( + "wzero wr0 \n\t" + "mov r1, #(128 * 6 / 32) \n\t" + "1: \n\t" + "wstrd wr0, [%0] \n\t" + "wstrd wr0, [%0, #8] \n\t" + "wstrd wr0, [%0, #16] \n\t" + "wstrd wr0, [%0, #24] \n\t" + "subs r1, r1, #1 \n\t" + "add %0, %0, #32 \n\t" + "bne 1b \n\t" + : "+r"(blocks) + : + : "r1", "cc", "memory" /* subs updates the flags; wstrd stores through %0 */ + ); +} + +static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + return; +} + +/* A run time test is not simple.
If this file is compiled in + * then we should install the functions + */ +int mm_flags = FF_MM_IWMMXT; /* multimedia extension flags */ + +void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) +{ + if (avctx->dsp_mask) { + if (avctx->dsp_mask & FF_MM_FORCE) + mm_flags |= (avctx->dsp_mask & 0xffff); + else + mm_flags &= ~(avctx->dsp_mask & 0xffff); + } + + if (!(mm_flags & FF_MM_IWMMXT)) return; + + c->add_pixels_clamped = add_pixels_clamped_iwmmxt; + + c->clear_blocks = clear_blocks_iwmmxt; + + c->put_pixels_tab[0][0] = put_pixels16_iwmmxt; + c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt; + c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt; + c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt; + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt; + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt; + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt; + + c->put_pixels_tab[1][0] = put_pixels8_iwmmxt; + c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt; + c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt; + c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt; + c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt; + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt; + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt; + + c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt; + c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt; + c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt; + c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt; + c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt; + c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt; + c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt; + + c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt; + c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt; + 
c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt; + c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt; + c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt; + c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt; + c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt; + c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_iwmmxt_rnd_template.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_iwmmxt_rnd_template.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,1114 @@ +/* + * iWMMXt optimized DSP utils + * copyright (c) 2004 AGAWA Koji + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ volatile ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size] \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr4, [r4, #8] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ volatile ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size] \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr4, [r4, #8] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "wldrd wr0, [%[block]] 
\n\t" + "wldrd wr2, [r5] \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + WAVG2B" wr8, wr8, wr0 \n\t" + WAVG2B" wr10, wr10, wr2 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ volatile ( + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size] \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr2, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "wldrd wr4, [r4, #8] \n\t" + "walignr1 wr9, wr1, wr2 \n\t" + "wldrd wr5, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + "wstrd wr8, [%[block]] \n\t" + "walignr1 wr11, wr4, wr5 \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "wstrd wr11, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + __asm__ volatile ( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld 
[%[block]] \n\t" + "pld [%[block], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "1: \n\t" + "wldrd wr0, [%[pixels]] \n\t" + "wldrd wr1, [%[pixels], #8] \n\t" + "subs %[h], %[h], #2 \n\t" + "wldrd wr2, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr3, [r4] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr8, wr0, wr1 \n\t" + "wldrd wr4, [r4, #8] \n\t" + "walignr1 wr9, wr1, wr2 \n\t" + "wldrd wr5, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "wldrd wr0, [%[block]] \n\t" + "pld [r4] \n\t" + "wldrd wr1, [%[block], #8] \n\t" + "pld [r4, #32] \n\t" + "wldrd wr2, [r5] \n\t" + "walignr1 wr10, wr3, wr4 \n\t" + "wldrd wr3, [r5, #8] \n\t" + WAVG2B" wr8, wr8, wr0 \n\t" + WAVG2B" wr9, wr9, wr1 \n\t" + WAVG2B" wr10, wr10, wr2 \n\t" + "wstrd wr8, [%[block]] \n\t" + "walignr1 wr11, wr4, wr5 \n\t" + WAVG2B" wr11, wr11, wr3 \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr10, [r5] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "wstrd wr11, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) + : + : "memory", "r4", "r5", "r12"); +} + +void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, 
r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr2, wr13, wr14 \n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr6, wr14 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + "wstrd wr0, [%[block]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr2, [r5] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr15, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, 
wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + "walignr1 wr2, wr13, wr14 \n\t" + "walignr1 wr3, wr14, wr15 \n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr5, wr12 \n\t" + "wmoveq wr6, wr14 \n\t" + "wmoveq wr7, wr15 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "walignr2ne wr5, wr11, wr12 \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + "walignr2ne wr7, wr14, wr15 \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + WAVG2B" wr1, wr1, wr5 \n\t" + "wstrd wr0, [%[block]] \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + "wstrd wr1, [%[block], #8] \n\t" + WAVG2B" wr3, wr3, wr7 \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr2, [r5] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr3, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr2, wr13, wr14 
\n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr6, wr14 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "wldrd wr10, [%[block]] \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + "wldrd wr12, [r5] \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + WAVG2B" wr0, wr0, wr10 \n\t" + WAVG2B" wr2, wr2, wr12 \n\t" + "wstrd wr0, [%[block]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr2, [r5] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "add r4, %[pixels], %[line_size]\n\t" + "tmcr wcgr2, r12 \n\t" + "add r5, %[block], %[line_size] \n\t" + "mov %[line_size], %[line_size], lsl #1 \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "wldrd wr13, [r4] \n\t" + "pld [%[pixels]] \n\t" + "wldrd wr14, [r4, #8] \n\t" + "pld [%[pixels], #32] \n\t" + "wldrd wr15, [r4, #16] \n\t" + "add r4, r4, %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [r4] \n\t" + "pld [r4, #32] \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + "walignr1 wr2, wr13, wr14 \n\t" + "walignr1 wr3, wr14, wr15 \n\t" + "wmoveq wr4, wr11 \n\t" + "wmoveq wr5, 
wr12 \n\t" + "wmoveq wr6, wr14 \n\t" + "wmoveq wr7, wr15 \n\t" + "walignr2ne wr4, wr10, wr11 \n\t" + "walignr2ne wr5, wr11, wr12 \n\t" + "walignr2ne wr6, wr13, wr14 \n\t" + "walignr2ne wr7, wr14, wr15 \n\t" + "wldrd wr10, [%[block]] \n\t" + WAVG2B" wr0, wr0, wr4 \n\t" + "wldrd wr11, [%[block], #8] \n\t" + WAVG2B" wr1, wr1, wr5 \n\t" + "wldrd wr12, [r5] \n\t" + WAVG2B" wr2, wr2, wr6 \n\t" + "wldrd wr13, [r5, #8] \n\t" + WAVG2B" wr3, wr3, wr7 \n\t" + WAVG2B" wr0, wr0, wr10 \n\t" + WAVG2B" wr1, wr1, wr11 \n\t" + WAVG2B" wr2, wr2, wr12 \n\t" + WAVG2B" wr3, wr3, wr13 \n\t" + "wstrd wr0, [%[block]] \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr1, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wstrd wr2, [r5] \n\t" + "pld [%[block]] \n\t" + "wstrd wr3, [r5, #8] \n\t" + "add r5, r5, %[line_size] \n\t" + "pld [%[block], #32] \n\t" + "pld [r5] \n\t" + "pld [r5, #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + :"r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr4, wr10, wr11 \n\t" + "wldrd wr10, [%[block]] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add 
%[block], %[block], %[line_size] \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "wldrd wr10, [%[block]] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "pld [%[block]] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "cc", "memory", "r12"); +} + +void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr4, wr10, wr11 \n\t" + "walignr1 wr5, wr11, wr12 \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" 
+ "walignr1 wr1, wr11, wr12 \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + int stride = line_size; + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + __asm__ volatile( + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "and r12, %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + + "1: \n\t" + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr4, wr10, wr11 \n\t" + "walignr1 wr5, wr11, wr12 \n\t" + "wldrd wr10, [%[block]] \n\t" + "wldrd wr11, [%[block], #8] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + WAVG2B" wr9, wr9, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "wldrd wr10, [%[pixels]] \n\t" + "wldrd wr11, [%[pixels], #8] \n\t" + "pld [%[block]] \n\t" + "wldrd wr12, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr0, wr10, wr11 \n\t" + "walignr1 wr1, wr11, wr12 \n\t" + "wldrd wr10, [%[block]] \n\t" + "wldrd 
wr11, [%[block], #8] \n\t" + WAVG2B" wr8, wr0, wr4 \n\t" + WAVG2B" wr9, wr1, wr5 \n\t" + WAVG2B" wr8, wr8, wr10 \n\t" + WAVG2B" wr9, wr9, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "pld [%[block]] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) + : + : "r4", "r5", "r12", "memory"); +} + +void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "add r12, r12, #1 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "tmcr wcgr2, r12 \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "cmp r12, #8 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr4, 
wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "subs %[h], %[h], #2 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} + +void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + /* alignment */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "tmcr wcgr2, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + 
"wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr7, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr6, wr7 \n\t" + "wunpckehub wr7, wr7 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr4, wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr6, wr6, wr10 \n\t" + "waddhus wr7, wr7, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, 
wr10, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + "subs %[h], %[h], #2 \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} + +void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld 
[%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "add r12, r12, #1 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "tmcr wcgr2, r12 \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "cmp r12, #8 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr4, wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "wldrd wr12, [%[pixels]] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr13, [%[pixels], #8] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" 
+ "wmoveq wr10, wr13 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "subs %[h], %[h], #2 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + "wstrd wr8, [%[block]] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} + +void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) +{ + // [wr0 wr1 wr2 wr3] for previous line + // [wr4 wr5 wr6 wr7] for current line + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "pld [%[pixels]] \n\t" + "mov r12, #2 \n\t" + "pld [%[pixels], #32] \n\t" + "tmcr wcgr0, r12 \n\t" /* for shift value */ + /* alignment */ + "and r12, %[pixels], #7 \n\t" + "bic %[pixels], %[pixels], #7 \n\t" + "tmcr wcgr1, r12 \n\t" + "add r12, r12, #1 \n\t" + "tmcr wcgr2, r12 \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "pld [%[pixels]] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub 
wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + + "1: \n\t" + // [wr0 wr1 wr2 wr3] + // [wr4 wr5 wr6 wr7] <= * + "wldrd wr12, [%[pixels]] \n\t" + "cmp r12, #8 \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr6, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr7, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr4, wr6 \n\t" + "wunpckehub wr5, wr6 \n\t" + "wunpckelub wr6, wr7 \n\t" + "wunpckehub wr7, wr7 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr4, wr4, wr8 \n\t" + "waddhus wr5, wr5, wr9 \n\t" + "waddhus wr6, wr6, wr10 \n\t" + "waddhus wr7, wr7, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wldrd wr13, [%[block], #8] \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + WAVG2B" wr9, wr9, wr13 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + + // [wr0 wr1 wr2 wr3] <= * + // [wr4 wr5 wr6 wr7] + "wldrd wr12, [%[pixels]] \n\t" + "pld [%[block]] \n\t" + "wldrd wr13, [%[pixels], #8] \n\t" + "pld [%[block], #32] \n\t" + 
"wldrd wr14, [%[pixels], #16] \n\t" + "add %[pixels], %[pixels], %[line_size] \n\t" + "walignr1 wr2, wr12, wr13 \n\t" + "pld [%[pixels]] \n\t" + "pld [%[pixels], #32] \n\t" + "walignr1 wr3, wr13, wr14 \n\t" + "wmoveq wr10, wr13 \n\t" + "wmoveq wr11, wr14 \n\t" + "walignr2ne wr10, wr12, wr13 \n\t" + "walignr2ne wr11, wr13, wr14 \n\t" + "wunpckelub wr0, wr2 \n\t" + "wunpckehub wr1, wr2 \n\t" + "wunpckelub wr2, wr3 \n\t" + "wunpckehub wr3, wr3 \n\t" + "wunpckelub wr8, wr10 \n\t" + "wunpckehub wr9, wr10 \n\t" + "wunpckelub wr10, wr11 \n\t" + "wunpckehub wr11, wr11 \n\t" + "waddhus wr0, wr0, wr8 \n\t" + "waddhus wr1, wr1, wr9 \n\t" + "waddhus wr2, wr2, wr10 \n\t" + "waddhus wr3, wr3, wr11 \n\t" + "waddhus wr8, wr0, wr4 \n\t" + "waddhus wr9, wr1, wr5 \n\t" + "waddhus wr10, wr2, wr6 \n\t" + "waddhus wr11, wr3, wr7 \n\t" + "waddhus wr8, wr8, wr15 \n\t" + "waddhus wr9, wr9, wr15 \n\t" + "waddhus wr10, wr10, wr15 \n\t" + "waddhus wr11, wr11, wr15 \n\t" + "wsrlhg wr8, wr8, wcgr0 \n\t" + "wsrlhg wr9, wr9, wcgr0 \n\t" + "wldrd wr12, [%[block]] \n\t" + "wldrd wr13, [%[block], #8] \n\t" + "wsrlhg wr10, wr10, wcgr0 \n\t" + "wsrlhg wr11, wr11, wcgr0 \n\t" + "wpackhus wr8, wr8, wr9 \n\t" + "wpackhus wr9, wr10, wr11 \n\t" + WAVG2B" wr8, wr8, wr12 \n\t" + WAVG2B" wr9, wr9, wr13 \n\t" + "wstrd wr8, [%[block]] \n\t" + "wstrd wr9, [%[block], #8] \n\t" + "add %[block], %[block], %[line_size] \n\t" + "subs %[h], %[h], #2 \n\t" + "pld [%[block]] \n\t" + "pld [%[block], #32] \n\t" + "bne 1b \n\t" + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) + : [line_size]"r"(line_size) + : "r12", "memory"); +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_neon.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,1146 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + + preserve8 + .text + + .macro pixels16 avg=0 +.if \avg + mov ip, r0 +.endif +1: vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d2, d3}, [r1], r2 + vld1.64 {d4, d5}, [r1], r2 + pld [r1, r2, lsl #2] + vld1.64 {d6, d7}, [r1], r2 + pld [r1] + pld [r1, r2] + pld [r1, r2, lsl #1] +.if \avg + vld1.64 {d16,d17}, [ip,:128], r2 + vrhadd.u8 q0, q0, q8 + vld1.64 {d18,d19}, [ip,:128], r2 + vrhadd.u8 q1, q1, q9 + vld1.64 {d20,d21}, [ip,:128], r2 + vrhadd.u8 q2, q2, q10 + vld1.64 {d22,d23}, [ip,:128], r2 + vrhadd.u8 q3, q3, q11 +.endif + subs r3, r3, #4 + vst1.64 {d0, d1}, [r0,:128], r2 + vst1.64 {d2, d3}, [r0,:128], r2 + vst1.64 {d4, d5}, [r0,:128], r2 + vst1.64 {d6, d7}, [r0,:128], r2 + bne 1b + bx lr + .endm + + .macro pixels16_x2 vhadd=vrhadd.u8 +1: vld1.64 {d0-d2}, [r1], r2 + vld1.64 {d4-d6}, [r1], r2 + pld [r1] + pld [r1, r2] + subs r3, r3, #2 + vext.8 q1, q0, q1, #1 + \vhadd q0, q0, q1 + vext.8 q3, q2, q3, #1 + \vhadd q2, q2, q3 + vst1.64 {d0, d1}, [r0,:128], r2 + vst1.64 {d4, d5}, [r0,:128], r2 + bne 1b + bx lr + .endm + + .macro pixels16_y2 vhadd=vrhadd.u8 + vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d2, d3}, [r1], r2 +1: subs r3, r3, #2 + \vhadd q2, q0, q1 + vld1.64 {d0, d1}, [r1], r2 + \vhadd q3, q0, q1 + 
vld1.64 {d2, d3}, [r1], r2 + pld [r1] + pld [r1, r2] + vst1.64 {d4, d5}, [r0,:128], r2 + vst1.64 {d6, d7}, [r0,:128], r2 + bne 1b + bx lr + .endm + + .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 + vld1.64 {d0-d2}, [r1], r2 + vld1.64 {d4-d6}, [r1], r2 +.if \no_rnd + vmov.i16 q13, #1 +.endif + pld [r1] + pld [r1, r2] + vext.8 q1, q0, q1, #1 + vext.8 q3, q2, q3, #1 + vaddl.u8 q8, d0, d2 + vaddl.u8 q10, d1, d3 + vaddl.u8 q9, d4, d6 + vaddl.u8 q11, d5, d7 +1: subs r3, r3, #2 + vld1.64 {d0-d2}, [r1], r2 + vadd.u16 q12, q8, q9 + pld [r1] +.if \no_rnd + vadd.u16 q12, q12, q13 +.endif + vext.8 q15, q0, q1, #1 + vadd.u16 q1 , q10, q11 + \vshrn d28, q12, #2 +.if \no_rnd + vadd.u16 q1, q1, q13 +.endif + \vshrn d29, q1, #2 + vaddl.u8 q8, d0, d30 + vld1.64 {d2-d4}, [r1], r2 + vaddl.u8 q10, d1, d31 + vst1.64 {d28,d29}, [r0,:128], r2 + vadd.u16 q12, q8, q9 + pld [r1, r2] +.if \no_rnd + vadd.u16 q12, q12, q13 +.endif + vext.8 q2, q1, q2, #1 + vadd.u16 q0, q10, q11 + \vshrn d30, q12, #2 +.if \no_rnd + vadd.u16 q0, q0, q13 +.endif + \vshrn d31, q0, #2 + vaddl.u8 q9, d2, d4 + vaddl.u8 q11, d3, d5 + vst1.64 {d30,d31}, [r0,:128], r2 + bgt 1b + bx lr + .endm + + .macro pixels8 avg=0 +1: vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + pld [r1, r2, lsl #2] + vld1.64 {d3}, [r1], r2 + pld [r1] + pld [r1, r2] + pld [r1, r2, lsl #1] +.if \avg + vld1.64 {d4}, [r0,:64], r2 + vrhadd.u8 d0, d0, d4 + vld1.64 {d5}, [r0,:64], r2 + vrhadd.u8 d1, d1, d5 + vld1.64 {d6}, [r0,:64], r2 + vrhadd.u8 d2, d2, d6 + vld1.64 {d7}, [r0,:64], r2 + vrhadd.u8 d3, d3, d7 + sub r0, r0, r2, lsl #2 +.endif + subs r3, r3, #4 + vst1.64 {d0}, [r0,:64], r2 + vst1.64 {d1}, [r0,:64], r2 + vst1.64 {d2}, [r0,:64], r2 + vst1.64 {d3}, [r0,:64], r2 + bne 1b + bx lr + .endm + + .macro pixels8_x2 vhadd=vrhadd.u8 +1: vld1.64 {d0, d1}, [r1], r2 + vext.8 d1, d0, d1, #1 + vld1.64 {d2, d3}, [r1], r2 + vext.8 d3, d2, d3, #1 + pld [r1] + pld [r1, r2] + subs r3, r3, #2 + vswp d1, d2 + \vhadd q0, q0, q1 + vst1.64 
{d0}, [r0,:64], r2 + vst1.64 {d1}, [r0,:64], r2 + bne 1b + bx lr + .endm + + .macro pixels8_y2 vhadd=vrhadd.u8 + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 +1: subs r3, r3, #2 + \vhadd d4, d0, d1 + vld1.64 {d0}, [r1], r2 + \vhadd d5, d0, d1 + vld1.64 {d1}, [r1], r2 + pld [r1] + pld [r1, r2] + vst1.64 {d4}, [r0,:64], r2 + vst1.64 {d5}, [r0,:64], r2 + bne 1b + bx lr + .endm + + .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 + vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d2, d3}, [r1], r2 +.if \no_rnd + vmov.i16 q11, #1 +.endif + pld [r1] + pld [r1, r2] + vext.8 d4, d0, d1, #1 + vext.8 d6, d2, d3, #1 + vaddl.u8 q8, d0, d4 + vaddl.u8 q9, d2, d6 +1: subs r3, r3, #2 + vld1.64 {d0, d1}, [r1], r2 + pld [r1] + vadd.u16 q10, q8, q9 + vext.8 d4, d0, d1, #1 +.if \no_rnd + vadd.u16 q10, q10, q11 +.endif + vaddl.u8 q8, d0, d4 + \vshrn d5, q10, #2 + vld1.64 {d2, d3}, [r1], r2 + vadd.u16 q10, q8, q9 + pld [r1, r2] +.if \no_rnd + vadd.u16 q10, q10, q11 +.endif + vst1.64 {d5}, [r0,:64], r2 + \vshrn d7, q10, #2 + vext.8 d6, d2, d3, #1 + vaddl.u8 q9, d2, d6 + vst1.64 {d7}, [r0,:64], r2 + bgt 1b + bx lr + .endm + + .macro pixfunc pfx name suf rnd_op args:vararg +function ff_\pfx\name\suf\()_neon, export=1 + \name \rnd_op \args +endfunc + .endm + + .macro pixfunc2 pfx name args:vararg + pixfunc \pfx \name + pixfunc \pfx \name \args + .endm + +function ff_put_h264_qpel16_mc00_neon, export=1 + mov r3, #16 +endfunc + + pixfunc put_ pixels16 + pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 + +function ff_avg_h264_qpel16_mc00_neon, export=1 + mov r3, #16 +endfunc + + pixfunc avg_ pixels16,, 1 + +function ff_put_h264_qpel8_mc00_neon, export=1 + mov r3, #8 +endfunc + + pixfunc put_ pixels8 + pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 + pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 + +function ff_avg_h264_qpel8_mc00_neon, export=1 + mov r3, #8 +endfunc + 
+ pixfunc avg_ pixels8,, 1 + +function ff_put_pixels_clamped_neon, export=1 + vld1.64 {d16-d19}, [r0,:128]! + vqmovun.s16 d0, q8 + vld1.64 {d20-d23}, [r0,:128]! + vqmovun.s16 d1, q9 + vld1.64 {d24-d27}, [r0,:128]! + vqmovun.s16 d2, q10 + vld1.64 {d28-d31}, [r0,:128]! + vqmovun.s16 d3, q11 + vst1.64 {d0}, [r1,:64], r2 + vqmovun.s16 d4, q12 + vst1.64 {d1}, [r1,:64], r2 + vqmovun.s16 d5, q13 + vst1.64 {d2}, [r1,:64], r2 + vqmovun.s16 d6, q14 + vst1.64 {d3}, [r1,:64], r2 + vqmovun.s16 d7, q15 + vst1.64 {d4}, [r1,:64], r2 + vst1.64 {d5}, [r1,:64], r2 + vst1.64 {d6}, [r1,:64], r2 + vst1.64 {d7}, [r1,:64], r2 + bx lr +endfunc + +function ff_put_signed_pixels_clamped_neon, export=1 + vmov.u8 d31, #128 + vld1.64 {d16-d17}, [r0,:128]! + vqmovn.s16 d0, q8 + vld1.64 {d18-d19}, [r0,:128]! + vqmovn.s16 d1, q9 + vld1.64 {d16-d17}, [r0,:128]! + vqmovn.s16 d2, q8 + vld1.64 {d18-d19}, [r0,:128]! + vadd.u8 d0, d0, d31 + vld1.64 {d20-d21}, [r0,:128]! + vadd.u8 d1, d1, d31 + vld1.64 {d22-d23}, [r0,:128]! + vadd.u8 d2, d2, d31 + vst1.64 {d0}, [r1,:64], r2 + vqmovn.s16 d3, q9 + vst1.64 {d1}, [r1,:64], r2 + vqmovn.s16 d4, q10 + vst1.64 {d2}, [r1,:64], r2 + vqmovn.s16 d5, q11 + vld1.64 {d24-d25}, [r0,:128]! + vadd.u8 d3, d3, d31 + vld1.64 {d26-d27}, [r0,:128]! + vadd.u8 d4, d4, d31 + vadd.u8 d5, d5, d31 + vst1.64 {d3}, [r1,:64], r2 + vqmovn.s16 d6, q12 + vst1.64 {d4}, [r1,:64], r2 + vqmovn.s16 d7, q13 + vst1.64 {d5}, [r1,:64], r2 + vadd.u8 d6, d6, d31 + vadd.u8 d7, d7, d31 + vst1.64 {d6}, [r1,:64], r2 + vst1.64 {d7}, [r1,:64], r2 + bx lr +endfunc + +function ff_add_pixels_clamped_neon, export=1 + mov r3, r1 + vld1.64 {d16}, [r1,:64], r2 + vld1.64 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vld1.64 {d17}, [r1,:64], r2 + vld1.64 {d2-d3}, [r0,:128]! + vqmovun.s16 d0, q0 + vld1.64 {d18}, [r1,:64], r2 + vaddw.u8 q1, q1, d17 + vld1.64 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.64 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.64 {d19}, [r1,:64], r2 + vld1.64 {d6-d7}, [r0,:128]! 
+ vaddw.u8 q3, q3, d19 + vqmovun.s16 d4, q2 + vst1.64 {d2}, [r3,:64], r2 + vld1.64 {d16}, [r1,:64], r2 + vqmovun.s16 d6, q3 + vld1.64 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vst1.64 {d4}, [r3,:64], r2 + vld1.64 {d17}, [r1,:64], r2 + vld1.64 {d2-d3}, [r0,:128]! + vaddw.u8 q1, q1, d17 + vst1.64 {d6}, [r3,:64], r2 + vqmovun.s16 d0, q0 + vld1.64 {d18}, [r1,:64], r2 + vld1.64 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.64 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.64 {d19}, [r1,:64], r2 + vqmovun.s16 d4, q2 + vld1.64 {d6-d7}, [r0,:128]! + vaddw.u8 q3, q3, d19 + vst1.64 {d2}, [r3,:64], r2 + vqmovun.s16 d6, q3 + vst1.64 {d4}, [r3,:64], r2 + vst1.64 {d6}, [r3,:64], r2 + bx lr +endfunc + +function ff_float_to_int16_neon, export=1 + subs r2, r2, #8 + vld1.64 {d0-d1}, [r1,:128]! + vcvt.s32.f32 q8, q0, #16 + vld1.64 {d2-d3}, [r1,:128]! + vcvt.s32.f32 q9, q1, #16 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vshrn.s32 d4, q8, #16 + vld1.64 {d0-d1}, [r1,:128]! + vcvt.s32.f32 q0, q0, #16 + vshrn.s32 d5, q9, #16 + vld1.64 {d2-d3}, [r1,:128]! + vcvt.s32.f32 q1, q1, #16 + vshrn.s32 d6, q0, #16 + vst1.64 {d4-d5}, [r0,:128]! + vshrn.s32 d7, q1, #16 + vld1.64 {d16-d17},[r1,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r1,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.64 {d6-d7}, [r0,:128]! + bne 1b + ands r2, r2, #15 + beq 3f +2: vld1.64 {d0-d1}, [r1,:128]! + vshrn.s32 d4, q8, #16 + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r1,:128]! + vshrn.s32 d5, q9, #16 + vcvt.s32.f32 q1, q1, #16 + vshrn.s32 d6, q0, #16 + vst1.64 {d4-d5}, [r0,:128]! + vshrn.s32 d7, q1, #16 + vst1.64 {d6-d7}, [r0,:128]! + bx lr +3: vshrn.s32 d4, q8, #16 + vshrn.s32 d5, q9, #16 + vst1.64 {d4-d5}, [r0,:128]! + bx lr +endfunc + +function ff_float_to_int16_interleave_neon, export=1 + cmp r3, #2 + ldrlt r1, [r1] + blt ff_float_to_int16_neon + bne 4f + + ldr r3, [r1] + ldr r1, [r1, #4] + + subs r2, r2, #8 + vld1.64 {d0-d1}, [r3,:128]! 
+ vcvt.s32.f32 q8, q0, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q9, q1, #16 + vld1.64 {d20-d21},[r1,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r1,:128]! + vcvt.s32.f32 q11, q11, #16 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 q10, q8, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q1, q1, #16 + vld1.64 {d24-d25},[r1,:128]! + vcvt.s32.f32 q12, q12, #16 + vld1.64 {d26-d27},[r1,:128]! + vsri.32 q11, q9, #16 + vst1.64 {d20-d21},[r0,:128]! + vcvt.s32.f32 q13, q13, #16 + vst1.64 {d22-d23},[r0,:128]! + vsri.32 q12, q0, #16 + vld1.64 {d16-d17},[r3,:128]! + vsri.32 q13, q1, #16 + vst1.64 {d24-d25},[r0,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r3,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r1,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r1,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.64 {d26-d27},[r0,:128]! + bne 1b + ands r2, r2, #15 + beq 3f +2: vsri.32 q10, q8, #16 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q1, q1, #16 + vld1.64 {d24-d25},[r1,:128]! + vcvt.s32.f32 q12, q12, #16 + vsri.32 q11, q9, #16 + vld1.64 {d26-d27},[r1,:128]! + vcvt.s32.f32 q13, q13, #16 + vst1.64 {d20-d21},[r0,:128]! + vsri.32 q12, q0, #16 + vst1.64 {d22-d23},[r0,:128]! + vsri.32 q13, q1, #16 + vst1.64 {d24-d27},[r0,:128]! + bx lr +3: vsri.32 q10, q8, #16 + vsri.32 q11, q9, #16 + vst1.64 {d20-d23},[r0,:128]! + bx lr + +4: push {r4-r8,lr} + cmp r3, #4 + lsl ip, r3, #1 + blt 4f + + @ 4 channels +5: ldmia r1!, {r4-r7} + mov lr, r2 + mov r8, r0 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r6,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r7,:128]! + vcvt.s32.f32 q11, q11, #16 +6: subs lr, lr, #8 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 q9, q8, #16 + vld1.64 {d2-d3}, [r5,:128]! 
+ vcvt.s32.f32 q1, q1, #16 + vsri.32 q11, q10, #16 + vld1.64 {d4-d5}, [r6,:128]! + vcvt.s32.f32 q2, q2, #16 + vzip.32 d18, d22 + vld1.64 {d6-d7}, [r7,:128]! + vcvt.s32.f32 q3, q3, #16 + vzip.32 d19, d23 + vst1.64 {d18}, [r8], ip + vsri.32 q1, q0, #16 + vst1.64 {d22}, [r8], ip + vsri.32 q3, q2, #16 + vst1.64 {d19}, [r8], ip + vzip.32 d2, d6 + vst1.64 {d23}, [r8], ip + vzip.32 d3, d7 + beq 7f + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.64 {d2}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.64 {d6}, [r8], ip + vld1.64 {d20-d21},[r6,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.64 {d3}, [r8], ip + vld1.64 {d22-d23},[r7,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.64 {d7}, [r8], ip + b 6b +7: vst1.64 {d2}, [r8], ip + vst1.64 {d6}, [r8], ip + vst1.64 {d3}, [r8], ip + vst1.64 {d7}, [r8], ip + subs r3, r3, #4 + popeq {r4-r8,pc} + cmp r3, #4 + add r0, r0, #8 + bge 5b + + @ 2 channels +4: cmp r3, #2 + blt 4f + ldmia r1!, {r4-r5} + mov lr, r2 + mov r8, r0 + tst lr, #8 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 + beq 6f + subs lr, lr, #8 + beq 7f + vsri.32 d18, d16, #16 + vsri.32 d19, d17, #16 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vst1.32 {d19[1]}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.32 {d22[0]}, [r8], ip + vst1.32 {d22[1]}, [r8], ip + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 +6: subs lr, lr, #16 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 d18, d16, #16 + vld1.64 {d2-d3}, [r5,:128]! 
+ vcvt.s32.f32 q1, q1, #16 + vsri.32 d19, d17, #16 + vld1.64 {d4-d5}, [r4,:128]! + vcvt.s32.f32 q2, q2, #16 + vld1.64 {d6-d7}, [r5,:128]! + vcvt.s32.f32 q3, q3, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vsri.32 d2, d0, #16 + vst1.32 {d19[1]}, [r8], ip + vsri.32 d3, d1, #16 + vst1.32 {d22[0]}, [r8], ip + vsri.32 d6, d4, #16 + vst1.32 {d22[1]}, [r8], ip + vsri.32 d7, d5, #16 + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip + beq 6f + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.32 {d2[0]}, [r8], ip + vst1.32 {d2[1]}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.32 {d3[0]}, [r8], ip + vst1.32 {d3[1]}, [r8], ip + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.32 {d6[0]}, [r8], ip + vst1.32 {d6[1]}, [r8], ip + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.32 {d7[0]}, [r8], ip + vst1.32 {d7[1]}, [r8], ip + bgt 6b +6: vst1.32 {d2[0]}, [r8], ip + vst1.32 {d2[1]}, [r8], ip + vst1.32 {d3[0]}, [r8], ip + vst1.32 {d3[1]}, [r8], ip + vst1.32 {d6[0]}, [r8], ip + vst1.32 {d6[1]}, [r8], ip + vst1.32 {d7[0]}, [r8], ip + vst1.32 {d7[1]}, [r8], ip + b 8f +7: vsri.32 d18, d16, #16 + vsri.32 d19, d17, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vst1.32 {d19[1]}, [r8], ip + vst1.32 {d22[0]}, [r8], ip + vst1.32 {d22[1]}, [r8], ip + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip +8: subs r3, r3, #2 + add r0, r0, #4 + popeq {r4-r8,pc} + + @ 1 channel +4: ldr r4, [r1],#4 + tst r2, #8 + mov lr, r2 + mov r5, r0 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 + bne 8f +6: subs lr, lr, #16 + vld1.64 {d4-d5}, [r4,:128]! + vcvt.s32.f32 q2, q2, #16 + vld1.64 {d6-d7}, [r4,:128]! 
+ vcvt.s32.f32 q3, q3, #16 + vst1.16 {d0[1]}, [r5,:16], ip + vst1.16 {d0[3]}, [r5,:16], ip + vst1.16 {d1[1]}, [r5,:16], ip + vst1.16 {d1[3]}, [r5,:16], ip + vst1.16 {d2[1]}, [r5,:16], ip + vst1.16 {d2[3]}, [r5,:16], ip + vst1.16 {d3[1]}, [r5,:16], ip + vst1.16 {d3[3]}, [r5,:16], ip + beq 7f + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 +7: vst1.16 {d4[1]}, [r5,:16], ip + vst1.16 {d4[3]}, [r5,:16], ip + vst1.16 {d5[1]}, [r5,:16], ip + vst1.16 {d5[3]}, [r5,:16], ip + vst1.16 {d6[1]}, [r5,:16], ip + vst1.16 {d6[3]}, [r5,:16], ip + vst1.16 {d7[1]}, [r5,:16], ip + vst1.16 {d7[3]}, [r5,:16], ip + bgt 6b + pop {r4-r8,pc} +8: subs lr, lr, #8 + vst1.16 {d0[1]}, [r5,:16], ip + vst1.16 {d0[3]}, [r5,:16], ip + vst1.16 {d1[1]}, [r5,:16], ip + vst1.16 {d1[3]}, [r5,:16], ip + vst1.16 {d2[1]}, [r5,:16], ip + vst1.16 {d2[3]}, [r5,:16], ip + vst1.16 {d3[1]}, [r5,:16], ip + vst1.16 {d3[3]}, [r5,:16], ip + popeq {r4-r8,pc} + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 + b 6b +endfunc + +function ff_vector_fmul_neon, export=1 + mov r3, r0 + subs r2, r2, #8 + vld1.64 {d0-d3}, [r0,:128]! + vld1.64 {d4-d7}, [r1,:128]! + vmul.f32 q8, q0, q2 + vmul.f32 q9, q1, q3 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vld1.64 {d0-d1}, [r0,:128]! + vld1.64 {d4-d5}, [r1,:128]! + vmul.f32 q10, q0, q2 + vld1.64 {d2-d3}, [r0,:128]! + vld1.64 {d6-d7}, [r1,:128]! + vmul.f32 q11, q1, q3 + vst1.64 {d16-d19},[r3,:128]! + vld1.64 {d0-d1}, [r0,:128]! + vld1.64 {d4-d5}, [r1,:128]! + vmul.f32 q8, q0, q2 + vld1.64 {d2-d3}, [r0,:128]! + vld1.64 {d6-d7}, [r1,:128]! + vmul.f32 q9, q1, q3 + vst1.64 {d20-d23},[r3,:128]! + bne 1b + ands r2, r2, #15 + beq 3f +2: vld1.64 {d0-d1}, [r0,:128]! + vld1.64 {d4-d5}, [r1,:128]! + vst1.64 {d16-d17},[r3,:128]! + vmul.f32 q8, q0, q2 + vld1.64 {d2-d3}, [r0,:128]! + vld1.64 {d6-d7}, [r1,:128]! + vst1.64 {d18-d19},[r3,:128]! 
+ vmul.f32 q9, q1, q3 +3: vst1.64 {d16-d19},[r3,:128]! + bx lr +endfunc + +function ff_vector_fmul_window_neon, export=1 +VFP vdup.32 q8, d0[0] +NOVFP vld1.32 {d16[],d17[]}, [sp,:32] + push {r4,r5,lr} +VFP ldr lr, [sp, #12] +NOVFP ldr lr, [sp, #16] + sub r2, r2, #8 + sub r5, lr, #2 + add r2, r2, r5, lsl #2 + add r4, r3, r5, lsl #3 + add ip, r0, r5, lsl #3 + mov r5, #-16 + vld1.64 {d0,d1}, [r1,:128]! + vld1.64 {d2,d3}, [r2,:128], r5 + vld1.64 {d4,d5}, [r3,:128]! + vld1.64 {d6,d7}, [r4,:128], r5 +1: subs lr, lr, #4 + vmov q11, q8 + vmla.f32 d22, d0, d4 + vmov q10, q8 + vmla.f32 d23, d1, d5 + vrev64.32 q3, q3 + vmla.f32 d20, d0, d7 + vrev64.32 q1, q1 + vmla.f32 d21, d1, d6 + beq 2f + vmla.f32 d22, d3, d7 + vld1.64 {d0,d1}, [r1,:128]! + vmla.f32 d23, d2, d6 + vld1.64 {d18,d19},[r2,:128], r5 + vmls.f32 d20, d3, d4 + vld1.64 {d24,d25},[r3,:128]! + vmls.f32 d21, d2, d5 + vld1.64 {d6,d7}, [r4,:128], r5 + vmov q1, q9 + vrev64.32 q11, q11 + vmov q2, q12 + vswp d22, d23 + vst1.64 {d20,d21},[r0,:128]! + vst1.64 {d22,d23},[ip,:128], r5 + b 1b +2: vmla.f32 d22, d3, d7 + vmla.f32 d23, d2, d6 + vmls.f32 d20, d3, d4 + vmls.f32 d21, d2, d5 + vrev64.32 q11, q11 + vswp d22, d23 + vst1.64 {d20,d21},[r0,:128]! + vst1.64 {d22,d23},[ip,:128], r5 + pop {r4,r5,pc} +endfunc + +#if CONFIG_VORBIS_DECODER +function ff_vorbis_inverse_coupling_neon, export=1 + vmov.i32 q10, #1<<31 + subs r2, r2, #4 + mov r3, r0 + mov r12, r1 + beq 3f + + vld1.32 {d24-d25},[r1,:128]! + vld1.32 {d22-d23},[r0,:128]! + vcle.s32 q8, q12, #0 + vand q9, q11, q10 + veor q12, q12, q9 + vand q2, q12, q8 + vbic q3, q12, q8 + vadd.f32 q12, q11, q2 + vsub.f32 q11, q11, q3 +1: vld1.32 {d2-d3}, [r1,:128]! + vld1.32 {d0-d1}, [r0,:128]! + vcle.s32 q8, q1, #0 + vand q9, q0, q10 + veor q1, q1, q9 + vst1.32 {d24-d25},[r3, :128]! + vst1.32 {d22-d23},[r12,:128]! + vand q2, q1, q8 + vbic q3, q1, q8 + vadd.f32 q1, q0, q2 + vsub.f32 q0, q0, q3 + subs r2, r2, #8 + ble 2f + vld1.32 {d24-d25},[r1,:128]! + vld1.32 {d22-d23},[r0,:128]! 
+ vcle.s32 q8, q12, #0 + vand q9, q11, q10 + veor q12, q12, q9 + vst1.32 {d2-d3}, [r3, :128]! + vst1.32 {d0-d1}, [r12,:128]! + vand q2, q12, q8 + vbic q3, q12, q8 + vadd.f32 q12, q11, q2 + vsub.f32 q11, q11, q3 + b 1b + +2: vst1.32 {d2-d3}, [r3, :128]! + vst1.32 {d0-d1}, [r12,:128]! + bxlt lr + +3: vld1.32 {d2-d3}, [r1,:128] + vld1.32 {d0-d1}, [r0,:128] + vcle.s32 q8, q1, #0 + vand q9, q0, q10 + veor q1, q1, q9 + vand q2, q1, q8 + vbic q3, q1, q8 + vadd.f32 q1, q0, q2 + vsub.f32 q0, q0, q3 + vst1.32 {d2-d3}, [r0,:128]! + vst1.32 {d0-d1}, [r1,:128]! + bx lr +endfunc +#endif + +function ff_vector_fmul_scalar_neon, export=1 +VFP len .req r2 +NOVFP len .req r3 +VFP vdup.32 q8, d0[0] +NOVFP vdup.32 q8, r2 + bics r12, len, #15 + beq 3f + vld1.32 {q0},[r1,:128]! + vld1.32 {q1},[r1,:128]! +1: vmul.f32 q0, q0, q8 + vld1.32 {q2},[r1,:128]! + vmul.f32 q1, q1, q8 + vld1.32 {q3},[r1,:128]! + vmul.f32 q2, q2, q8 + vst1.32 {q0},[r0,:128]! + vmul.f32 q3, q3, q8 + vst1.32 {q1},[r0,:128]! + subs r12, r12, #16 + beq 2f + vld1.32 {q0},[r1,:128]! + vst1.32 {q2},[r0,:128]! + vld1.32 {q1},[r1,:128]! + vst1.32 {q3},[r0,:128]! + b 1b +2: vst1.32 {q2},[r0,:128]! + vst1.32 {q3},[r0,:128]! + ands len, len, #15 + bxeq lr +3: vld1.32 {q0},[r1,:128]! + vmul.f32 q0, q0, q8 + vst1.32 {q0},[r0,:128]! + subs len, len, #4 + bgt 3b + bx lr + .unreq len +endfunc + +function ff_vector_fmul_sv_scalar_2_neon, export=1 +VFP vdup.32 d16, d0[0] +NOVFP vdup.32 d16, r3 +NOVFP ldr r3, [sp] + vld1.32 {d0},[r1,:64]! + vld1.32 {d1},[r1,:64]! +1: subs r3, r3, #4 + vmul.f32 d4, d0, d16 + vmul.f32 d5, d1, d16 + ldr r12, [r2], #4 + vld1.32 {d2},[r12,:64] + ldr r12, [r2], #4 + vld1.32 {d3},[r12,:64] + vmul.f32 d4, d4, d2 + vmul.f32 d5, d5, d3 + beq 2f + vld1.32 {d0},[r1,:64]! + vld1.32 {d1},[r1,:64]! + vst1.32 {d4},[r0,:64]! + vst1.32 {d5},[r0,:64]! + b 1b +2: vst1.32 {d4},[r0,:64]! + vst1.32 {d5},[r0,:64]! 
+ bx lr +endfunc + +function ff_vector_fmul_sv_scalar_4_neon, export=1 +VFP vdup.32 q10, d0[0] +NOVFP vdup.32 q10, r3 +NOVFP ldr r3, [sp] + push {lr} + bics lr, r3, #7 + beq 3f + vld1.32 {q0},[r1,:128]! + vld1.32 {q2},[r1,:128]! +1: ldr r12, [r2], #4 + vld1.32 {q1},[r12,:128] + ldr r12, [r2], #4 + vld1.32 {q3},[r12,:128] + vmul.f32 q8, q0, q10 + vmul.f32 q8, q8, q1 + vmul.f32 q9, q2, q10 + vmul.f32 q9, q9, q3 + subs lr, lr, #8 + beq 2f + vld1.32 {q0},[r1,:128]! + vld1.32 {q2},[r1,:128]! + vst1.32 {q8},[r0,:128]! + vst1.32 {q9},[r0,:128]! + b 1b +2: vst1.32 {q8},[r0,:128]! + vst1.32 {q9},[r0,:128]! + ands r3, r3, #7 + popeq {pc} +3: vld1.32 {q0},[r1,:128]! + ldr r12, [r2], #4 + vld1.32 {q1},[r12,:128] + vmul.f32 q0, q0, q10 + vmul.f32 q0, q0, q1 + vst1.32 {q0},[r0,:128]! + subs r3, r3, #4 + bgt 3b + pop {pc} +endfunc + +function ff_sv_fmul_scalar_2_neon, export=1 +VFP len .req r2 +NOVFP len .req r3 +VFP vdup.32 q8, d0[0] +NOVFP vdup.32 q8, r2 + ldr r12, [r1], #4 + vld1.32 {d0},[r12,:64] + ldr r12, [r1], #4 + vld1.32 {d1},[r12,:64] +1: vmul.f32 q1, q0, q8 + subs len, len, #4 + beq 2f + ldr r12, [r1], #4 + vld1.32 {d0},[r12,:64] + ldr r12, [r1], #4 + vld1.32 {d1},[r12,:64] + vst1.32 {q1},[r0,:128]! + b 1b +2: vst1.32 {q1},[r0,:128]! + bx lr + .unreq len +endfunc + +function ff_sv_fmul_scalar_4_neon, export=1 +VFP len .req r2 +NOVFP len .req r3 +VFP vdup.32 q8, d0[0] +NOVFP vdup.32 q8, r2 +1: ldr r12, [r1], #4 + vld1.32 {q0},[r12,:128] + vmul.f32 q0, q0, q8 + vst1.32 {q0},[r0,:128]! + subs len, len, #4 + bgt 1b + bx lr + .unreq len +endfunc + +function ff_butterflies_float_neon, export=1 +1: vld1.32 {q0},[r0,:128] + vld1.32 {q1},[r1,:128] + vsub.f32 q2, q0, q1 + vadd.f32 q1, q0, q1 + vst1.32 {q2},[r1,:128]! + vst1.32 {q1},[r0,:128]! + subs r2, r2, #4 + bgt 1b + bx lr +endfunc + +function ff_scalarproduct_float_neon, export=1 + vmov.f32 q2, #0.0 +1: vld1.32 {q0},[r0,:128]! + vld1.32 {q1},[r1,:128]! 
+ vmla.f32 q2, q0, q1 + subs r2, r2, #4 + bgt 1b + vadd.f32 d0, d4, d5 + vpadd.f32 d0, d0, d0 +NOVFP vmov.32 r0, d0[0] + bx lr +endfunc + +function ff_int32_to_float_fmul_scalar_neon, export=1 +VFP vdup.32 q0, d0[0] +VFP len .req r2 +NOVFP vdup.32 q0, r2 +NOVFP len .req r3 + + vld1.32 {q1},[r1,:128]! + vcvt.f32.s32 q3, q1 + vld1.32 {q2},[r1,:128]! + vcvt.f32.s32 q8, q2 +1: subs len, len, #8 + pld [r1, #16] + vmul.f32 q9, q3, q0 + vmul.f32 q10, q8, q0 + beq 2f + vld1.32 {q1},[r1,:128]! + vcvt.f32.s32 q3, q1 + vld1.32 {q2},[r1,:128]! + vcvt.f32.s32 q8, q2 + vst1.32 {q9}, [r0,:128]! + vst1.32 {q10},[r0,:128]! + b 1b +2: vst1.32 {q9}, [r0,:128]! + vst1.32 {q10},[r0,:128]! + bx lr + .unreq len +endfunc + +function ff_vector_fmul_reverse_neon, export=1 + add r2, r2, r3, lsl #2 + sub r2, r2, #32 + mov r12, #-32 + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q2-q3}, [r2,:128], r12 +1: pld [r1, #32] + vrev64.32 q3, q3 + vmul.f32 d16, d0, d7 + vmul.f32 d17, d1, d6 + pld [r2, #-32] + vrev64.32 q2, q2 + vmul.f32 d18, d2, d5 + vmul.f32 d19, d3, d4 + subs r3, r3, #8 + beq 2f + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q2-q3}, [r2,:128], r12 + vst1.32 {q8-q9}, [r0,:128]! + b 1b +2: vst1.32 {q8-q9}, [r0,:128]! + bx lr +endfunc + +function ff_vector_fmul_add_neon, export=1 + ldr r12, [sp] + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q8-q9}, [r2,:128]! + vld1.32 {q2-q3}, [r3,:128]! + vmul.f32 q10, q0, q8 + vmul.f32 q11, q1, q9 +1: vadd.f32 q12, q2, q10 + vadd.f32 q13, q3, q11 + pld [r1, #16] + pld [r2, #16] + pld [r3, #16] + subs r12, r12, #8 + beq 2f + vld1.32 {q0}, [r1,:128]! + vld1.32 {q8}, [r2,:128]! + vmul.f32 q10, q0, q8 + vld1.32 {q1}, [r1,:128]! + vld1.32 {q9}, [r2,:128]! + vmul.f32 q11, q1, q9 + vld1.32 {q2-q3}, [r3,:128]! + vst1.32 {q12-q13},[r0,:128]! + b 1b +2: vst1.32 {q12-q13},[r0,:128]! + bx lr +endfunc + +function ff_vector_clipf_neon, export=1 +VFP vdup.32 q1, d0[1] +VFP vdup.32 q0, d0[0] +NOVFP vdup.32 q0, r2 +NOVFP vdup.32 q1, r3 +NOVFP ldr r2, [sp] + vld1.f32 {q2},[r1,:128]! 
+ vmin.f32 q10, q2, q1 + vld1.f32 {q3},[r1,:128]! + vmin.f32 q11, q3, q1 +1: vmax.f32 q8, q10, q0 + vmax.f32 q9, q11, q0 + subs r2, r2, #8 + beq 2f + vld1.f32 {q2},[r1,:128]! + vmin.f32 q10, q2, q1 + vld1.f32 {q3},[r1,:128]! + vmin.f32 q11, q3, q1 + vst1.f32 {q8},[r0,:128]! + vst1.f32 {q9},[r0,:128]! + b 1b +2: vst1.f32 {q8},[r0,:128]! + vst1.f32 {q9},[r0,:128]! + bx lr +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/dsputil_vfp.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/dsputil_vfp.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2008 Siarhei Siamashka + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + + .syntax unified +/* + * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle + * throughput for almost all the instructions (except for double precision + * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles + * for arithmetic operations. Scheduling code to avoid pipeline stalls is very + * important for performance. 
One more interesting feature is that VFP has + * independent load/store and arithmetics pipelines, so it is possible to make + * them work simultaneously and get more than 1 operation per cycle. Load/store + * pipeline can process 2 single precision floating point values per cycle and + * supports bulk loads and stores for large sets of registers. Arithmetic operations + * can be done on vectors, which allows to keep the arithmetics pipeline busy, + * while the processor may issue and execute other instructions. Detailed + * optimization manuals can be found at http://www.arm.com + */ + +/** + * ARM VFP optimized implementation of the 'vector_fmul_c' function: dst[i] *= src[i]. + * Assumes that len is positive and a multiple of 8: every loop pass handles 8 floats, plus 8 more when at least 16 were left. + */ +@ void ff_vector_fmul_vfp(float *dst, const float *src, int len) +function ff_vector_fmul_vfp, export=1 + vpush {d8-d15} + mov r3, r0 /* r3 walks dst for reads, r0 walks dst for writes */ + fmrx r12, fpscr + orr r12, r12, #(3 << 16) /* FPSCR LEN field = 3: short vectors of 4 floats */ + fmxr fpscr, r12 + + vldmia r3!, {s0-s3} + vldmia r1!, {s8-s11} + vldmia r3!, {s4-s7} + vldmia r1!, {s12-s15} + vmul.f32 s8, s0, s8 +1: + subs r2, r2, #16 + vmul.f32 s12, s4, s12 + vldmiage r3!, {s16-s19} + vldmiage r1!, {s24-s27} + vldmiage r3!, {s20-s23} + vldmiage r1!, {s28-s31} + vmulge.f32 s24, s16, s24 + vstmia r0!, {s8-s11} + vstmia r0!, {s12-s15} + vmulge.f32 s28, s20, s28 + vldmiagt r3!, {s0-s3} + vldmiagt r1!, {s8-s11} + vldmiagt r3!, {s4-s7} + vldmiagt r1!, {s12-s15} + vmulge.f32 s8, s0, s8 + vstmiage r0!, {s24-s27} + vstmiage r0!, {s28-s31} + bgt 1b + + bic r12, r12, #(7 << 16) /* clear FPSCR LEN: back to scalar operation (vector size 1) */ + fmxr fpscr, r12 + vpop {d8-d15} + bx lr +endfunc + +/** + * ARM VFP optimized implementation of the 'vector_fmul_reverse_c' function: dst[i] = src0[i] * src1[len - 1 - i].
+ * Assumes that len is positive and a multiple of 8 + */ +@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, +@ const float *src1, int len) +function ff_vector_fmul_reverse_vfp, export=1 + vpush {d8-d15} + add r2, r2, r3, lsl #2 /* r2 = src1 + len; src1 is consumed backwards via vldmdb */ + vldmdb r2!, {s0-s3} + vldmia r1!, {s8-s11} + vldmdb r2!, {s4-s7} + vldmia r1!, {s12-s15} + vmul.f32 s8, s3, s8 + vmul.f32 s9, s2, s9 + vmul.f32 s10, s1, s10 + vmul.f32 s11, s0, s11 +1: + subs r3, r3, #16 + vldmdbge r2!, {s16-s19} + vmul.f32 s12, s7, s12 + vldmiage r1!, {s24-s27} + vmul.f32 s13, s6, s13 + vldmdbge r2!, {s20-s23} + vmul.f32 s14, s5, s14 + vldmiage r1!, {s28-s31} + vmul.f32 s15, s4, s15 + vmulge.f32 s24, s19, s24 + vldmdbgt r2!, {s0-s3} + vmulge.f32 s25, s18, s25 + vstmia r0!, {s8-s13} + vmulge.f32 s26, s17, s26 + vldmiagt r1!, {s8-s11} + vmulge.f32 s27, s16, s27 + vmulge.f32 s28, s23, s28 + vldmdbgt r2!, {s4-s7} + vmulge.f32 s29, s22, s29 + vstmia r0!, {s14-s15} + vmulge.f32 s30, s21, s30 + vmulge.f32 s31, s20, s31 + vmulge.f32 s8, s3, s8 + vldmiagt r1!, {s12-s15} + vmulge.f32 s9, s2, s9 + vmulge.f32 s10, s1, s10 + vstmiage r0!, {s24-s27} + vmulge.f32 s11, s0, s11 + vstmiage r0!, {s28-s31} + bgt 1b + + vpop {d8-d15} + bx lr +endfunc + +#if HAVE_ARMV6 +/** + * ARM VFP optimized float to int16 conversion with saturation to 16 bits (ssat #16).
+ * Assumes that len is positive and a multiple of 8, that the destination + * buffer is at least 4-byte aligned (8-byte alignment is better for + * performance), and little-endian byte order + */ +@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) +function ff_float_to_int16_vfp, export=1 + push {r4-r8,lr} + vpush {d8-d11} + vldmia r1!, {s16-s23} + vcvt.s32.f32 s0, s16 + vcvt.s32.f32 s1, s17 + vcvt.s32.f32 s2, s18 + vcvt.s32.f32 s3, s19 + vcvt.s32.f32 s4, s20 + vcvt.s32.f32 s5, s21 + vcvt.s32.f32 s6, s22 + vcvt.s32.f32 s7, s23 +1: + subs r2, r2, #8 + vmov r3, r4, s0, s1 + vmov r5, r6, s2, s3 + vmov r7, r8, s4, s5 + vmov ip, lr, s6, s7 + vldmiagt r1!, {s16-s23} + ssat r4, #16, r4 + ssat r3, #16, r3 + ssat r6, #16, r6 + ssat r5, #16, r5 + pkhbt r3, r3, r4, lsl #16 /* low half = first sample, high half = second sample */ + pkhbt r4, r5, r6, lsl #16 + vcvtgt.s32.f32 s0, s16 + vcvtgt.s32.f32 s1, s17 + vcvtgt.s32.f32 s2, s18 + vcvtgt.s32.f32 s3, s19 + vcvtgt.s32.f32 s4, s20 + vcvtgt.s32.f32 s5, s21 + vcvtgt.s32.f32 s6, s22 + vcvtgt.s32.f32 s7, s23 + ssat r8, #16, r8 + ssat r7, #16, r7 + ssat lr, #16, lr + ssat ip, #16, ip + pkhbt r5, r7, r8, lsl #16 + pkhbt r6, ip, lr, lsl #16 + stmia r0!, {r3-r6} /* store 8 int16 samples as 4 packed words */ + bgt 1b + + vpop {d8-d11} + pop {r4-r8,pc} +endfunc +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/fft_init_arm.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/fft_init_arm.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/fft.h" +#include "libavcodec/synth_filter.h" + +void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); +void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); + +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); + +void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); + +void ff_synth_filter_float_neon(FFTContext *imdct, + float *synth_buf_ptr, int *synth_buf_offset, + float synth_buf2[32], const float window[512], + float out[32], const float in[32], + float scale, float bias); + +av_cold void ff_fft_init_arm(FFTContext *s) /* points the FFT/IMDCT/MDCT callbacks of s at the NEON routines when HAVE_NEON is non-zero */ +{ + if (HAVE_NEON) { + s->fft_permute = ff_fft_permute_neon; + s->fft_calc = ff_fft_calc_neon; + s->imdct_calc = ff_imdct_calc_neon; + s->imdct_half = ff_imdct_half_neon; + s->mdct_calc = ff_mdct_calc_neon; + s->permutation = FF_MDCT_PERM_INTERLEAVE; /* set together with the NEON callbacks above */ + } +} + +#if CONFIG_RDFT /* ff_rdft_init_arm() is compiled only when CONFIG_RDFT is non-zero */ +av_cold void ff_rdft_init_arm(RDFTContext *s) +{ + if (HAVE_NEON) + s->rdft_calc = ff_rdft_calc_neon; +} +#endif /* CONFIG_RDFT */ + +#if CONFIG_DCA_DECODER /* ff_synth_filter_init_arm() is compiled only when CONFIG_DCA_DECODER is non-zero */ +av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) +{ + if (HAVE_NEON) + s->synth_filter_float = ff_synth_filter_float_neon; +} +#endif /* CONFIG_DCA_DECODER */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/fft_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/fft_neon.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,371 @@ +/* + * ARM NEON optimised FFT + * + * Copyright (c) 2009 Mans Rullgard + * Copyright (c) 2009 Naotoshi Nojiri + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +#define M_SQRT1_2 0.70710678118654752440 + + .text + +function fft4_neon + vld1.32 {d0-d3}, [r0,:128] + + vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2 + vsub.f32 d6, d0, d1 @ r0-r1,i0-i1 + vsub.f32 d7, d16, d17 @ r3-r2,i2-i3 + vadd.f32 d4, d0, d1 @ r0+r1,i0+i1 + vadd.f32 d5, d2, d3 @ i2+i3,r2+r3 + vadd.f32 d1, d6, d7 + vsub.f32 d3, d6, d7 + vadd.f32 d0, d4, d5 + vsub.f32 d2, d4, d5 + + vst1.32 {d0-d3}, [r0,:128] + + bx lr +endfunc + +function fft8_neon + mov r1, r0 + vld1.32 {d0-d3}, [r1,:128]! 
+ vld1.32 {d16-d19}, [r1,:128] + + movw r2, #0x04f3 @ sqrt(1/2) + movt r2, #0x3f35 + eor r3, r2, #1<<31 + vdup.32 d31, r2 + + vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2 + vadd.f32 d4, d16, d17 @ r4+r5,i4+i5 + vmov d28, r3, r2 + vadd.f32 d5, d18, d19 @ r6+r7,i6+i7 + vsub.f32 d17, d16, d17 @ r4-r5,i4-i5 + vsub.f32 d19, d18, d19 @ r6-r7,i6-i7 + vrev64.32 d29, d28 + vadd.f32 d20, d0, d1 @ r0+r1,i0+i1 + vadd.f32 d21, d2, d3 @ r2+r3,i2+i3 + vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w + vext.32 q3, q2, q2, #1 + vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w + vsub.f32 d23, d22, d23 @ i2-i3,r3-r2 + vsub.f32 d22, d0, d1 @ r0-r1,i0-i1 + vmul.f32 d24, d17, d31 @ a2r*w,a2i*w + vmul.f32 d25, d19, d31 @ a3r*w,a3i*w + vadd.f32 d0, d20, d21 + vsub.f32 d2, d20, d21 + vadd.f32 d1, d22, d23 + vrev64.32 q13, q13 + vsub.f32 d3, d22, d23 + vsub.f32 d6, d6, d7 + vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2 + vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6 + vadd.f32 d7, d4, d5 + vsub.f32 d18, d2, d6 + vext.32 q13, q12, q12, #1 + vadd.f32 d2, d2, d6 + vsub.f32 d16, d0, d7 + vadd.f32 d5, d25, d24 + vsub.f32 d4, d26, d27 + vadd.f32 d0, d0, d7 + vsub.f32 d17, d1, d5 + vsub.f32 d19, d3, d4 + vadd.f32 d3, d3, d4 + vadd.f32 d1, d1, d5 + + vst1.32 {d16-d19}, [r1,:128] + vst1.32 {d0-d3}, [r0,:128] + + bx lr +endfunc + +function fft16_neon + movrel r1, mppm + vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3} + pld [r0, #32] + vld1.32 {d2-d3}, [r1,:128] + vext.32 q13, q9, q9, #1 + vld1.32 {d22-d25}, [r0,:128]! 
@ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7} + vadd.f32 d4, d16, d17 + vsub.f32 d5, d16, d17 + vadd.f32 d18, d18, d19 + vsub.f32 d19, d26, d27 + + vadd.f32 d20, d22, d23 + vsub.f32 d22, d22, d23 + vsub.f32 d23, d24, d25 + vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1} + vadd.f32 d21, d24, d25 + vmul.f32 d24, d22, d2 + vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3} + vmul.f32 d25, d23, d3 + vuzp.32 d16, d17 @ {r0,r1,i0,i1} + vmul.f32 q1, q11, d2[1] + vuzp.32 d18, d19 @ {r2,r3,i2,i3} + vrev64.32 q12, q12 + vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6} + vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11} + vzip.32 q10, q11 + vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15} + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + sub r0, r0, #96 + vext.32 q13, q13, q13, #1 + vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5} + vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} + vext.32 q15, q15, q15, #1 + vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7} + vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10} + vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3} + vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14} + vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6} + vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a} + movrel r2, X(ff_cos_16) + vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8} + vrev64.32 d1, d1 + vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a} + vrev64.32 d3, d3 + movrel r3, pmmp + vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8} + vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a} + vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9} + vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13} + vld1.32 {d4-d5}, [r2,:64] + vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11} + vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15} + vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13} + vld1.32 {d6-d7}, [r3,:128] + vrev64.32 q1, q14 + vmul.f32 q14, q14, d4[1] + vmul.f32 q1, q1, q3 + vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a} + vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15} + vzip.32 q12, q14 + vadd.f32 d0, d28, d24 + vadd.f32 
d1, d25, d29 + vsub.f32 d2, d25, d29 + vsub.f32 d3, d28, d24 + vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9} + vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} + vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13} + mov r1, #32 + vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5} + vrev64.32 q0, q13 + vmul.f32 q13, q13, d5[0] + vrev64.32 q1, q15 + vmul.f32 q15, q15, d5[1] + vst2.32 {d16-d17},[r0,:128], r1 + vmul.f32 q0, q0, q3 + vst2.32 {d20-d21},[r0,:128], r1 + vmul.f32 q1, q1, q3 + vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6} + vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a} + vst2.32 {d24-d25},[r0,:128], r1 + vst2.32 {d28-d29},[r0,:128] + vzip.32 q13, q15 + sub r0, r0, #80 + vadd.f32 d0, d30, d26 + vadd.f32 d1, d27, d31 + vsub.f32 d2, d27, d31 + vsub.f32 d3, d30, d26 + vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11} + vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3} + vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15} + vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7} + vst2.32 {d18-d19},[r0,:128], r1 + vst2.32 {d22-d23},[r0,:128], r1 + vst2.32 {d26-d27},[r0,:128], r1 + vst2.32 {d30-d31},[r0,:128] + bx lr +endfunc + +function fft_pass_neon + push {r4-r6,lr} + mov r6, r2 @ n + lsl r5, r2, #3 @ 2 * n * sizeof FFTSample + lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex + lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex + add r3, r2, r4 + add r4, r4, r0 @ &z[o1] + add r2, r2, r0 @ &z[o2] + add r3, r3, r0 @ &z[o3] + vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} + movrel r12, pmmp + vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} + add r5, r5, r1 @ wim + vld1.32 {d6-d7}, [r12,:128] @ pmmp + vswp d21, d22 + vld1.32 {d4}, [r1,:64]! 
@ {wre[0],wre[1]} + sub r5, r5, #4 @ wim-- + vrev64.32 q1, q11 + vmul.f32 q11, q11, d4[1] + vmul.f32 q1, q1, q3 + vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1] + vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} + vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} + sub r6, r6, #1 @ n-- + vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} + vzip.32 q10, q11 + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + vsub.f32 q10, q8, q0 + vadd.f32 q8, q8, q0 + vsub.f32 q11, q9, q1 + vadd.f32 q9, q9, q1 + vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]} + vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]} + vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]} + vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]} + sub r5, r5, #8 @ wim -= 2 +1: + vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} + vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} + vswp d21, d22 + vld1.32 {d4}, [r1]! @ {wre[0],wre[1]} + vrev64.32 q0, q10 + vmul.f32 q10, q10, d4[0] + vrev64.32 q1, q11 + vmul.f32 q11, q11, d4[1] + vld1.32 {d5}, [r5] @ {wim[-1],wim[0]} + vmul.f32 q0, q0, q3 + sub r5, r5, #8 @ wim -= 2 + vmul.f32 q1, q1, q3 + vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6} + vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} + vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} + subs r6, r6, #1 @ n-- + vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} + vzip.32 q10, q11 + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + vsub.f32 q10, q8, q0 + vadd.f32 q8, q8, q0 + vsub.f32 q11, q9, q1 + vadd.f32 q9, q9, q1 + vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]} + vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]} + vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]} + vst2.32 {d18-d19}, [r4,:128]! 
@ {z[o1],z[o1+1]} + bne 1b + + pop {r4-r6,pc} +endfunc + +.macro def_fft n, n2, n4 + .align 6 +function fft\n\()_neon + push {r4, lr} + mov r4, r0 + bl fft\n2\()_neon + add r0, r4, #\n4*2*8 + bl fft\n4\()_neon + add r0, r4, #\n4*3*8 + bl fft\n4\()_neon + mov r0, r4 + pop {r4, lr} + movrel r1, X(ff_cos_\n) + mov r2, #\n4/2 + b fft_pass_neon +endfunc +.endm + + def_fft 32, 16, 8 + def_fft 64, 32, 16 + def_fft 128, 64, 32 + def_fft 256, 128, 64 + def_fft 512, 256, 128 + def_fft 1024, 512, 256 + def_fft 2048, 1024, 512 + def_fft 4096, 2048, 1024 + def_fft 8192, 4096, 2048 + def_fft 16384, 8192, 4096 + def_fft 32768, 16384, 8192 + def_fft 65536, 32768, 16384 + +function ff_fft_calc_neon, export=1 + ldr r2, [r0] + sub r2, r2, #2 + movrel r3, fft_tab_neon + ldr r3, [r3, r2, lsl #2] + mov r0, r1 + bx r3 +endfunc + +function ff_fft_permute_neon, export=1 + push {r4,lr} + mov r12, #1 + ldr r2, [r0] @ nbits + ldr r3, [r0, #20] @ tmp_buf + ldr r0, [r0, #8] @ revtab + lsl r12, r12, r2 + mov r2, r12 +1: + vld1.32 {d0-d1}, [r1,:128]! + ldr r4, [r0], #4 + uxth lr, r4 + uxth r4, r4, ror #16 + add lr, r3, lr, lsl #3 + add r4, r3, r4, lsl #3 + vst1.32 {d0}, [lr,:64] + vst1.32 {d1}, [r4,:64] + subs r12, r12, #2 + bgt 1b + + sub r1, r1, r2, lsl #3 +1: + vld1.32 {d0-d3}, [r3,:128]! + vst1.32 {d0-d3}, [r1,:128]! + subs r2, r2, #4 + bgt 1b + + pop {r4,pc} +endfunc + + .section .rodata + .align 4 +fft_tab_neon: + .word fft4_neon + .word fft8_neon + .word fft16_neon + .word fft32_neon + .word fft64_neon + .word fft128_neon + .word fft256_neon + .word fft512_neon + .word fft1024_neon + .word fft2048_neon + .word fft4096_neon + .word fft8192_neon + .word fft16384_neon + .word fft32768_neon + .word fft65536_neon + .size fft_tab_neon, . 
- fft_tab_neon + + .align 4 +pmmp: .float +1.0, -1.0, -1.0, +1.0 +mppm: .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/h264dsp_init_arm.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/h264dsp_init_arm.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> /* uint8_t / int8_t used by the prototypes in this file */ + +#include "libavcodec/dsputil.h" +#include "libavcodec/h264dsp.h" + +void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); + +void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_8x8_neon(uint8_t 
*ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); +void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den, + int weight, int offset); + +void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); +void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride, + int log2_den, int weightd, int weights, + int offset); + +void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset, + DCTELEM *block, int stride, + const uint8_t nnzc[6*8]); +void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset, + DCTELEM *block, int stride, + const uint8_t nnzc[6*8]); +void 
ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset, + DCTELEM *block, int stride, + const uint8_t nnzc[6*8]); + +static void ff_h264dsp_init_neon(H264DSPContext *c) /* always defined, so the call in ff_h264dsp_init_arm() never relies on an implicit declaration */ +{ +#if HAVE_NEON /* body compiled only when HAVE_NEON is non-zero: no NEON symbol is referenced otherwise */ + c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; + c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; + + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon; + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon; + c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon; + c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon; + c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon; + c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon; + c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon; + c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon; + + c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; + c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; + c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon; + c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon; + c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon; + c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon; + c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon; + + c->h264_idct_add = ff_h264_idct_add_neon; + c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; + c->h264_idct_add16 = ff_h264_idct_add16_neon; + c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; + c->h264_idct_add8 = ff_h264_idct_add8_neon; +#endif /* HAVE_NEON */ +} + +void ff_h264dsp_init_arm(H264DSPContext *c) /* ARM entry point: installs the NEON routines into c when HAVE_NEON is non-zero */ +{ + if (HAVE_NEON) ff_h264dsp_init_neon(c); +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/h264dsp_neon.S --- /dev/null Thu Jan 01 00:00:00 
1970 +0000 +++ b/libavcodec/arm/h264dsp_neon.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,1883 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 + vtrn.32 \r0, \r4 + vtrn.32 \r1, \r5 + vtrn.32 \r2, \r6 + vtrn.32 \r3, \r7 + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.16 \r4, \r6 + vtrn.16 \r5, \r7 + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 + vtrn.8 \r4, \r5 + vtrn.8 \r6, \r7 + .endm + + .macro transpose_4x4 r0 r1 r2 r3 + vtrn.16 \r0, \r2 + vtrn.16 \r1, \r3 + vtrn.8 \r0, \r1 + vtrn.8 \r2, \r3 + .endm + + .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 + vswp \r0, \r4 + vswp \r1, \r5 + vswp \r2, \r6 + vswp \r3, \r7 + .endm + + .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + vtrn.32 \r4, \r6 + vtrn.32 \r5, \r7 + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 + vtrn.16 \r4, \r5 + vtrn.16 \r6, \r7 + .endm + +/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ + .macro h264_chroma_mc8 type +function ff_\type\()_h264_chroma_mc8_neon, export=1 + push {r4-r7, lr} + ldrd r4, [sp, #20] +.ifc \type,avg + mov lr, r0 +.endif + pld [r1] + pld [r1, r2] + + muls r7, r4, r5 + rsb r6, r7, r5, lsl #3 + rsb ip, r7, r4, lsl #3 + 
sub r4, r7, r4, lsl #3 + sub r4, r4, r5, lsl #3 + add r4, r4, #64 + + beq 2f + + add r5, r1, r2 + + vdup.8 d0, r4 + lsl r4, r2, #1 + vdup.8 d1, ip + vld1.64 {d4, d5}, [r1], r4 + vdup.8 d2, r6 + vld1.64 {d6, d7}, [r5], r4 + vdup.8 d3, r7 + + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + +1: pld [r5] + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 + vld1.64 {d4, d5}, [r1], r4 + vmlal.u8 q8, d6, d2 + vext.8 d5, d4, d5, #1 + vmlal.u8 q8, d7, d3 + vmull.u8 q9, d6, d0 + subs r3, r3, #2 + vmlal.u8 q9, d7, d1 + vmlal.u8 q9, d4, d2 + vmlal.u8 q9, d5, d3 + vrshrn.u16 d16, q8, #6 + vld1.64 {d6, d7}, [r5], r4 + pld [r1] + vrshrn.u16 d17, q9, #6 +.ifc \type,avg + vld1.64 {d20}, [lr,:64], r2 + vld1.64 {d21}, [lr,:64], r2 + vrhadd.u8 q8, q8, q10 +.endif + vext.8 d7, d6, d7, #1 + vst1.64 {d16}, [r0,:64], r2 + vst1.64 {d17}, [r0,:64], r2 + bgt 1b + + pop {r4-r7, pc} + +2: tst r6, r6 + add ip, ip, r6 + vdup.8 d0, r4 + vdup.8 d1, ip + + beq 4f + + add r5, r1, r2 + lsl r4, r2, #1 + vld1.64 {d4}, [r1], r4 + vld1.64 {d6}, [r5], r4 + +3: pld [r5] + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d6, d1 + vld1.64 {d4}, [r1], r4 + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d4, d1 + vld1.64 {d6}, [r5], r4 + vrshrn.u16 d16, q8, #6 + vrshrn.u16 d17, q9, #6 +.ifc \type,avg + vld1.64 {d20}, [lr,:64], r2 + vld1.64 {d21}, [lr,:64], r2 + vrhadd.u8 q8, q8, q10 +.endif + subs r3, r3, #2 + pld [r1] + vst1.64 {d16}, [r0,:64], r2 + vst1.64 {d17}, [r0,:64], r2 + bgt 3b + + pop {r4-r7, pc} + +4: vld1.64 {d4, d5}, [r1], r2 + vld1.64 {d6, d7}, [r1], r2 + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + +5: pld [r1] + subs r3, r3, #2 + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 + vld1.64 {d4, d5}, [r1], r2 + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d7, d1 + pld [r1] + vext.8 d5, d4, d5, #1 + vrshrn.u16 d16, q8, #6 + vrshrn.u16 d17, q9, #6 +.ifc \type,avg + vld1.64 {d20}, [lr,:64], r2 + vld1.64 {d21}, [lr,:64], r2 + vrhadd.u8 q8, q8, q10 +.endif + vld1.64 {d6, d7}, [r1], r2 + vext.8 d7, d6, d7, #1 + vst1.64 {d16}, [r0,:64], r2 + vst1.64 
{d17}, [r0,:64], r2 + bgt 5b + + pop {r4-r7, pc} +endfunc + .endm + +/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ + .macro h264_chroma_mc4 type +function ff_\type\()_h264_chroma_mc4_neon, export=1 + push {r4-r7, lr} + ldrd r4, [sp, #20] +.ifc \type,avg + mov lr, r0 +.endif + pld [r1] + pld [r1, r2] + + muls r7, r4, r5 + rsb r6, r7, r5, lsl #3 + rsb ip, r7, r4, lsl #3 + sub r4, r7, r4, lsl #3 + sub r4, r4, r5, lsl #3 + add r4, r4, #64 + + beq 2f + + add r5, r1, r2 + + vdup.8 d0, r4 + lsl r4, r2, #1 + vdup.8 d1, ip + vld1.64 {d4}, [r1], r4 + vdup.8 d2, r6 + vld1.64 {d6}, [r5], r4 + vdup.8 d3, r7 + + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vtrn.32 d4, d5 + vtrn.32 d6, d7 + + vtrn.32 d0, d1 + vtrn.32 d2, d3 + +1: pld [r5] + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d6, d2 + vld1.64 {d4}, [r1], r4 + vext.8 d5, d4, d5, #1 + vtrn.32 d4, d5 + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d4, d2 + vld1.64 {d6}, [r5], r4 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + vrshrn.u16 d16, q8, #6 + subs r3, r3, #2 + pld [r1] +.ifc \type,avg + vld1.32 {d20[0]}, [lr,:32], r2 + vld1.32 {d20[1]}, [lr,:32], r2 + vrhadd.u8 d16, d16, d20 +.endif + vext.8 d7, d6, d7, #1 + vtrn.32 d6, d7 + vst1.32 {d16[0]}, [r0,:32], r2 + vst1.32 {d16[1]}, [r0,:32], r2 + bgt 1b + + pop {r4-r7, pc} + +2: tst r6, r6 + add ip, ip, r6 + vdup.8 d0, r4 + vdup.8 d1, ip + vtrn.32 d0, d1 + + beq 4f + + vext.32 d1, d0, d1, #1 + add r5, r1, r2 + lsl r4, r2, #1 + vld1.32 {d4[0]}, [r1], r4 + vld1.32 {d4[1]}, [r5], r4 + +3: pld [r5] + vmull.u8 q8, d4, d0 + vld1.32 {d4[0]}, [r1], r4 + vmull.u8 q9, d4, d1 + vld1.32 {d4[1]}, [r5], r4 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + vrshrn.u16 d16, q8, #6 +.ifc \type,avg + vld1.32 {d20[0]}, [lr,:32], r2 + vld1.32 {d20[1]}, [lr,:32], r2 + vrhadd.u8 d16, d16, d20 +.endif + subs r3, r3, #2 + pld [r1] + vst1.32 {d16[0]}, [r0,:32], r2 + vst1.32 {d16[1]}, [r0,:32], r2 + bgt 3b + + pop {r4-r7, pc} + +4: vld1.64 {d4}, [r1], r2 + vld1.64 {d6}, [r1], r2 
+ vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vtrn.32 d4, d5 + vtrn.32 d6, d7 + +5: vmull.u8 q8, d4, d0 + vmull.u8 q9, d6, d0 + subs r3, r3, #2 + vld1.64 {d4}, [r1], r2 + vext.8 d5, d4, d5, #1 + vtrn.32 d4, d5 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + pld [r1] + vrshrn.u16 d16, q8, #6 +.ifc \type,avg + vld1.32 {d20[0]}, [lr,:32], r2 + vld1.32 {d20[1]}, [lr,:32], r2 + vrhadd.u8 d16, d16, d20 +.endif + vld1.64 {d6}, [r1], r2 + vext.8 d7, d6, d7, #1 + vtrn.32 d6, d7 + pld [r1] + vst1.32 {d16[0]}, [r0,:32], r2 + vst1.32 {d16[1]}, [r0,:32], r2 + bgt 5b + + pop {r4-r7, pc} +endfunc + .endm + + .macro h264_chroma_mc2 type +function ff_\type\()_h264_chroma_mc2_neon, export=1 + push {r4-r6, lr} + ldr r4, [sp, #16] + ldr lr, [sp, #20] + pld [r1] + pld [r1, r2] + orrs r5, r4, lr + beq 2f + + mul r5, r4, lr + rsb r6, r5, lr, lsl #3 + rsb r12, r5, r4, lsl #3 + sub r4, r5, r4, lsl #3 + sub r4, r4, lr, lsl #3 + add r4, r4, #64 + vdup.8 d0, r4 + vdup.8 d2, r12 + vdup.8 d1, r6 + vdup.8 d3, r5 + vtrn.16 q0, q1 +1: + vld1.32 {d4[0]}, [r1], r2 + vld1.32 {d4[1]}, [r1], r2 + vrev64.32 d5, d4 + vld1.32 {d5[1]}, [r1] + vext.8 q3, q2, q2, #1 + vtrn.16 q2, q3 + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 +.ifc \type,avg + vld1.16 {d18[0]}, [r0,:16], r2 + vld1.16 {d18[1]}, [r0,:16] + sub r0, r0, r2 +.endif + vtrn.32 d16, d17 + vadd.i16 d16, d16, d17 + vrshrn.u16 d16, q8, #6 +.ifc \type,avg + vrhadd.u8 d16, d16, d18 +.endif + vst1.16 {d16[0]}, [r0,:16], r2 + vst1.16 {d16[1]}, [r0,:16], r2 + subs r3, r3, #2 + bgt 1b + pop {r4-r6, pc} +2: +.ifc \type,put + ldrh r5, [r1], r2 + strh r5, [r0], r2 + ldrh r6, [r1], r2 + strh r6, [r0], r2 +.else + vld1.16 {d16[0]}, [r1], r2 + vld1.16 {d16[1]}, [r1], r2 + vld1.16 {d18[0]}, [r0,:16], r2 + vld1.16 {d18[1]}, [r0,:16] + sub r0, r0, r2 + vrhadd.u8 d16, d16, d18 + vst1.16 {d16[0]}, [r0,:16], r2 + vst1.16 {d16[1]}, [r0,:16], r2 +.endif + subs r3, r3, #2 + bgt 2b + pop {r4-r6, pc} +endfunc +.endm + + .text + .align + + h264_chroma_mc8 put + 
h264_chroma_mc8 avg + h264_chroma_mc4 put + h264_chroma_mc4 avg + h264_chroma_mc2 put + h264_chroma_mc2 avg + + /* H.264 loop filter */ + + .macro h264_loop_filter_start + ldr ip, [sp] + tst r2, r2 + ldr ip, [ip] + tstne r3, r3 + vmov.32 d24[0], ip + and ip, ip, ip, lsl #16 + bxeq lr + ands ip, ip, ip, lsl #8 + bxlt lr + .endm + + .macro align_push_regs + and ip, sp, #15 + add ip, ip, #32 + sub sp, sp, ip + vst1.64 {d12-d15}, [sp,:128] + sub sp, sp, #32 + vst1.64 {d8-d11}, [sp,:128] + .endm + + .macro align_pop_regs + vld1.64 {d8-d11}, [sp,:128]! + vld1.64 {d12-d15}, [sp,:128], ip + .endm + + .macro h264_loop_filter_luma + vdup.8 q11, r2 @ alpha + vmovl.u8 q12, d24 + vabd.u8 q6, q8, q0 @ abs(p0 - q0) + vmovl.u16 q12, d24 + vabd.u8 q14, q9, q8 @ abs(p1 - p0) + vsli.16 q12, q12, #8 + vabd.u8 q15, q1, q0 @ abs(q1 - q0) + vsli.32 q12, q12, #16 + vclt.u8 q6, q6, q11 @ < alpha + vdup.8 q11, r3 @ beta + vclt.s8 q7, q12, #0 + vclt.u8 q14, q14, q11 @ < beta + vclt.u8 q15, q15, q11 @ < beta + vbic q6, q6, q7 + vabd.u8 q4, q10, q8 @ abs(p2 - p0) + vand q6, q6, q14 + vabd.u8 q5, q2, q0 @ abs(q2 - q0) + vclt.u8 q4, q4, q11 @ < beta + vand q6, q6, q15 + vclt.u8 q5, q5, q11 @ < beta + vand q4, q4, q6 + vand q5, q5, q6 + vand q12, q12, q6 + vrhadd.u8 q14, q8, q0 + vsub.i8 q6, q12, q4 + vqadd.u8 q7, q9, q12 + vhadd.u8 q10, q10, q14 + vsub.i8 q6, q6, q5 + vhadd.u8 q14, q2, q14 + vmin.u8 q7, q7, q10 + vqsub.u8 q11, q9, q12 + vqadd.u8 q2, q1, q12 + vmax.u8 q7, q7, q11 + vqsub.u8 q11, q1, q12 + vmin.u8 q14, q2, q14 + vmovl.u8 q2, d0 + vmax.u8 q14, q14, q11 + vmovl.u8 q10, d1 + vsubw.u8 q2, q2, d16 + vsubw.u8 q10, q10, d17 + vshl.i16 q2, q2, #2 + vshl.i16 q10, q10, #2 + vaddw.u8 q2, q2, d18 + vaddw.u8 q10, q10, d19 + vsubw.u8 q2, q2, d2 + vsubw.u8 q10, q10, d3 + vrshrn.i16 d4, q2, #3 + vrshrn.i16 d5, q10, #3 + vbsl q4, q7, q9 + vbsl q5, q14, q1 + vneg.s8 q7, q6 + vmovl.u8 q14, d16 + vmin.s8 q2, q2, q6 + vmovl.u8 q6, d17 + vmax.s8 q2, q2, q7 + vmovl.u8 q11, d0 + vmovl.u8 q12, d1 + 
vaddw.s8 q14, q14, d4 + vaddw.s8 q6, q6, d5 + vsubw.s8 q11, q11, d4 + vsubw.s8 q12, q12, d5 + vqmovun.s16 d16, q14 + vqmovun.s16 d17, q6 + vqmovun.s16 d0, q11 + vqmovun.s16 d1, q12 + .endm + +function ff_h264_v_loop_filter_luma_neon, export=1 + h264_loop_filter_start + + vld1.64 {d0, d1}, [r0,:128], r1 + vld1.64 {d2, d3}, [r0,:128], r1 + vld1.64 {d4, d5}, [r0,:128], r1 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + vld1.64 {d20,d21}, [r0,:128], r1 + vld1.64 {d18,d19}, [r0,:128], r1 + vld1.64 {d16,d17}, [r0,:128], r1 + + align_push_regs + + h264_loop_filter_luma + + sub r0, r0, r1, lsl #1 + vst1.64 {d8, d9}, [r0,:128], r1 + vst1.64 {d16,d17}, [r0,:128], r1 + vst1.64 {d0, d1}, [r0,:128], r1 + vst1.64 {d10,d11}, [r0,:128] + + align_pop_regs + bx lr +endfunc + +function ff_h264_h_loop_filter_luma_neon, export=1 + h264_loop_filter_start + + sub r0, r0, #4 + vld1.64 {d6}, [r0], r1 + vld1.64 {d20}, [r0], r1 + vld1.64 {d18}, [r0], r1 + vld1.64 {d16}, [r0], r1 + vld1.64 {d0}, [r0], r1 + vld1.64 {d2}, [r0], r1 + vld1.64 {d4}, [r0], r1 + vld1.64 {d26}, [r0], r1 + vld1.64 {d7}, [r0], r1 + vld1.64 {d21}, [r0], r1 + vld1.64 {d19}, [r0], r1 + vld1.64 {d17}, [r0], r1 + vld1.64 {d1}, [r0], r1 + vld1.64 {d3}, [r0], r1 + vld1.64 {d5}, [r0], r1 + vld1.64 {d27}, [r0], r1 + + transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 + + align_push_regs + + h264_loop_filter_luma + + transpose_4x4 q4, q8, q0, q5 + + sub r0, r0, r1, lsl #4 + add r0, r0, #2 + vst1.32 {d8[0]}, [r0], r1 + vst1.32 {d16[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d10[0]}, [r0], r1 + vst1.32 {d8[1]}, [r0], r1 + vst1.32 {d16[1]}, [r0], r1 + vst1.32 {d0[1]}, [r0], r1 + vst1.32 {d10[1]}, [r0], r1 + vst1.32 {d9[0]}, [r0], r1 + vst1.32 {d17[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + vst1.32 {d11[0]}, [r0], r1 + vst1.32 {d9[1]}, [r0], r1 + vst1.32 {d17[1]}, [r0], r1 + vst1.32 {d1[1]}, [r0], r1 + vst1.32 {d11[1]}, [r0], r1 + + align_pop_regs + bx lr +endfunc + + .macro h264_loop_filter_chroma + vdup.8 d22, r2 @ 
alpha + vmovl.u8 q12, d24 + vabd.u8 d26, d16, d0 @ abs(p0 - q0) + vmovl.u8 q2, d0 + vabd.u8 d28, d18, d16 @ abs(p1 - p0) + vsubw.u8 q2, q2, d16 + vsli.16 d24, d24, #8 + vshl.i16 q2, q2, #2 + vabd.u8 d30, d2, d0 @ abs(q1 - q0) + vaddw.u8 q2, q2, d18 + vclt.u8 d26, d26, d22 @ < alpha + vsubw.u8 q2, q2, d2 + vdup.8 d22, r3 @ beta + vclt.s8 d25, d24, #0 + vrshrn.i16 d4, q2, #3 + vclt.u8 d28, d28, d22 @ < beta + vbic d26, d26, d25 + vclt.u8 d30, d30, d22 @ < beta + vand d26, d26, d28 + vneg.s8 d25, d24 + vand d26, d26, d30 + vmin.s8 d4, d4, d24 + vmovl.u8 q14, d16 + vand d4, d4, d26 + vmax.s8 d4, d4, d25 + vmovl.u8 q11, d0 + vaddw.s8 q14, q14, d4 + vsubw.s8 q11, q11, d4 + vqmovun.s16 d16, q14 + vqmovun.s16 d0, q11 + .endm + +function ff_h264_v_loop_filter_chroma_neon, export=1 + h264_loop_filter_start + + sub r0, r0, r1, lsl #1 + vld1.64 {d18}, [r0,:64], r1 + vld1.64 {d16}, [r0,:64], r1 + vld1.64 {d0}, [r0,:64], r1 + vld1.64 {d2}, [r0,:64] + + h264_loop_filter_chroma + + sub r0, r0, r1, lsl #1 + vst1.64 {d16}, [r0,:64], r1 + vst1.64 {d0}, [r0,:64], r1 + + bx lr +endfunc + +function ff_h264_h_loop_filter_chroma_neon, export=1 + h264_loop_filter_start + + sub r0, r0, #2 + vld1.32 {d18[0]}, [r0], r1 + vld1.32 {d16[0]}, [r0], r1 + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d2[0]}, [r0], r1 + vld1.32 {d18[1]}, [r0], r1 + vld1.32 {d16[1]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d2[1]}, [r0], r1 + + vtrn.16 d18, d0 + vtrn.16 d16, d2 + vtrn.8 d18, d16 + vtrn.8 d0, d2 + + h264_loop_filter_chroma + + vtrn.16 d18, d0 + vtrn.16 d16, d2 + vtrn.8 d18, d16 + vtrn.8 d0, d2 + + sub r0, r0, r1, lsl #3 + vst1.32 {d18[0]}, [r0], r1 + vst1.32 {d16[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d2[0]}, [r0], r1 + vst1.32 {d18[1]}, [r0], r1 + vst1.32 {d16[1]}, [r0], r1 + vst1.32 {d0[1]}, [r0], r1 + vst1.32 {d2[1]}, [r0], r1 + + bx lr +endfunc + + /* H.264 qpel MC */ + + .macro lowpass_const r + movw \r, #5 + movt \r, #20 + vmov.32 d6[0], \r + .endm + + .macro lowpass_8 r0, r1, r2, 
r3, d0, d1, narrow=1 +.if \narrow + t0 .req q0 + t1 .req q8 +.else + t0 .req \d0 + t1 .req \d1 +.endif + vext.8 d2, \r0, \r1, #2 + vext.8 d3, \r0, \r1, #3 + vaddl.u8 q1, d2, d3 + vext.8 d4, \r0, \r1, #1 + vext.8 d5, \r0, \r1, #4 + vaddl.u8 q2, d4, d5 + vext.8 d30, \r0, \r1, #5 + vaddl.u8 t0, \r0, d30 + vext.8 d18, \r2, \r3, #2 + vmla.i16 t0, q1, d6[1] + vext.8 d19, \r2, \r3, #3 + vaddl.u8 q9, d18, d19 + vext.8 d20, \r2, \r3, #1 + vmls.i16 t0, q2, d6[0] + vext.8 d21, \r2, \r3, #4 + vaddl.u8 q10, d20, d21 + vext.8 d31, \r2, \r3, #5 + vaddl.u8 t1, \r2, d31 + vmla.i16 t1, q9, d6[1] + vmls.i16 t1, q10, d6[0] +.if \narrow + vqrshrun.s16 \d0, t0, #5 + vqrshrun.s16 \d1, t1, #5 +.endif + .unreq t0 + .unreq t1 + .endm + + .macro lowpass_8_1 r0, r1, d0, narrow=1 +.if \narrow + t0 .req q0 +.else + t0 .req \d0 +.endif + vext.8 d2, \r0, \r1, #2 + vext.8 d3, \r0, \r1, #3 + vaddl.u8 q1, d2, d3 + vext.8 d4, \r0, \r1, #1 + vext.8 d5, \r0, \r1, #4 + vaddl.u8 q2, d4, d5 + vext.8 d30, \r0, \r1, #5 + vaddl.u8 t0, \r0, d30 + vmla.i16 t0, q1, d6[1] + vmls.i16 t0, q2, d6[0] +.if \narrow + vqrshrun.s16 \d0, t0, #5 +.endif + .unreq t0 + .endm + + .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d + vext.16 q1, \r0, \r1, #2 + vext.16 q0, \r0, \r1, #3 + vaddl.s16 q9, d2, d0 + vext.16 q2, \r0, \r1, #1 + vaddl.s16 q1, d3, d1 + vext.16 q3, \r0, \r1, #4 + vaddl.s16 q10, d4, d6 + vext.16 \r1, \r0, \r1, #5 + vaddl.s16 q2, d5, d7 + vaddl.s16 q0, \h0, \h1 + vaddl.s16 q8, \l0, \l1 + + vshl.i32 q3, q9, #4 + vshl.i32 q9, q9, #2 + vshl.i32 q15, q10, #2 + vadd.i32 q9, q9, q3 + vadd.i32 q10, q10, q15 + + vshl.i32 q3, q1, #4 + vshl.i32 q1, q1, #2 + vshl.i32 q15, q2, #2 + vadd.i32 q1, q1, q3 + vadd.i32 q2, q2, q15 + + vadd.i32 q9, q9, q8 + vsub.i32 q9, q9, q10 + + vadd.i32 q1, q1, q0 + vsub.i32 q1, q1, q2 + + vrshrn.s32 d18, q9, #10 + vrshrn.s32 d19, q1, #10 + + vqmovun.s16 \d, q9 + .endm + +function put_h264_qpel16_h_lowpass_neon_packed + mov r4, lr + mov ip, #16 + mov r3, #8 + bl put_h264_qpel8_h_lowpass_neon + 
sub r1, r1, r2, lsl #4 + add r1, r1, #8 + mov ip, #16 + mov lr, r4 + b put_h264_qpel8_h_lowpass_neon +endfunc + + .macro h264_qpel_h_lowpass type +function \type\()_h264_qpel16_h_lowpass_neon + push {lr} + mov ip, #16 + bl \type\()_h264_qpel8_h_lowpass_neon + sub r0, r0, r3, lsl #4 + sub r1, r1, r2, lsl #4 + add r0, r0, #8 + add r1, r1, #8 + mov ip, #16 + pop {lr} +endfunc + +function \type\()_h264_qpel8_h_lowpass_neon +1: vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d16,d17}, [r1], r2 + subs ip, ip, #2 + lowpass_8 d0, d1, d16, d17, d0, d16 +.ifc \type,avg + vld1.8 {d2}, [r0,:64], r3 + vrhadd.u8 d0, d0, d2 + vld1.8 {d3}, [r0,:64] + vrhadd.u8 d16, d16, d3 + sub r0, r0, r3 +.endif + vst1.64 {d0}, [r0,:64], r3 + vst1.64 {d16}, [r0,:64], r3 + bne 1b + bx lr +endfunc + .endm + + h264_qpel_h_lowpass put + h264_qpel_h_lowpass avg + + .macro h264_qpel_h_lowpass_l2 type +function \type\()_h264_qpel16_h_lowpass_l2_neon + push {lr} + mov ip, #16 + bl \type\()_h264_qpel8_h_lowpass_l2_neon + sub r0, r0, r2, lsl #4 + sub r1, r1, r2, lsl #4 + sub r3, r3, r2, lsl #4 + add r0, r0, #8 + add r1, r1, #8 + add r3, r3, #8 + mov ip, #16 + pop {lr} +endfunc + +function \type\()_h264_qpel8_h_lowpass_l2_neon +1: vld1.64 {d0, d1}, [r1], r2 + vld1.64 {d16,d17}, [r1], r2 + vld1.64 {d28}, [r3], r2 + vld1.64 {d29}, [r3], r2 + subs ip, ip, #2 + lowpass_8 d0, d1, d16, d17, d0, d1 + vrhadd.u8 q0, q0, q14 +.ifc \type,avg + vld1.8 {d2}, [r0,:64], r2 + vrhadd.u8 d0, d0, d2 + vld1.8 {d3}, [r0,:64] + vrhadd.u8 d1, d1, d3 + sub r0, r0, r2 +.endif + vst1.64 {d0}, [r0,:64], r2 + vst1.64 {d1}, [r0,:64], r2 + bne 1b + bx lr +endfunc + .endm + + h264_qpel_h_lowpass_l2 put + h264_qpel_h_lowpass_l2 avg + +function put_h264_qpel16_v_lowpass_neon_packed + mov r4, lr + mov r2, #8 + bl put_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + bl put_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + bl put_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + mov lr, r4 + b 
put_h264_qpel8_v_lowpass_neon +endfunc + + .macro h264_qpel_v_lowpass type +function \type\()_h264_qpel16_v_lowpass_neon + mov r4, lr + bl \type\()_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + bl \type\()_h264_qpel8_v_lowpass_neon + sub r0, r0, r2, lsl #4 + add r0, r0, #8 + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + bl \type\()_h264_qpel8_v_lowpass_neon + sub r1, r1, r3, lsl #2 + mov lr, r4 +endfunc + +function \type\()_h264_qpel8_v_lowpass_neon + vld1.64 {d8}, [r1], r3 + vld1.64 {d10}, [r1], r3 + vld1.64 {d12}, [r1], r3 + vld1.64 {d14}, [r1], r3 + vld1.64 {d22}, [r1], r3 + vld1.64 {d24}, [r1], r3 + vld1.64 {d26}, [r1], r3 + vld1.64 {d28}, [r1], r3 + vld1.64 {d9}, [r1], r3 + vld1.64 {d11}, [r1], r3 + vld1.64 {d13}, [r1], r3 + vld1.64 {d15}, [r1], r3 + vld1.64 {d23}, [r1] + + transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 + lowpass_8 d8, d9, d10, d11, d8, d10 + lowpass_8 d12, d13, d14, d15, d12, d14 + lowpass_8 d22, d23, d24, d25, d22, d24 + lowpass_8 d26, d27, d28, d29, d26, d28 + transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 + +.ifc \type,avg + vld1.8 {d9}, [r0,:64], r2 + vrhadd.u8 d8, d8, d9 + vld1.8 {d11}, [r0,:64], r2 + vrhadd.u8 d10, d10, d11 + vld1.8 {d13}, [r0,:64], r2 + vrhadd.u8 d12, d12, d13 + vld1.8 {d15}, [r0,:64], r2 + vrhadd.u8 d14, d14, d15 + vld1.8 {d23}, [r0,:64], r2 + vrhadd.u8 d22, d22, d23 + vld1.8 {d25}, [r0,:64], r2 + vrhadd.u8 d24, d24, d25 + vld1.8 {d27}, [r0,:64], r2 + vrhadd.u8 d26, d26, d27 + vld1.8 {d29}, [r0,:64], r2 + vrhadd.u8 d28, d28, d29 + sub r0, r0, r2, lsl #3 +.endif + + vst1.64 {d8}, [r0,:64], r2 + vst1.64 {d10}, [r0,:64], r2 + vst1.64 {d12}, [r0,:64], r2 + vst1.64 {d14}, [r0,:64], r2 + vst1.64 {d22}, [r0,:64], r2 + vst1.64 {d24}, [r0,:64], r2 + vst1.64 {d26}, [r0,:64], r2 + vst1.64 {d28}, [r0,:64], r2 + + bx lr +endfunc + .endm + + h264_qpel_v_lowpass put + h264_qpel_v_lowpass avg + + .macro h264_qpel_v_lowpass_l2 type +function \type\()_h264_qpel16_v_lowpass_l2_neon + mov r4, lr + bl 
\type\()_h264_qpel8_v_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + bl \type\()_h264_qpel8_v_lowpass_l2_neon + sub r0, r0, r3, lsl #4 + sub ip, ip, r2, lsl #4 + add r0, r0, #8 + add ip, ip, #8 + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + bl \type\()_h264_qpel8_v_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + mov lr, r4 +endfunc + +function \type\()_h264_qpel8_v_lowpass_l2_neon + vld1.64 {d8}, [r1], r3 + vld1.64 {d10}, [r1], r3 + vld1.64 {d12}, [r1], r3 + vld1.64 {d14}, [r1], r3 + vld1.64 {d22}, [r1], r3 + vld1.64 {d24}, [r1], r3 + vld1.64 {d26}, [r1], r3 + vld1.64 {d28}, [r1], r3 + vld1.64 {d9}, [r1], r3 + vld1.64 {d11}, [r1], r3 + vld1.64 {d13}, [r1], r3 + vld1.64 {d15}, [r1], r3 + vld1.64 {d23}, [r1] + + transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 + lowpass_8 d8, d9, d10, d11, d8, d9 + lowpass_8 d12, d13, d14, d15, d12, d13 + lowpass_8 d22, d23, d24, d25, d22, d23 + lowpass_8 d26, d27, d28, d29, d26, d27 + transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 + + vld1.64 {d0}, [ip], r2 + vld1.64 {d1}, [ip], r2 + vld1.64 {d2}, [ip], r2 + vld1.64 {d3}, [ip], r2 + vld1.64 {d4}, [ip], r2 + vrhadd.u8 q0, q0, q4 + vld1.64 {d5}, [ip], r2 + vrhadd.u8 q1, q1, q6 + vld1.64 {d10}, [ip], r2 + vrhadd.u8 q2, q2, q11 + vld1.64 {d11}, [ip], r2 + vrhadd.u8 q5, q5, q13 + +.ifc \type,avg + vld1.8 {d16}, [r0,:64], r3 + vrhadd.u8 d0, d0, d16 + vld1.8 {d17}, [r0,:64], r3 + vrhadd.u8 d1, d1, d17 + vld1.8 {d16}, [r0,:64], r3 + vrhadd.u8 d2, d2, d16 + vld1.8 {d17}, [r0,:64], r3 + vrhadd.u8 d3, d3, d17 + vld1.8 {d16}, [r0,:64], r3 + vrhadd.u8 d4, d4, d16 + vld1.8 {d17}, [r0,:64], r3 + vrhadd.u8 d5, d5, d17 + vld1.8 {d16}, [r0,:64], r3 + vrhadd.u8 d10, d10, d16 + vld1.8 {d17}, [r0,:64], r3 + vrhadd.u8 d11, d11, d17 + sub r0, r0, r3, lsl #3 +.endif + + vst1.64 {d0}, [r0,:64], r3 + vst1.64 {d1}, [r0,:64], r3 + vst1.64 {d2}, [r0,:64], r3 + vst1.64 {d3}, [r0,:64], r3 + vst1.64 {d4}, [r0,:64], r3 + vst1.64 {d5}, [r0,:64], r3 + vst1.64 {d10}, [r0,:64], r3 + vst1.64 {d11}, 
[r0,:64], r3 + + bx lr +endfunc + .endm + + h264_qpel_v_lowpass_l2 put + h264_qpel_v_lowpass_l2 avg + +function put_h264_qpel8_hv_lowpass_neon_top + lowpass_const ip + mov ip, #12 +1: vld1.64 {d0, d1}, [r1], r3 + vld1.64 {d16,d17}, [r1], r3 + subs ip, ip, #2 + lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 + vst1.64 {d22-d25}, [r4,:128]! + bne 1b + + vld1.64 {d0, d1}, [r1] + lowpass_8_1 d0, d1, q12, narrow=0 + + mov ip, #-16 + add r4, r4, ip + vld1.64 {d30,d31}, [r4,:128], ip + vld1.64 {d20,d21}, [r4,:128], ip + vld1.64 {d18,d19}, [r4,:128], ip + vld1.64 {d16,d17}, [r4,:128], ip + vld1.64 {d14,d15}, [r4,:128], ip + vld1.64 {d12,d13}, [r4,:128], ip + vld1.64 {d10,d11}, [r4,:128], ip + vld1.64 {d8, d9}, [r4,:128], ip + vld1.64 {d6, d7}, [r4,:128], ip + vld1.64 {d4, d5}, [r4,:128], ip + vld1.64 {d2, d3}, [r4,:128], ip + vld1.64 {d0, d1}, [r4,:128] + + swap4 d1, d3, d5, d7, d8, d10, d12, d14 + transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 + + swap4 d17, d19, d21, d31, d24, d26, d28, d22 + transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 + + vst1.64 {d30,d31}, [r4,:128]! + vst1.64 {d6, d7}, [r4,:128]! + vst1.64 {d20,d21}, [r4,:128]! + vst1.64 {d4, d5}, [r4,:128]! + vst1.64 {d18,d19}, [r4,:128]! + vst1.64 {d2, d3}, [r4,:128]! + vst1.64 {d16,d17}, [r4,:128]! 
+ vst1.64 {d0, d1}, [r4,:128] + + lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 + lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 + lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 + lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 + + vld1.64 {d16,d17}, [r4,:128], ip + vld1.64 {d30,d31}, [r4,:128], ip + lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 + vld1.64 {d16,d17}, [r4,:128], ip + vld1.64 {d30,d31}, [r4,:128], ip + lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 + vld1.64 {d16,d17}, [r4,:128], ip + vld1.64 {d30,d31}, [r4,:128], ip + lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 + vld1.64 {d16,d17}, [r4,:128], ip + vld1.64 {d30,d31}, [r4,:128] + lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 + + transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 + + bx lr +endfunc + + .macro h264_qpel8_hv_lowpass type +function \type\()_h264_qpel8_hv_lowpass_neon + mov r10, lr + bl put_h264_qpel8_hv_lowpass_neon_top +.ifc \type,avg + vld1.8 {d0}, [r0,:64], r2 + vrhadd.u8 d12, d12, d0 + vld1.8 {d1}, [r0,:64], r2 + vrhadd.u8 d13, d13, d1 + vld1.8 {d2}, [r0,:64], r2 + vrhadd.u8 d14, d14, d2 + vld1.8 {d3}, [r0,:64], r2 + vrhadd.u8 d15, d15, d3 + vld1.8 {d4}, [r0,:64], r2 + vrhadd.u8 d8, d8, d4 + vld1.8 {d5}, [r0,:64], r2 + vrhadd.u8 d9, d9, d5 + vld1.8 {d6}, [r0,:64], r2 + vrhadd.u8 d10, d10, d6 + vld1.8 {d7}, [r0,:64], r2 + vrhadd.u8 d11, d11, d7 + sub r0, r0, r2, lsl #3 +.endif + vst1.64 {d12}, [r0,:64], r2 + vst1.64 {d13}, [r0,:64], r2 + vst1.64 {d14}, [r0,:64], r2 + vst1.64 {d15}, [r0,:64], r2 + vst1.64 {d8}, [r0,:64], r2 + vst1.64 {d9}, [r0,:64], r2 + vst1.64 {d10}, [r0,:64], r2 + vst1.64 {d11}, [r0,:64], r2 + + mov lr, r10 + bx lr +endfunc + .endm + + h264_qpel8_hv_lowpass put + h264_qpel8_hv_lowpass avg + + .macro h264_qpel8_hv_lowpass_l2 type +function \type\()_h264_qpel8_hv_lowpass_l2_neon + mov r10, lr + bl put_h264_qpel8_hv_lowpass_neon_top + + vld1.64 {d0, d1}, [r2,:128]! + vld1.64 {d2, d3}, [r2,:128]! + vrhadd.u8 q0, q0, q6 + vld1.64 {d4, d5}, [r2,:128]! 
+ vrhadd.u8 q1, q1, q7 + vld1.64 {d6, d7}, [r2,:128]! + vrhadd.u8 q2, q2, q4 + vrhadd.u8 q3, q3, q5 +.ifc \type,avg + vld1.8 {d16}, [r0,:64], r3 + vrhadd.u8 d0, d0, d16 + vld1.8 {d17}, [r0,:64], r3 + vrhadd.u8 d1, d1, d17 + vld1.8 {d18}, [r0,:64], r3 + vrhadd.u8 d2, d2, d18 + vld1.8 {d19}, [r0,:64], r3 + vrhadd.u8 d3, d3, d19 + vld1.8 {d20}, [r0,:64], r3 + vrhadd.u8 d4, d4, d20 + vld1.8 {d21}, [r0,:64], r3 + vrhadd.u8 d5, d5, d21 + vld1.8 {d22}, [r0,:64], r3 + vrhadd.u8 d6, d6, d22 + vld1.8 {d23}, [r0,:64], r3 + vrhadd.u8 d7, d7, d23 + sub r0, r0, r3, lsl #3 +.endif + vst1.64 {d0}, [r0,:64], r3 + vst1.64 {d1}, [r0,:64], r3 + vst1.64 {d2}, [r0,:64], r3 + vst1.64 {d3}, [r0,:64], r3 + vst1.64 {d4}, [r0,:64], r3 + vst1.64 {d5}, [r0,:64], r3 + vst1.64 {d6}, [r0,:64], r3 + vst1.64 {d7}, [r0,:64], r3 + + mov lr, r10 + bx lr +endfunc + .endm + + h264_qpel8_hv_lowpass_l2 put + h264_qpel8_hv_lowpass_l2 avg + + .macro h264_qpel16_hv type +function \type\()_h264_qpel16_hv_lowpass_neon + mov r9, lr + bl \type\()_h264_qpel8_hv_lowpass_neon + sub r1, r1, r3, lsl #2 + bl \type\()_h264_qpel8_hv_lowpass_neon + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + sub r0, r0, r2, lsl #4 + add r0, r0, #8 + bl \type\()_h264_qpel8_hv_lowpass_neon + sub r1, r1, r3, lsl #2 + mov lr, r9 + b \type\()_h264_qpel8_hv_lowpass_neon +endfunc + +function \type\()_h264_qpel16_hv_lowpass_l2_neon + mov r9, lr + sub r2, r4, #256 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + sub r1, r1, r3, lsl #4 + sub r1, r1, r3, lsl #2 + add r1, r1, #8 + sub r0, r0, r3, lsl #4 + add r0, r0, #8 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + sub r1, r1, r3, lsl #2 + mov lr, r9 + b \type\()_h264_qpel8_hv_lowpass_l2_neon +endfunc + .endm + + h264_qpel16_hv put + h264_qpel16_hv avg + + .macro h264_qpel8 type +function ff_\type\()_h264_qpel8_mc10_neon, export=1 + lowpass_const r3 + mov r3, r1 + sub r1, r1, #2 + mov ip, #8 + b 
\type\()_h264_qpel8_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel8_mc20_neon, export=1 + lowpass_const r3 + sub r1, r1, #2 + mov r3, r2 + mov ip, #8 + b \type\()_h264_qpel8_h_lowpass_neon +endfunc + +function ff_\type\()_h264_qpel8_mc30_neon, export=1 + lowpass_const r3 + add r3, r1, #1 + sub r1, r1, #2 + mov ip, #8 + b \type\()_h264_qpel8_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel8_mc01_neon, export=1 + push {lr} + mov ip, r1 +\type\()_h264_qpel8_mc01: + lowpass_const r3 + mov r3, r2 + sub r1, r1, r2, lsl #1 + vpush {d8-d15} + bl \type\()_h264_qpel8_v_lowpass_l2_neon + vpop {d8-d15} + pop {pc} +endfunc + +function ff_\type\()_h264_qpel8_mc11_neon, export=1 + push {r0, r1, r11, lr} +\type\()_h264_qpel8_mc11: + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub sp, sp, #64 + mov r0, sp + sub r1, r1, #2 + mov r3, #8 + mov ip, #8 + vpush {d8-d15} + bl put_h264_qpel8_h_lowpass_neon + ldrd r0, [r11] + mov r3, r2 + add ip, sp, #64 + sub r1, r1, r2, lsl #1 + mov r2, #8 + bl \type\()_h264_qpel8_v_lowpass_l2_neon + vpop {d8-d15} + add sp, r11, #8 + pop {r11, pc} +endfunc + +function ff_\type\()_h264_qpel8_mc21_neon, export=1 + push {r0, r1, r4, r10, r11, lr} +\type\()_h264_qpel8_mc21: + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub sp, sp, #(8*8+16*12) + sub r1, r1, #2 + mov r3, #8 + mov r0, sp + mov ip, #8 + vpush {d8-d15} + bl put_h264_qpel8_h_lowpass_neon + mov r4, r0 + ldrd r0, [r11] + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + sub r2, r4, #64 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + vpop {d8-d15} + add sp, r11, #8 + pop {r4, r10, r11, pc} +endfunc + +function ff_\type\()_h264_qpel8_mc31_neon, export=1 + add r1, r1, #1 + push {r0, r1, r11, lr} + sub r1, r1, #1 + b \type\()_h264_qpel8_mc11 +endfunc + +function ff_\type\()_h264_qpel8_mc02_neon, export=1 + push {lr} + lowpass_const r3 + sub r1, r1, r2, lsl #1 + mov r3, r2 + vpush {d8-d15} + bl \type\()_h264_qpel8_v_lowpass_neon + vpop {d8-d15} + pop {pc} 
+endfunc + +function ff_\type\()_h264_qpel8_mc12_neon, export=1 + push {r0, r1, r4, r10, r11, lr} +\type\()_h264_qpel8_mc12: + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub sp, sp, #(8*8+16*12) + sub r1, r1, r2, lsl #1 + mov r3, r2 + mov r2, #8 + mov r0, sp + vpush {d8-d15} + bl put_h264_qpel8_v_lowpass_neon + mov r4, r0 + ldrd r0, [r11] + sub r1, r1, r3, lsl #1 + sub r1, r1, #2 + sub r2, r4, #64 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + vpop {d8-d15} + add sp, r11, #8 + pop {r4, r10, r11, pc} +endfunc + +function ff_\type\()_h264_qpel8_mc22_neon, export=1 + push {r4, r10, r11, lr} + mov r11, sp + bic sp, sp, #15 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + sub sp, sp, #(16*12) + mov r4, sp + vpush {d8-d15} + bl \type\()_h264_qpel8_hv_lowpass_neon + vpop {d8-d15} + mov sp, r11 + pop {r4, r10, r11, pc} +endfunc + +function ff_\type\()_h264_qpel8_mc32_neon, export=1 + push {r0, r1, r4, r10, r11, lr} + add r1, r1, #1 + b \type\()_h264_qpel8_mc12 +endfunc + +function ff_\type\()_h264_qpel8_mc03_neon, export=1 + push {lr} + add ip, r1, r2 + b \type\()_h264_qpel8_mc01 +endfunc + +function ff_\type\()_h264_qpel8_mc13_neon, export=1 + push {r0, r1, r11, lr} + add r1, r1, r2 + b \type\()_h264_qpel8_mc11 +endfunc + +function ff_\type\()_h264_qpel8_mc23_neon, export=1 + push {r0, r1, r4, r10, r11, lr} + add r1, r1, r2 + b \type\()_h264_qpel8_mc21 +endfunc + +function ff_\type\()_h264_qpel8_mc33_neon, export=1 + add r1, r1, #1 + push {r0, r1, r11, lr} + add r1, r1, r2 + sub r1, r1, #1 + b \type\()_h264_qpel8_mc11 +endfunc + .endm + + h264_qpel8 put + h264_qpel8 avg + + .macro h264_qpel16 type +function ff_\type\()_h264_qpel16_mc10_neon, export=1 + lowpass_const r3 + mov r3, r1 + sub r1, r1, #2 + b \type\()_h264_qpel16_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel16_mc20_neon, export=1 + lowpass_const r3 + sub r1, r1, #2 + mov r3, r2 + b \type\()_h264_qpel16_h_lowpass_neon +endfunc + +function ff_\type\()_h264_qpel16_mc30_neon, export=1 + 
lowpass_const r3 + add r3, r1, #1 + sub r1, r1, #2 + b \type\()_h264_qpel16_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel16_mc01_neon, export=1 + push {r4, lr} + mov ip, r1 +\type\()_h264_qpel16_mc01: + lowpass_const r3 + mov r3, r2 + sub r1, r1, r2, lsl #1 + vpush {d8-d15} + bl \type\()_h264_qpel16_v_lowpass_l2_neon + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc11_neon, export=1 + push {r0, r1, r4, r11, lr} +\type\()_h264_qpel16_mc11: + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub sp, sp, #256 + mov r0, sp + sub r1, r1, #2 + mov r3, #16 + vpush {d8-d15} + bl put_h264_qpel16_h_lowpass_neon + ldrd r0, [r11] + mov r3, r2 + add ip, sp, #64 + sub r1, r1, r2, lsl #1 + mov r2, #16 + bl \type\()_h264_qpel16_v_lowpass_l2_neon + vpop {d8-d15} + add sp, r11, #8 + pop {r4, r11, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc21_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} +\type\()_h264_qpel16_mc21: + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub sp, sp, #(16*16+16*12) + sub r1, r1, #2 + mov r0, sp + vpush {d8-d15} + bl put_h264_qpel16_h_lowpass_neon_packed + mov r4, r0 + ldrd r0, [r11] + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + bl \type\()_h264_qpel16_hv_lowpass_l2_neon + vpop {d8-d15} + add sp, r11, #8 + pop {r4-r5, r9-r11, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc31_neon, export=1 + add r1, r1, #1 + push {r0, r1, r4, r11, lr} + sub r1, r1, #1 + b \type\()_h264_qpel16_mc11 +endfunc + +function ff_\type\()_h264_qpel16_mc02_neon, export=1 + push {r4, lr} + lowpass_const r3 + sub r1, r1, r2, lsl #1 + mov r3, r2 + vpush {d8-d15} + bl \type\()_h264_qpel16_v_lowpass_neon + vpop {d8-d15} + pop {r4, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc12_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} +\type\()_h264_qpel16_mc12: + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub sp, sp, #(16*16+16*12) + sub r1, r1, r2, lsl #1 + mov r0, sp + mov r3, r2 + vpush {d8-d15} + bl 
put_h264_qpel16_v_lowpass_neon_packed + mov r4, r0 + ldrd r0, [r11] + sub r1, r1, r3, lsl #1 + sub r1, r1, #2 + mov r2, r3 + bl \type\()_h264_qpel16_hv_lowpass_l2_neon + vpop {d8-d15} + add sp, r11, #8 + pop {r4-r5, r9-r11, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc22_neon, export=1 + push {r4, r9-r11, lr} + lowpass_const r3 + mov r11, sp + bic sp, sp, #15 + sub r1, r1, r2, lsl #1 + sub r1, r1, #2 + mov r3, r2 + sub sp, sp, #(16*12) + mov r4, sp + vpush {d8-d15} + bl \type\()_h264_qpel16_hv_lowpass_neon + vpop {d8-d15} + mov sp, r11 + pop {r4, r9-r11, pc} +endfunc + +function ff_\type\()_h264_qpel16_mc32_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} + add r1, r1, #1 + b \type\()_h264_qpel16_mc12 +endfunc + +function ff_\type\()_h264_qpel16_mc03_neon, export=1 + push {r4, lr} + add ip, r1, r2 + b \type\()_h264_qpel16_mc01 +endfunc + +function ff_\type\()_h264_qpel16_mc13_neon, export=1 + push {r0, r1, r4, r11, lr} + add r1, r1, r2 + b \type\()_h264_qpel16_mc11 +endfunc + +function ff_\type\()_h264_qpel16_mc23_neon, export=1 + push {r0, r1, r4-r5, r9-r11, lr} + add r1, r1, r2 + b \type\()_h264_qpel16_mc21 +endfunc + +function ff_\type\()_h264_qpel16_mc33_neon, export=1 + add r1, r1, #1 + push {r0, r1, r4, r11, lr} + add r1, r1, r2 + sub r1, r1, #1 + b \type\()_h264_qpel16_mc11 +endfunc + .endm + + h264_qpel16 put + h264_qpel16 avg + +@ Biweighted prediction + + .macro biweight_16 macs, macd + vdup.8 d0, r4 + vdup.8 d1, r5 + vmov q2, q8 + vmov q3, q8 +1: subs ip, ip, #2 + vld1.8 {d20-d21},[r0,:128], r2 + \macd q2, d0, d20 + pld [r0] + \macd q3, d0, d21 + vld1.8 {d22-d23},[r1,:128], r2 + \macs q2, d1, d22 + pld [r1] + \macs q3, d1, d23 + vmov q12, q8 + vld1.8 {d28-d29},[r0,:128], r2 + vmov q13, q8 + \macd q12, d0, d28 + pld [r0] + \macd q13, d0, d29 + vld1.8 {d30-d31},[r1,:128], r2 + \macs q12, d1, d30 + pld [r1] + \macs q13, d1, d31 + vshl.s16 q2, q2, q9 + vshl.s16 q3, q3, q9 + vqmovun.s16 d4, q2 + vqmovun.s16 d5, q3 + vshl.s16 q12, q12, q9 + vshl.s16 
q13, q13, q9 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vmov q3, q8 + vst1.8 {d4- d5}, [r6,:128], r2 + vmov q2, q8 + vst1.8 {d24-d25},[r6,:128], r2 + bne 1b + pop {r4-r6, pc} + .endm + + .macro biweight_8 macs, macd + vdup.8 d0, r4 + vdup.8 d1, r5 + vmov q1, q8 + vmov q10, q8 +1: subs ip, ip, #2 + vld1.8 {d4},[r0,:64], r2 + \macd q1, d0, d4 + pld [r0] + vld1.8 {d5},[r1,:64], r2 + \macs q1, d1, d5 + pld [r1] + vld1.8 {d6},[r0,:64], r2 + \macd q10, d0, d6 + pld [r0] + vld1.8 {d7},[r1,:64], r2 + \macs q10, d1, d7 + pld [r1] + vshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + vshl.s16 q10, q10, q9 + vqmovun.s16 d4, q10 + vmov q10, q8 + vst1.8 {d2},[r6,:64], r2 + vmov q1, q8 + vst1.8 {d4},[r6,:64], r2 + bne 1b + pop {r4-r6, pc} + .endm + + .macro biweight_4 macs, macd + vdup.8 d0, r4 + vdup.8 d1, r5 + vmov q1, q8 + vmov q10, q8 +1: subs ip, ip, #4 + vld1.32 {d4[0]},[r0,:32], r2 + vld1.32 {d4[1]},[r0,:32], r2 + \macd q1, d0, d4 + pld [r0] + vld1.32 {d5[0]},[r1,:32], r2 + vld1.32 {d5[1]},[r1,:32], r2 + \macs q1, d1, d5 + pld [r1] + blt 2f + vld1.32 {d6[0]},[r0,:32], r2 + vld1.32 {d6[1]},[r0,:32], r2 + \macd q10, d0, d6 + pld [r0] + vld1.32 {d7[0]},[r1,:32], r2 + vld1.32 {d7[1]},[r1,:32], r2 + \macs q10, d1, d7 + pld [r1] + vshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + vshl.s16 q10, q10, q9 + vqmovun.s16 d4, q10 + vmov q10, q8 + vst1.32 {d2[0]},[r6,:32], r2 + vst1.32 {d2[1]},[r6,:32], r2 + vmov q1, q8 + vst1.32 {d4[0]},[r6,:32], r2 + vst1.32 {d4[1]},[r6,:32], r2 + bne 1b + pop {r4-r6, pc} +2: vshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + vst1.32 {d2[0]},[r6,:32], r2 + vst1.32 {d2[1]},[r6,:32], r2 + pop {r4-r6, pc} + .endm + + .macro biweight_func w +function biweight_h264_pixels_\w\()_neon + push {r4-r6, lr} + add r4, sp, #16 + ldm r4, {r4-r6} + lsr lr, r4, #31 + add r6, r6, #1 + eors lr, lr, r5, lsr #30 + orr r6, r6, #1 + vdup.16 q9, r3 + lsl r6, r6, r3 + vmvn q9, q9 + vdup.16 q8, r6 + mov r6, r0 + beq 10f + subs lr, lr, #1 + beq 20f + subs lr, lr, #1 + beq 30f + b 40f +10: 
biweight_\w vmlal.u8, vmlal.u8 +20: rsb r4, r4, #0 + biweight_\w vmlal.u8, vmlsl.u8 +30: rsb r4, r4, #0 + rsb r5, r5, #0 + biweight_\w vmlsl.u8, vmlsl.u8 +40: rsb r5, r5, #0 + biweight_\w vmlsl.u8, vmlal.u8 +endfunc + .endm + + .macro biweight_entry w, h, b=1 +function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 + mov ip, #\h +.if \b + b biweight_h264_pixels_\w\()_neon +.endif +endfunc + .endm + + biweight_entry 16, 8 + biweight_entry 16, 16, b=0 + biweight_func 16 + + biweight_entry 8, 16 + biweight_entry 8, 4 + biweight_entry 8, 8, b=0 + biweight_func 8 + + biweight_entry 4, 8 + biweight_entry 4, 2 + biweight_entry 4, 4, b=0 + biweight_func 4 + +@ Weighted prediction + + .macro weight_16 add + vdup.8 d0, r3 +1: subs ip, ip, #2 + vld1.8 {d20-d21},[r0,:128], r1 + vmull.u8 q2, d0, d20 + pld [r0] + vmull.u8 q3, d0, d21 + vld1.8 {d28-d29},[r0,:128], r1 + vmull.u8 q12, d0, d28 + pld [r0] + vmull.u8 q13, d0, d29 + \add q2, q8, q2 + vrshl.s16 q2, q2, q9 + \add q3, q8, q3 + vrshl.s16 q3, q3, q9 + vqmovun.s16 d4, q2 + vqmovun.s16 d5, q3 + \add q12, q8, q12 + vrshl.s16 q12, q12, q9 + \add q13, q8, q13 + vrshl.s16 q13, q13, q9 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vst1.8 {d4- d5}, [r4,:128], r1 + vst1.8 {d24-d25},[r4,:128], r1 + bne 1b + pop {r4, pc} + .endm + + .macro weight_8 add + vdup.8 d0, r3 +1: subs ip, ip, #2 + vld1.8 {d4},[r0,:64], r1 + vmull.u8 q1, d0, d4 + pld [r0] + vld1.8 {d6},[r0,:64], r1 + vmull.u8 q10, d0, d6 + \add q1, q8, q1 + pld [r0] + vrshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + \add q10, q8, q10 + vrshl.s16 q10, q10, q9 + vqmovun.s16 d4, q10 + vst1.8 {d2},[r4,:64], r1 + vst1.8 {d4},[r4,:64], r1 + bne 1b + pop {r4, pc} + .endm + + .macro weight_4 add + vdup.8 d0, r3 + vmov q1, q8 + vmov q10, q8 +1: subs ip, ip, #4 + vld1.32 {d4[0]},[r0,:32], r1 + vld1.32 {d4[1]},[r0,:32], r1 + vmull.u8 q1, d0, d4 + pld [r0] + blt 2f + vld1.32 {d6[0]},[r0,:32], r1 + vld1.32 {d6[1]},[r0,:32], r1 + vmull.u8 q10, d0, d6 + pld [r0] + \add q1, q8, q1 + 
vrshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + \add q10, q8, q10 + vrshl.s16 q10, q10, q9 + vqmovun.s16 d4, q10 + vmov q10, q8 + vst1.32 {d2[0]},[r4,:32], r1 + vst1.32 {d2[1]},[r4,:32], r1 + vmov q1, q8 + vst1.32 {d4[0]},[r4,:32], r1 + vst1.32 {d4[1]},[r4,:32], r1 + bne 1b + pop {r4, pc} +2: \add q1, q8, q1 + vrshl.s16 q1, q1, q9 + vqmovun.s16 d2, q1 + vst1.32 {d2[0]},[r4,:32], r1 + vst1.32 {d2[1]},[r4,:32], r1 + pop {r4, pc} + .endm + + .macro weight_func w +function weight_h264_pixels_\w\()_neon + push {r4, lr} + ldr r4, [sp, #8] + cmp r2, #1 + lsl r4, r4, r2 + vdup.16 q8, r4 + mov r4, r0 + ble 20f + rsb lr, r2, #1 + vdup.16 q9, lr + cmp r3, #0 + blt 10f + weight_\w vhadd.s16 +10: rsb r3, r3, #0 + weight_\w vhsub.s16 +20: rsb lr, r2, #0 + vdup.16 q9, lr + cmp r3, #0 + blt 10f + weight_\w vadd.s16 +10: rsb r3, r3, #0 + weight_\w vsub.s16 +endfunc + .endm + + .macro weight_entry w, h, b=1 +function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 + mov ip, #\h +.if \b + b weight_h264_pixels_\w\()_neon +.endif +endfunc + .endm + + weight_entry 16, 8 + weight_entry 16, 16, b=0 + weight_func 16 + + weight_entry 8, 16 + weight_entry 8, 4 + weight_entry 8, 8, b=0 + weight_func 8 + + weight_entry 4, 8 + weight_entry 4, 2 + weight_entry 4, 4, b=0 + weight_func 4 diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/h264idct_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/h264idct_neon.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + preserve8 + .text + /* ff_h264_idct_add_neon: r0 = destination bytes, r1 = 16 16-bit coefficients (128-bit aligned), r2 = destination stride. Runs two add/sub/shift butterfly passes with a vtrn.16/vtrn.32 shuffle between them, rounds by 6 bits, adds the result to four 4-byte destination rows and writes them back with unsigned saturation. */ +function ff_h264_idct_add_neon, export=1 + vld1.64 {d0-d3}, [r1,:128] + /* first butterfly pass */ + vswp d1, d2 + vadd.i16 d4, d0, d1 + vshr.s16 q8, q1, #1 + vsub.i16 d5, d0, d1 + vadd.i16 d6, d2, d17 + vsub.i16 d7, d16, d3 + vadd.i16 q0, q2, q3 + vsub.i16 q1, q2, q3 + /* shuffle 16-bit then 32-bit lanes (looks like a 4x4 transpose) */ + vtrn.16 d0, d1 + vtrn.16 d3, d2 + vtrn.32 d0, d3 + vtrn.32 d1, d2 + /* second butterfly pass, interleaved with loading the four destination rows into d18/d19 */ + vadd.i16 d4, d0, d3 + vld1.32 {d18[0]}, [r0,:32], r2 + vswp d1, d3 + vshr.s16 q8, q1, #1 + vld1.32 {d19[1]}, [r0,:32], r2 + vsub.i16 d5, d0, d1 + vld1.32 {d18[1]}, [r0,:32], r2 + vadd.i16 d6, d16, d3 + vld1.32 {d19[0]}, [r0,:32], r2 + vsub.i16 d7, d2, d17 + sub r0, r0, r2, lsl #2 + vadd.i16 q0, q2, q3 + vsub.i16 q1, q2, q3 + /* rounding shift right by 6 */ + vrshr.s16 q0, q0, #6 + vrshr.s16 q1, q1, #6 + /* add the unsigned destination bytes */ + vaddw.u8 q0, q0, d18 + vaddw.u8 q1, q1, d19 + /* saturate back to unsigned bytes */ + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + /* store the rows in the same lane order they were loaded in */ + vst1.32 {d0[0]}, [r0,:32], r2 + vst1.32 {d1[1]}, [r0,:32], r2 + vst1.32 {d0[1]}, [r0,:32], r2 + vst1.32 {d1[0]}, [r0,:32], r2 + + bx lr +endfunc + /* ff_h264_idct_dc_add_neon: r0 = destination bytes, r1 = coefficient block, r2 = destination stride. Replicates the first 16-bit coefficient into every lane, rounds it by 6 bits and adds it to four 4-byte destination rows with unsigned saturation. */ +function ff_h264_idct_dc_add_neon, export=1 + vld1.16 {d2[],d3[]}, [r1,:16] + vrshr.s16 q1, q1, #6 + vld1.32 {d0[0]}, [r0,:32], r2 + vld1.32 {d0[1]}, [r0,:32], r2 + vaddw.u8 q2, q1, d0 + vld1.32 {d1[0]}, [r0,:32], r2 + vld1.32 {d1[1]}, [r0,:32], r2 + vaddw.u8 q1, q1, d1 + vqmovun.s16 d0, q2 + vqmovun.s16 d1, q1 + sub r0, r0, r2, lsl #2 + vst1.32 {d0[0]}, [r0,:32], r2 + vst1.32 {d0[1]}, [r0,:32], r2 + vst1.32 {d1[0]}, [r0,:32], r2 + vst1.32 {d1[1]}, [r0,:32], r2 + bx lr +endfunc + /* ff_h264_idct_add16_neon: r0 = destination base, r1 = table of 16 word-sized destination offsets, r2 = coefficients (32 bytes per block), r3 = stride, fifth (stack) argument = byte table indexed through scan8 (presumably per-block non-zero coefficient counts - confirm against the caller). For each of the 16 blocks: a table byte of 0 skips the block; a byte of 1 with a non-zero first coefficient uses the DC-only routine; anything else uses the full transform. */ +function ff_h264_idct_add16_neon, export=1 + push
{r4-r8,lr} /* six registers saved, so the fifth argument is at [sp, #24] */ + mov r4, r0 /* r4 = destination base */ + mov r5, r1 /* r5 = destination offset table */ + mov r1, r2 /* r1 = coefficient pointer, as the per-block routines expect */ + mov r2, r3 /* r2 = stride, as the per-block routines expect */ + ldr r6, [sp, #24] /* r6 = byte table indexed via scan8 */ + movrel r7, scan8 + mov ip, #16 +1: ldrb r8, [r7], #1 + ldr r0, [r5], #4 + ldrb r8, [r6, r8] + subs r8, r8, #1 + blt 2f /* table byte was 0: skip this block */ + ldrsh lr, [r1] + add r0, r0, r4 + movne lr, #0 /* table byte > 1: force the full transform below */ + cmp lr, #0 + adrne lr, ff_h264_idct_dc_add_neon /* table byte == 1 and first coefficient != 0 */ + adreq lr, ff_h264_idct_add_neon + blx lr +2: subs ip, ip, #1 + add r1, r1, #32 /* next block of coefficients (add does not touch the flags of subs) */ + bne 1b + pop {r4-r8,pc} +endfunc + /* ff_h264_idct_add16intra_neon: same arguments as ff_h264_idct_add16_neon. Per block: full transform when the table byte is non-zero, DC-only routine when it is zero but the first coefficient is non-zero, nothing otherwise. */ +function ff_h264_idct_add16intra_neon, export=1 + push {r4-r8,lr} + mov r4, r0 + mov r5, r1 + mov r1, r2 + mov r2, r3 + ldr r6, [sp, #24] + movrel r7, scan8 + mov ip, #16 +1: ldrb r8, [r7], #1 + ldr r0, [r5], #4 + ldrb r8, [r6, r8] + add r0, r0, r4 + cmp r8, #0 + ldrsh r8, [r1] + adrne lr, ff_h264_idct_add_neon + adreq lr, ff_h264_idct_dc_add_neon + cmpeq r8, #0 /* only when the table byte is 0: test the first coefficient */ + blxne lr + subs ip, ip, #1 + add r1, r1, #32 + bne 1b + pop {r4-r8,pc} +endfunc + /* ff_h264_idct_add8_neon: r0 = pointer to two destination pointers, r1 = destination offset table (entries 16..23 are used), r2 = coefficients (blocks 16..23 are used), r3 = stride, fifth (stack) argument = byte table indexed through scan8+16. The first four blocks are added to the first destination pointer (r4), the last four to the second (r9). */ +function ff_h264_idct_add8_neon, export=1 + push {r4-r10,lr} /* eight registers saved, so the fifth argument is at [sp, #32] */ + ldm r0, {r4,r9} + add r5, r1, #16*4 + add r1, r2, #16*32 + mov r2, r3 + ldr r6, [sp, #32] + movrel r7, scan8+16 + mov ip, #8 +1: ldrb r8, [r7], #1 + ldr r0, [r5], #4 + ldrb r8, [r6, r8] + cmp ip, #4 /* ip counts 8..1: 8,7,6,5 are the first four blocks. The former "tst ip, #4" was true for 7,6,5,4 and so picked the wrong pointer for blocks 0 and 4. */ + addhi r0, r0, r4 + addls r0, r0, r9 + cmp r8, #0 + ldrsh r8, [r1] + adrne lr, ff_h264_idct_add_neon + adreq lr, ff_h264_idct_dc_add_neon + cmpeq r8, #0 + blxne lr + subs ip, ip, #1 + add r1, r1, #32 + bne 1b + pop {r4-r10,pc} +endfunc + /* scan8: byte offsets into the table passed as the fifth argument; entries 0-15 are read by the add16 routines, entries 16-23 by ff_h264_idct_add8_neon. */ + .section .rodata +scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8 + .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8 + .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8 + .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8 + .byte 1+1*8, 2+1*8 + .byte 1+2*8, 2+2*8 + .byte 1+4*8, 2+4*8 + .byte 1+5*8, 2+5*8 diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/h264pred_init_arm.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/h264pred_init_arm.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavcodec/h264pred.h" + /* Predictors implemented in NEON assembly (the 16x16 ones are in libavcodec/arm/h264pred_neon.S). Each takes the block pointer and the line stride. */ +void ff_pred16x16_vert_neon(uint8_t *src, int stride); +void ff_pred16x16_hor_neon(uint8_t *src, int stride); +void ff_pred16x16_plane_neon(uint8_t *src, int stride); +void ff_pred16x16_dc_neon(uint8_t *src, int stride); +void ff_pred16x16_128_dc_neon(uint8_t *src, int stride); +void ff_pred16x16_left_dc_neon(uint8_t *src, int stride); +void ff_pred16x16_top_dc_neon(uint8_t *src, int stride); + +void ff_pred8x8_vert_neon(uint8_t *src, int stride); +void ff_pred8x8_hor_neon(uint8_t *src, int stride); +void ff_pred8x8_plane_neon(uint8_t *src, int stride); +void ff_pred8x8_dc_neon(uint8_t *src, int stride); +void ff_pred8x8_128_dc_neon(uint8_t *src, int stride); +void ff_pred8x8_left_dc_neon(uint8_t *src, int stride); +void ff_pred8x8_top_dc_neon(uint8_t *src, int stride); +void ff_pred8x8_l0t_dc_neon(uint8_t *src, int stride); +void ff_pred8x8_0lt_dc_neon(uint8_t *src, int stride); +void ff_pred8x8_l00_dc_neon(uint8_t *src, int stride); +void ff_pred8x8_0l0_dc_neon(uint8_t *src, int stride); + /* Point the pred8x8 and pred16x16 tables of h at the NEON routines. The HAVE_NEON guard sits inside the body so this function always exists: the call in ff_h264_pred_init_arm() then has a visible definition even when HAVE_NEON is 0, and no NEON symbol is referenced in that configuration. */ +static void ff_h264_pred_init_neon(H264PredContext *h) +{ +#if HAVE_NEON + h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon; + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon; +
h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_neon; + h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon; + + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon; + h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon; + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon; + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon; + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon; + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon; + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon; + + /* the pred16x16 table is indexed with the same *_PRED8x8 constants */ + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon; + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon; + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon; + h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon; + h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon; + h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon; + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; +#endif +} + /* ARM entry point: installs the NEON predictors into h when HAVE_NEON is non-zero, otherwise leaves h untouched. */ +void ff_h264_pred_init_arm(H264PredContext *h) +{ + if (HAVE_NEON) ff_h264_pred_init_neon(h); +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/h264pred_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/h264pred_neon.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + .macro ldcol.8 rd, rs, rt, n=8, hi=0 +.if \n == 8 || \hi == 0 + vld1.8 {\rd[0]}, [\rs], \rt + vld1.8 {\rd[1]}, [\rs], \rt + vld1.8 {\rd[2]}, [\rs], \rt + vld1.8 {\rd[3]}, [\rs], \rt +.endif +.if \n == 8 || \hi == 1 + vld1.8 {\rd[4]}, [\rs], \rt + vld1.8 {\rd[5]}, [\rs], \rt + vld1.8 {\rd[6]}, [\rs], \rt + vld1.8 {\rd[7]}, [\rs], \rt +.endif + .endm + + .macro add16x8 dq, dl, dh, rl, rh + vaddl.u8 \dq, \rl, \rh + vadd.u16 \dl, \dl, \dh + vpadd.u16 \dl, \dl, \dl + vpadd.u16 \dl, \dl, \dl + .endm + +function ff_pred16x16_128_dc_neon, export=1 + vmov.i8 q0, #128 + b .L_pred16x16_dc_end +endfunc + +function ff_pred16x16_top_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {q0}, [r2,:128] + add16x8 q0, d0, d1, d0, d1 + vrshrn.u16 d0, q0, #4 + vdup.8 q0, d0[0] + b .L_pred16x16_dc_end +endfunc + +function ff_pred16x16_left_dc_neon, export=1 + sub r2, r0, #1 + ldcol.8 d0, r2, r1 + ldcol.8 d1, r2, r1 + add16x8 q0, d0, d1, d0, d1 + vrshrn.u16 d0, q0, #4 + vdup.8 q0, d0[0] + b .L_pred16x16_dc_end +endfunc + +function ff_pred16x16_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {q0}, [r2,:128] + sub r2, r0, #1 + ldcol.8 d2, r2, r1 + ldcol.8 d3, r2, r1 + vaddl.u8 q0, d0, d1 + vaddl.u8 q1, d2, d3 + vadd.u16 q0, q0, q1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0, d0 + vpadd.u16 d0, d0, d0 + vrshrn.u16 d0, q0, #5 + vdup.8 q0, d0[0] +.L_pred16x16_dc_end: + mov r3, #8 +6: vst1.8 {q0}, [r0,:128], r1 + vst1.8 {q0}, [r0,:128], r1 + subs r3, r3, #1 + bne 6b + bx lr +endfunc + +function ff_pred16x16_hor_neon, export=1 + sub r2, r0, #1 + mov r3, #16 +1: vld1.8 {d0[],d1[]},[r2], r1 + vst1.8 {q0}, [r0,:128], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +function ff_pred16x16_vert_neon, export=1 + sub r0, r0, r1 + vld1.8 {q0}, [r0,:128], r1 + mov r3, #8 
+1: vst1.8 {q0}, [r0,:128], r1 + vst1.8 {q0}, [r0,:128], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +function ff_pred16x16_plane_neon, export=1 + sub r3, r0, r1 + add r2, r3, #8 + sub r3, r3, #1 + vld1.8 {d0}, [r3] + vld1.8 {d2}, [r2,:64], r1 + ldcol.8 d1, r3, r1 + add r3, r3, r1 + ldcol.8 d3, r3, r1 + vrev64.8 q0, q0 + vaddl.u8 q8, d2, d3 + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d1 + movrel r3, p16weight + vld1.8 {q0}, [r3,:128] + vmul.s16 q2, q2, q0 + vmul.s16 q3, q3, q0 + vadd.i16 d4, d4, d5 + vadd.i16 d5, d6, d7 + vpadd.i16 d4, d4, d5 + vpadd.i16 d4, d4, d4 + vshl.i16 d5, d4, #2 + vaddl.s16 q2, d4, d5 + vrshrn.s32 d4, q2, #6 + mov r3, #0 + vtrn.16 d4, d5 + vadd.i16 d2, d4, d5 + vshl.i16 d3, d2, #3 + vrev64.16 d16, d17 + vsub.i16 d3, d3, d2 + vadd.i16 d16, d16, d0 + vshl.i16 d2, d16, #4 + vsub.i16 d2, d2, d3 + vshl.i16 d3, d4, #4 + vext.16 q0, q0, q0, #7 + vsub.i16 d6, d5, d3 + vmov.16 d0[0], r3 + vmul.i16 q0, q0, d4[0] + vdup.16 q1, d2[0] + vdup.16 q2, d4[0] + vdup.16 q3, d6[0] + vshl.i16 q2, q2, #3 + vadd.i16 q1, q1, q0 + vadd.i16 q3, q3, q2 + mov r3, #16 +1: + vqshrun.s16 d0, q1, #5 + vadd.i16 q1, q1, q2 + vqshrun.s16 d1, q1, #5 + vadd.i16 q1, q1, q3 + vst1.8 {q0}, [r0,:128], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + + .section .rodata + .align 4 +p16weight: + .short 1,2,3,4,5,6,7,8 + + .text + +function ff_pred8x8_hor_neon, export=1 + sub r2, r0, #1 + mov r3, #8 +1: vld1.8 {d0[]}, [r2], r1 + vst1.8 {d0}, [r0,:64], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +function ff_pred8x8_vert_neon, export=1 + sub r0, r0, r1 + vld1.8 {d0}, [r0,:64], r1 + mov r3, #4 +1: vst1.8 {d0}, [r0,:64], r1 + vst1.8 {d0}, [r0,:64], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +function ff_pred8x8_plane_neon, export=1 + sub r3, r0, r1 + add r2, r3, #4 + sub r3, r3, #1 + vld1.32 {d0[0]}, [r3] + vld1.32 {d2[0]}, [r2,:32], r1 + ldcol.8 d0, r3, r1, 4, hi=1 + add r3, r3, r1 + ldcol.8 d3, r3, r1, 4 + vaddl.u8 q8, d2, d3 + vrev32.8 d0, d0 + vtrn.32 d2, d3 + vsubl.u8 
q2, d2, d0 + movrel r3, p16weight + vld1.16 {q0}, [r3,:128] + vmul.s16 d4, d4, d0 + vmul.s16 d5, d5, d0 + vpadd.i16 d4, d4, d5 + vpaddl.s16 d4, d4 + vshl.i32 d5, d4, #4 + vadd.s32 d4, d4, d5 + vrshrn.s32 d4, q2, #5 + mov r3, #0 + vtrn.16 d4, d5 + vadd.i16 d2, d4, d5 + vshl.i16 d3, d2, #2 + vrev64.16 d16, d16 + vsub.i16 d3, d3, d2 + vadd.i16 d16, d16, d0 + vshl.i16 d2, d16, #4 + vsub.i16 d2, d2, d3 + vshl.i16 d3, d4, #3 + vext.16 q0, q0, q0, #7 + vsub.i16 d6, d5, d3 + vmov.16 d0[0], r3 + vmul.i16 q0, q0, d4[0] + vdup.16 q1, d2[0] + vdup.16 q2, d4[0] + vdup.16 q3, d6[0] + vshl.i16 q2, q2, #3 + vadd.i16 q1, q1, q0 + vadd.i16 q3, q3, q2 + mov r3, #8 +1: + vqshrun.s16 d0, q1, #5 + vadd.i16 q1, q1, q3 + vst1.8 {d0}, [r0,:64], r1 + subs r3, r3, #1 + bne 1b + bx lr +endfunc + +function ff_pred8x8_128_dc_neon, export=1 + vmov.i8 q0, #128 + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_top_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {d0}, [r2,:64] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0, d0 + vrshrn.u16 d0, q0, #2 + vdup.8 d1, d0[1] + vdup.8 d0, d0[0] + vtrn.32 d0, d1 + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_left_dc_neon, export=1 + sub r2, r0, #1 + ldcol.8 d0, r2, r1 + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0, d0 + vrshrn.u16 d0, q0, #2 + vdup.8 d1, d0[1] + vdup.8 d0, d0[0] + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {d0}, [r2,:64] + sub r2, r0, #1 + ldcol.8 d1, r2, r1 + vtrn.32 d0, d1 + vpaddl.u8 q0, q0 + vpadd.u16 d0, d0, d1 + vpadd.u16 d1, d0, d0 + vrshrn.u16 d2, q0, #3 + vrshrn.u16 d3, q0, #2 + vdup.8 d0, d2[4] + vdup.8 d1, d3[3] + vdup.8 d4, d3[2] + vdup.8 d5, d2[5] + vtrn.32 q0, q2 +.L_pred8x8_dc_end: + mov r3, #4 + add r2, r0, r1, lsl #2 +6: vst1.8 {d0}, [r0,:64], r1 + vst1.8 {d1}, [r2,:64], r1 + subs r3, r3, #1 + bne 6b + bx lr +endfunc + +function ff_pred8x8_l0t_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {d0}, [r2,:64] + sub r2, r0, #1 + ldcol.8 d1, r2, r1, 4 + vtrn.32 d0, d1 + vpaddl.u8 q0, q0 + 
vpadd.u16 d0, d0, d1 + vpadd.u16 d1, d0, d0 + vrshrn.u16 d2, q0, #3 + vrshrn.u16 d3, q0, #2 + vdup.8 d0, d2[4] + vdup.8 d1, d3[0] + vdup.8 q2, d3[2] + vtrn.32 q0, q2 + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_l00_dc_neon, export=1 + sub r2, r0, #1 + ldcol.8 d0, r2, r1, 4 + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0, d0 + vrshrn.u16 d0, q0, #2 + vmov.i8 d1, #128 + vdup.8 d0, d0[0] + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_0lt_dc_neon, export=1 + sub r2, r0, r1 + vld1.8 {d0}, [r2,:64] + add r2, r0, r1, lsl #2 + sub r2, r2, #1 + ldcol.8 d1, r2, r1, 4, hi=1 + vtrn.32 d0, d1 + vpaddl.u8 q0, q0 + vpadd.u16 d0, d0, d1 + vpadd.u16 d1, d0, d0 + vrshrn.u16 d3, q0, #2 + vrshrn.u16 d2, q0, #3 + vdup.8 d0, d3[0] + vdup.8 d1, d3[3] + vdup.8 d4, d3[2] + vdup.8 d5, d2[5] + vtrn.32 q0, q2 + b .L_pred8x8_dc_end +endfunc + +function ff_pred8x8_0l0_dc_neon, export=1 + add r2, r0, r1, lsl #2 + sub r2, r2, #1 + ldcol.8 d1, r2, r1, 4 + vpaddl.u8 d2, d1 + vpadd.u16 d2, d2, d2 + vrshrn.u16 d1, q1, #2 + vmov.i8 d0, #128 + vdup.8 d1, d1[0] + b .L_pred8x8_dc_end +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/int_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/int_neon.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,118 @@ +/* + * ARM NEON optimised integer operations + * Copyright (c) 2009 Kostya Shishkov + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + preserve8 + .fpu neon + .text + +function ff_scalarproduct_int16_neon, export=1 + vmov.i16 q0, #0 + vmov.i16 q1, #0 + vmov.i16 q2, #0 + vmov.i16 q3, #0 + negs r3, r3 + beq 2f + + vdup.s32 q12, r3 @ q12 = -shift in every lane, kept live for the whole loop +1: vld1.16 {d16-d17}, [r0]! + vld1.16 {d20-d21}, [r1,:128]! + vmull.s16 q13, d16, d20 + vld1.16 {d18-d19}, [r0]! + vmull.s16 q14, d17, d21 + vld1.16 {d22-d23}, [r1,:128]! + vmull.s16 q15, d18, d22 + vmull.s16 q8, d19, d23 @ d16/d17 are already consumed, so q8 is free + vshl.s32 q13, q13, q12 + vshl.s32 q14, q14, q12 + vadd.s32 q0, q0, q13 + vshl.s32 q15, q15, q12 + vadd.s32 q1, q1, q14 + vshl.s32 q8, q8, q12 + vadd.s32 q2, q2, q15 + vadd.s32 q3, q3, q8 + subs r2, r2, #16 + bne 1b + b 3f + +2: vld1.16 {d16-d17}, [r0]! + vld1.16 {d20-d21}, [r1,:128]! + vmlal.s16 q0, d16, d20 + vld1.16 {d18-d19}, [r0]! + vmlal.s16 q1, d17, d21 + vld1.16 {d22-d23}, [r1,:128]! + vmlal.s16 q2, d18, d22 + vmlal.s16 q3, d19, d23 + subs r2, r2, #16 + bne 2b + +3: vpadd.s32 d16, d0, d1 + vpadd.s32 d17, d2, d3 + vpadd.s32 d18, d4, d5 @ d18/d19 as scratch: d8-d15 are callee-saved (AAPCS) + vpadd.s32 d19, d6, d7 + vpadd.s32 d0, d16, d17 + vpadd.s32 d1, d18, d19 + vpadd.s32 d2, d0, d1 + vpaddl.s32 d3, d2 + vmov.32 r0, d3[0] + bx lr +endfunc + +@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul) +function ff_scalarproduct_and_madd_int16_neon, export=1 + vld1.16 {d28[],d29[]}, [sp] + vmov.i16 q0, #0 + vmov.i16 q1, #0 + vmov.i16 q2, #0 + vmov.i16 q3, #0 + mov r12, r0 + +1: vld1.16 {d16-d17}, [r0,:128]! + vld1.16 {d18-d19}, [r1]! + vld1.16 {d20-d21}, [r2]! + vld1.16 {d22-d23}, [r0,:128]! + vld1.16 {d24-d25}, [r1]! + vld1.16 {d26-d27}, [r2]!
+ vmul.s16 q10, q10, q14 + vmul.s16 q13, q13, q14 + vmlal.s16 q0, d16, d18 + vmlal.s16 q1, d17, d19 + vadd.s16 q10, q8, q10 + vadd.s16 q13, q11, q13 + vmlal.s16 q2, d22, d24 + vmlal.s16 q3, d23, d25 + vst1.16 {q10}, [r12,:128]! + subs r3, r3, #16 + vst1.16 {q13}, [r12,:128]! + bne 1b + + vpadd.s32 d16, d0, d1 + vpadd.s32 d17, d2, d3 + vpadd.s32 d18, d4, d5 @ d18/d19 as scratch: d8-d15 are callee-saved (AAPCS) + vpadd.s32 d19, d6, d7 + vpadd.s32 d0, d16, d17 + vpadd.s32 d1, d18, d19 + vpadd.s32 d2, d0, d1 + vpaddl.s32 d3, d2 + vmov.32 r0, d3[0] + bx lr +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/jrevdct_arm.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/jrevdct_arm.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,388 @@ +/* + C-like prototype : + void j_rev_dct_arm(DCTBLOCK data) + + With DCTBLOCK being a pointer to an array of 64 'signed shorts' + + Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org) + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ +*/ + +#include "asm.S" + +#define FIX_0_298631336 2446 +#define FIX_0_541196100 4433 +#define FIX_0_765366865 6270 +#define FIX_1_175875602 9633 +#define FIX_1_501321110 12299 +#define FIX_2_053119869 16819 +#define FIX_3_072711026 25172 +#define FIX_M_0_390180644 -3196 +#define FIX_M_0_899976223 -7373 +#define FIX_M_1_847759065 -15137 +#define FIX_M_1_961570560 -16069 +#define FIX_M_2_562915447 -20995 +#define FIX_0xFFFF 0xFFFF + +#define FIX_0_298631336_ID 0 +#define FIX_0_541196100_ID 4 +#define FIX_0_765366865_ID 8 +#define FIX_1_175875602_ID 12 +#define FIX_1_501321110_ID 16 +#define FIX_2_053119869_ID 20 +#define FIX_3_072711026_ID 24 +#define FIX_M_0_390180644_ID 28 +#define FIX_M_0_899976223_ID 32 +#define FIX_M_1_847759065_ID 36 +#define FIX_M_1_961570560_ID 40 +#define FIX_M_2_562915447_ID 44 +#define FIX_0xFFFF_ID 48 + .text + .align + +function ff_j_rev_dct_arm, export=1 + stmdb sp!, { r4 - r12, lr } @ all callee saved regs + + sub sp, sp, #4 @ reserve some space on the stack + str r0, [ sp ] @ save the DCT pointer to the stack + + mov lr, r0 @ lr = pointer to the current row + mov r12, #8 @ r12 = row-counter + adr r11, const_array @ r11 = base pointer to the constants array +row_loop: + ldrsh r0, [lr, # 0] @ r0 = 'd0' + ldrsh r2, [lr, # 2] @ r2 = 'd2' + + @ Optimization for row that have all items except the first set to 0 + @ (this works as the DCTELEMS are always 4-byte aligned) + ldr r5, [lr, # 0] + ldr r6, [lr, # 4] + ldr r3, [lr, # 8] + ldr r4, [lr, #12] + orr r3, r3, r4 + orr r3, r3, r6 + orrs r5, r3, r5 + beq end_of_row_loop @ nothing to be done as ALL of them are '0' + orrs r3, r3, r2 + beq empty_row + + ldrsh r1, [lr, # 8] @ r1 = 'd1' + ldrsh r4, [lr, # 4] @ r4 = 'd4' + ldrsh r6, [lr, # 6] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r7, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r7, r3, r7 @ r7 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r7 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r7 @ 
r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, r3, lsl #13 @ r4 = tmp11 + rsb r3, r6, r3, lsl #13 @ r3 = tmp12 + + stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11 + + ldrsh r3, [lr, #10] @ r3 = 'd3' + ldrsh r5, [lr, #12] @ r5 = 'd5' + ldrsh r7, [lr, #14] @ r7 = 'd7' + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 @ r8 = z3 + z4 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) + add r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 0] + + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) + sub r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #14] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) + add r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 2] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) + sub r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, 
[lr, #12] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) + add r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 4] + + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) + sub r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #10] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) + add r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 6] + + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) + sub r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 8] + + @ End of row loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + beq start_column_loop + +empty_row: + ldr r1, [r11, #FIX_0xFFFF_ID] + mov r0, r0, lsl #2 + and r0, r0, r1 + add r0, r0, r0, lsl #16 + str r0, [lr, # 0] + str r0, [lr, # 4] + str r0, [lr, # 8] + str r0, [lr, #12] + +end_of_row_loop: + @ End of loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + +start_column_loop: + @ Start of column loop + ldr lr, [ sp ] + mov r12, #8 +column_loop: + ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0' + ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2' + ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4' + ldrsh r6, [lr, #(12*8)] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r1, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r1, r3, r1 @ r1 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r1 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r1 @ r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, r3, lsl #13 @ r4 = tmp11 + rsb r6, r6, r3, lsl #13 @ r6 = tmp12 + + ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1' + ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3' + ldrsh r5, [lr, #(10*8)] @ r5 = 'd5' + ldrsh r7, [lr, #(14*8)] @ r7 = 'd7' + + @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats) + orr r9, r1, r3 + orr r10, r5, r7 + orrs r10, r9, r10 + beq empty_odd_column + + stmdb sp!, { r0, r2, r4, r6 
} @ save on the stack tmp10, tmp13, tmp12, tmp11 + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) + add r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 0*8)] + + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) + sub r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + add r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 2*8)] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + sub r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) + add r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 4*8)] + + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + sub r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(10*8)] + + @ Compute 
DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + add r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 6*8)] + + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) + sub r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + beq the_end + +empty_odd_column: + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) + add r0, r0, #(1<<17) + mov r0, r0, asr #18 + strh r0, [lr, #( 0*8)] + strh r0, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + add r4, r4, #(1<<17) + mov r4, r4, asr #18 + strh r4, [lr, #( 2*8)] + strh r4, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + add r6, r6, #(1<<17) + mov r6, r6, asr #18 + strh r6, [lr, #( 4*8)] + strh r6, [lr, #(10*8)] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) + add r2, r2, #(1<<17) + mov r2, r2, asr #18 + strh r2, [lr, #( 6*8)] + strh r2, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + +the_end: + @ The end.... 
+ add sp, sp, #4 + ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return + +const_array: + .align + .word FIX_0_298631336 + .word FIX_0_541196100 + .word FIX_0_765366865 + .word FIX_1_175875602 + .word FIX_1_501321110 + .word FIX_2_053119869 + .word FIX_3_072711026 + .word FIX_M_0_390180644 + .word FIX_M_0_899976223 + .word FIX_M_1_847759065 + .word FIX_M_1_961570560 + .word FIX_M_2_562915447 + .word FIX_0xFFFF diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mathops.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/mathops.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,116 @@ +/* + * simple math operations + * Copyright (c) 2006 Michael Niedermayer et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_MATHOPS_H +#define AVCODEC_ARM_MATHOPS_H + +#include <stdint.h> +#include "config.h" +#include "libavutil/common.h" + +#if HAVE_INLINE_ASM + +# define MULL MULL +static inline av_const int MULL(int a, int b, unsigned shift) +{ + int lo, hi; + __asm__("smull %0, %1, %2, %3 \n\t" + "mov %0, %0, lsr %4 \n\t" + "add %1, %0, %1, lsl %5 \n\t" + : "=&r"(lo), "=&r"(hi) + : "r"(b), "r"(a), "ir"(shift), "ir"(32-shift)); + return hi; +} + +#define MULH MULH +#if HAVE_ARMV6 +static inline av_const int MULH(int a, int b) +{ + int r; + __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); + return r; +} +#else +static inline av_const int MULH(int a, int b) +{ + int lo, hi; + __asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a)); + return hi; +} +#endif + +static inline av_const int64_t MUL64(int a, int b) +{ + union { uint64_t x; unsigned hl[2]; } x; + __asm__ ("smull %0, %1, %2, %3" + : "=r"(x.hl[0]), "=r"(x.hl[1]) : "r"(a), "r"(b)); + return x.x; +} +#define MUL64 MUL64 + +static inline av_const int64_t MAC64(int64_t d, int a, int b) +{ + union { uint64_t x; unsigned hl[2]; } x = { d }; + __asm__ ("smlal %0, %1, %2, %3" + : "+r"(x.hl[0]), "+r"(x.hl[1]) : "r"(a), "r"(b)); + return x.x; +} +#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) +#define MLS64(d, a, b) MAC64(d, -(a), b) + +#if HAVE_ARMV5TE + +/* signed 16x16 -> 32 multiply add accumulate */ +# define MAC16(rt, ra, rb) \ + __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb)); + +/* signed 16x16 -> 32 multiply */ +# define MUL16 MUL16 +static inline av_const int MUL16(int ra, int rb) +{ + int rt; + __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb)); + return rt; +} + +#endif + +#define mid_pred mid_pred +static inline av_const int mid_pred(int a, int
b, int c) +{ + int m; /* ends up holding the median of a, b and c */ + __asm__ volatile ( + "mov %0, %2 \n\t" + "cmp %1, %2 \n\t" + "movgt %0, %1 \n\t" + "movgt %1, %2 \n\t" + "cmp %1, %3 \n\t" + "movle %1, %3 \n\t" + "cmp %0, %1 \n\t" + "movgt %0, %1 \n\t" + : "=&r"(m), "+r"(a) + : "r"(b), "r"(c) : "cc"); /* cmp rewrites the condition flags */ + return m; +} + +#endif /* HAVE_INLINE_ASM */ + +#endif /* AVCODEC_ARM_MATHOPS_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mdct_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/mdct_neon.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,303 @@ +/* + * ARM NEON optimised MDCT + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + preserve8 + + .text + +#define ff_fft_calc_neon X(ff_fft_calc_neon) + +function ff_imdct_half_neon, export=1 + push {r4-r8,lr} + + mov r12, #1 + ldr lr, [r0, #28] @ mdct_bits + ldr r4, [r0, #32] @ tcos + ldr r3, [r0, #8] @ revtab + lsl r12, r12, lr @ n = 1 << nbits + lsr lr, r12, #2 @ n4 = n >> 2 + add r7, r2, r12, lsl #1 + mov r12, #-16 + sub r7, r7, #16 + + vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0 + vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x + vrev64.32 d17, d17 + vld2.32 {d2,d3}, [r4,:128]!
@ d2=c0,c1 d3=s0,s2 + vmul.f32 d6, d17, d2 + vmul.f32 d7, d0, d2 +1: + subs lr, lr, #2 + ldr r6, [r3], #4 + vmul.f32 d4, d0, d3 + vmul.f32 d5, d17, d3 + vsub.f32 d4, d6, d4 + vadd.f32 d5, d5, d7 + uxth r8, r6, ror #16 + uxth r6, r6 + add r8, r1, r8, lsl #3 + add r6, r1, r6, lsl #3 + beq 1f + vld2.32 {d16-d17},[r7,:128],r12 + vld2.32 {d0-d1}, [r2,:128]! + vrev64.32 d17, d17 + vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2 + vmul.f32 d6, d17, d2 + vmul.f32 d7, d0, d2 + vst2.32 {d4[0],d5[0]}, [r6,:64] + vst2.32 {d4[1],d5[1]}, [r8,:64] + b 1b +1: + vst2.32 {d4[0],d5[0]}, [r6,:64] + vst2.32 {d4[1],d5[1]}, [r8,:64] + + mov r4, r0 + mov r6, r1 + bl ff_fft_calc_neon + + mov r12, #1 + ldr lr, [r4, #28] @ mdct_bits + ldr r4, [r4, #32] @ tcos + lsl r12, r12, lr @ n = 1 << nbits + lsr lr, r12, #3 @ n8 = n >> 3 + + add r4, r4, lr, lsl #3 + add r6, r6, lr, lsl #3 + sub r1, r4, #16 + sub r3, r6, #16 + + mov r7, #-16 + mov r8, r6 + mov r0, r3 + + vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0 + vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3 + vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 +1: + subs lr, lr, #2 + vmul.f32 d7, d0, d18 + vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3 + vmul.f32 d4, d1, d18 + vmul.f32 d5, d21, d19 + vmul.f32 d6, d20, d19 + vmul.f32 d22, d1, d16 + vmul.f32 d23, d21, d17 + vmul.f32 d24, d0, d16 + vmul.f32 d25, d20, d17 + vadd.f32 d7, d7, d22 + vadd.f32 d6, d6, d23 + vsub.f32 d4, d4, d24 + vsub.f32 d5, d5, d25 + beq 1f + vld2.32 {d0-d1}, [r3,:128], r7 + vld2.32 {d20-d21},[r6,:128]! + vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 + vrev64.32 q3, q3 + vst2.32 {d4,d6}, [r0,:128], r7 + vst2.32 {d5,d7}, [r8,:128]! 
+ b 1b +1: + vrev64.32 q3, q3 + vst2.32 {d4,d6}, [r0,:128] + vst2.32 {d5,d7}, [r8,:128] + + pop {r4-r8,pc} +endfunc + +function ff_imdct_calc_neon, export=1 + push {r4-r6,lr} + + ldr r3, [r0, #28] + mov r4, #1 + mov r5, r1 + lsl r4, r4, r3 + add r1, r1, r4 + + bl ff_imdct_half_neon + + add r0, r5, r4, lsl #2 + add r1, r5, r4, lsl #1 + sub r0, r0, #8 + sub r2, r1, #16 + mov r3, #-16 + mov r6, #-8 + vmov.i32 d30, #1<<31 +1: + vld1.32 {d0-d1}, [r2,:128], r3 + pld [r0, #-16] + vrev64.32 q0, q0 + vld1.32 {d2-d3}, [r1,:128]! + veor d4, d1, d30 + pld [r2, #-16] + vrev64.32 q1, q1 + veor d5, d0, d30 + vst1.32 {d2}, [r0,:64], r6 + vst1.32 {d3}, [r0,:64], r6 + vst1.32 {d4-d5}, [r5,:128]! + subs r4, r4, #16 + bgt 1b + + pop {r4-r6,pc} +endfunc + +function ff_mdct_calc_neon, export=1 + push {r4-r10,lr} + + mov r12, #1 + ldr lr, [r0, #28] @ mdct_bits + ldr r4, [r0, #32] @ tcos + ldr r3, [r0, #8] @ revtab + lsl lr, r12, lr @ n = 1 << nbits + add r7, r2, lr @ in4u + sub r9, r7, #16 @ in4d + add r2, r7, lr, lsl #1 @ in3u + add r8, r9, lr, lsl #1 @ in3d + add r5, r4, lr, lsl #1 + sub r5, r5, #16 + sub r3, r3, #4 + mov r12, #-16 + + vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 + vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 + vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 + vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 + vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 + vsub.f32 d0, d18, d0 @ in4d-in4u I + vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1 + vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 + vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 + vadd.f32 d1, d1, d19 @ in3u+in3d -R + vsub.f32 d16, d16, d2 @ in0u-in2d R + vadd.f32 d17, d17, d3 @ in2u+in1d -I +1: + vmul.f32 d7, d0, d21 @ I*s + ldr r10, [r3, lr, lsr #1] + vmul.f32 d6, d1, d20 @ -R*c + ldr r6, [r3, #4]! 
+ vmul.f32 d4, d1, d21 @ -R*s + vmul.f32 d5, d0, d20 @ I*c + vmul.f32 d24, d16, d30 @ R*c + vmul.f32 d25, d17, d31 @ -I*s + vmul.f32 d22, d16, d31 @ R*s + vmul.f32 d23, d17, d30 @ I*c + subs lr, lr, #16 + vsub.f32 d6, d6, d7 @ -R*c-I*s + vadd.f32 d7, d4, d5 @ -R*s+I*c + vsub.f32 d24, d25, d24 @ I*s-R*c + vadd.f32 d25, d22, d23 @ R*s-I*c + beq 1f + mov r12, #-16 + vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 + vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 + vneg.f32 d7, d7 @ R*s-I*c + vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 + vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 + vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 + vsub.f32 d0, d18, d0 @ in4d-in4u I + vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1 + vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 + vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 + vadd.f32 d1, d1, d19 @ in3u+in3d -R + vsub.f32 d16, d16, d2 @ in0u-in2d R + vadd.f32 d17, d17, d3 @ in2u+in1d -I + uxth r12, r6, ror #16 + uxth r6, r6 + add r12, r1, r12, lsl #3 + add r6, r1, r6, lsl #3 + vst2.32 {d6[0],d7[0]}, [r6,:64] + vst2.32 {d6[1],d7[1]}, [r12,:64] + uxth r6, r10, ror #16 + uxth r10, r10 + add r6 , r1, r6, lsl #3 + add r10, r1, r10, lsl #3 + vst2.32 {d24[0],d25[0]},[r10,:64] + vst2.32 {d24[1],d25[1]},[r6,:64] + b 1b +1: + vneg.f32 d7, d7 @ R*s-I*c + uxth r12, r6, ror #16 + uxth r6, r6 + add r12, r1, r12, lsl #3 + add r6, r1, r6, lsl #3 + vst2.32 {d6[0],d7[0]}, [r6,:64] + vst2.32 {d6[1],d7[1]}, [r12,:64] + uxth r6, r10, ror #16 + uxth r10, r10 + add r6 , r1, r6, lsl #3 + add r10, r1, r10, lsl #3 + vst2.32 {d24[0],d25[0]},[r10,:64] + vst2.32 {d24[1],d25[1]},[r6,:64] + + mov r4, r0 + mov r6, r1 + bl ff_fft_calc_neon + + mov r12, #1 + ldr lr, [r4, #28] @ mdct_bits + ldr r4, [r4, #32] @ tcos + lsl r12, r12, lr @ n = 1 << nbits + lsr lr, r12, #3 @ n8 = n >> 3 + + add r4, r4, lr, lsl #3 + add r6, r6, lr, lsl #3 + sub r1, r4, #16 + sub r3, r6, #16 + + mov r7, #-16 + mov r8, r6 + mov r0, r3 + + vld2.32 {d0-d1}, 
[r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0 + vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3 + vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 +1: + subs lr, lr, #2 + vmul.f32 d7, d0, d18 @ r1*s1,r0*s0 + vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3 + vmul.f32 d4, d1, d18 @ i1*s1,i0*s0 + vmul.f32 d5, d21, d19 @ i2*s2,i3*s3 + vmul.f32 d6, d20, d19 @ r2*s2,r3*s3 + vmul.f32 d24, d0, d16 @ r1*c1,r0*c0 + vmul.f32 d25, d20, d17 @ r2*c2,r3*c3 + vmul.f32 d22, d21, d17 @ i2*c2,i3*c3 + vmul.f32 d23, d1, d16 @ i1*c1,i0*c0 + vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0 + vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3 + vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3 + vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0 + vneg.f32 q2, q2 + beq 1f + vld2.32 {d0-d1}, [r3,:128], r7 + vld2.32 {d20-d21},[r6,:128]! + vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 + vrev64.32 q3, q3 + vst2.32 {d4,d6}, [r0,:128], r7 + vst2.32 {d5,d7}, [r8,:128]! + b 1b +1: + vrev64.32 q3, q3 + vst2.32 {d4,d6}, [r0,:128] + vst2.32 {d5,d7}, [r8,:128] + + pop {r4-r10,pc} +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mpegvideo_arm.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/mpegvideo_arm.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2002 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/mpegvideo.h" +#include "mpegvideo_arm.h" + +void MPV_common_init_arm(MpegEncContext *s) +{ + /* IWMMXT support is a superset of armv5te, so + * allow optimized functions for armv5te unless + * a better iwmmxt function exists + */ +#if HAVE_ARMV5TE + MPV_common_init_armv5te(s); +#endif +#if HAVE_IWMMXT + MPV_common_init_iwmmxt(s); +#endif +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mpegvideo_arm.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/mpegvideo_arm.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,27 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_MPEGVIDEO_H +#define AVCODEC_ARM_MPEGVIDEO_H + +#include "libavcodec/mpegvideo.h" + +void MPV_common_init_iwmmxt(MpegEncContext *s); +void MPV_common_init_armv5te(MpegEncContext *s); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mpegvideo_armv5te.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/mpegvideo_armv5te.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,101 @@ +/* + * Optimization of some functions from mpegvideo.c for armv5te + * Copyright (c) 2007 Siarhei Siamashka + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/mpegvideo.h" +#include "mpegvideo_arm.h" + +void ff_dct_unquantize_h263_armv5te(DCTELEM *block, int qmul, int qadd, int count); + +#ifdef ENABLE_ARM_TESTS +/** + * h263 dequantizer supplementary function, it is performance critical and needs to + * have optimized implementations for each architecture. 
Is also used as a reference + * implementation in regression tests + */ +static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count) +{ + int i, level; + for (i = 0; i < count; i++) { + level = block[i]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[i] = level; + } + } +} +#endif + +static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int level, qmul, qadd; + int nCoeffs; + + assert(s->block_last_index[n]>=0); + + qmul = qscale << 1; + + if (!s->h263_aic) { + if (n < 4) + level = block[0] * s->y_dc_scale; + else + level = block[0] * s->c_dc_scale; + qadd = (qscale - 1) | 1; + }else{ + qadd = 0; + level = block[0]; + } + if(s->ac_pred) + nCoeffs=63; + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + + ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); + block[0] = level; +} + +static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int qmul, qadd; + int nCoeffs; + + assert(s->block_last_index[n]>=0); + + qadd = (qscale - 1) | 1; + qmul = qscale << 1; + + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + + ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); +} + +void MPV_common_init_armv5te(MpegEncContext *s) +{ + s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te; + s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mpegvideo_armv5te_s.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/mpegvideo_armv5te_s.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,117 @@ +/* + * Optimization of some functions from mpegvideo.c for armv5te + * Copyright (c) 2007 Siarhei Siamashka + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + +/* + * Special optimized version of dct_unquantize_h263_helper_c, it + * requires the block to be at least 8 bytes aligned, and may process + * more elements than requested. But it is guaranteed to never + * process more than 64 elements provided that count argument is <= 64, + * so it is safe. This function is optimized for a common distribution + * of values for nCoeffs (they are mostly multiple of 8 plus one or + * two extra elements). So this function processes data as 8 elements + * per loop iteration and contains optional 2 elements processing in + * the end. 
+ * + * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770) + */ +function ff_dct_unquantize_h263_armv5te, export=1 + push {r4-r9,lr} + mov ip, #0 + subs r3, r3, #2 + ble 2f + ldrd r4, [r0, #0] +1: + ldrd r6, [r0, #8] + + rsbs r9, ip, r4, asr #16 + addgt r9, r2, #0 + rsblt r9, r2, #0 + smlatbne r9, r4, r1, r9 + + rsbs lr, ip, r5, asr #16 + addgt lr, r2, #0 + rsblt lr, r2, #0 + smlatbne lr, r5, r1, lr + + rsbs r8, ip, r4, asl #16 + addgt r8, r2, #0 + rsblt r8, r2, #0 + smlabbne r4, r4, r1, r8 + + rsbs r8, ip, r5, asl #16 + addgt r8, r2, #0 + rsblt r8, r2, #0 + smlabbne r5, r5, r1, r8 + + strh r4, [r0], #2 + strh r9, [r0], #2 + strh r5, [r0], #2 + strh lr, [r0], #2 + + rsbs r9, ip, r6, asr #16 + addgt r9, r2, #0 + rsblt r9, r2, #0 + smlatbne r9, r6, r1, r9 + + rsbs lr, ip, r7, asr #16 + addgt lr, r2, #0 + rsblt lr, r2, #0 + smlatbne lr, r7, r1, lr + + rsbs r8, ip, r6, asl #16 + addgt r8, r2, #0 + rsblt r8, r2, #0 + smlabbne r6, r6, r1, r8 + + rsbs r8, ip, r7, asl #16 + addgt r8, r2, #0 + rsblt r8, r2, #0 + smlabbne r7, r7, r1, r8 + + strh r6, [r0], #2 + strh r9, [r0], #2 + strh r7, [r0], #2 + strh lr, [r0], #2 + + subs r3, r3, #8 + ldrgtd r4, [r0, #0] /* load data early to avoid load/use pipeline stall */ + bgt 1b + + adds r3, r3, #2 + pople {r4-r9,pc} +2: + ldrsh r9, [r0, #0] + ldrsh lr, [r0, #2] + mov r8, r2 + cmp r9, #0 + rsblt r8, r2, #0 + smlabbne r9, r9, r1, r8 + mov r8, r2 + cmp lr, #0 + rsblt r8, r2, #0 + smlabbne lr, lr, r1, r8 + strh r9, [r0], #2 + strh lr, [r0], #2 + pop {r4-r9,pc} +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/mpegvideo_iwmmxt.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/mpegvideo_iwmmxt.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,120 @@ +/* + * copyright (c) 2004 AGAWA Koji + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/mpegvideo.h" +#include "mpegvideo_arm.h" +/* iWMMXt H.263 intra dequantizer, in place: DC is scaled separately and stored last; each non-zero coefficient becomes level*qmul +/- qadd (sign of level), zeros stay zero. */ +static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int level, qmul, qadd; + int nCoeffs; + DCTELEM *block_orig = block; + + assert(s->block_last_index[n]>=0); + + qmul = qscale << 1; + + if (!s->h263_aic) { + if (n < 4) + level = block[0] * s->y_dc_scale; + else + level = block[0] * s->c_dc_scale; + qadd = (qscale - 1) | 1; + }else{ + qadd = 0; + level = block[0]; + } + if(s->ac_pred) + nCoeffs=63; + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + nCoeffs = (nCoeffs + 8) / 8; /* loop count: each iteration handles 16 bytes (8 halfwords); the asm decrements it, so it must be an in/out operand */ + __asm__ volatile ( +/* "movd %1, %%mm6 \n\t" //qmul */ +/* "packssdw %%mm6, %%mm6 \n\t" */ +/* "packssdw %%mm6, %%mm6 \n\t" */ + "tbcsth wr6, %[qmul] \n\t" +/* "movd %2, %%mm5 \n\t" //qadd */ +/* "packssdw %%mm5, %%mm5 \n\t" */ +/* "packssdw %%mm5, %%mm5 \n\t" */ + "tbcsth wr5, %[qadd] \n\t" + "wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */ + "wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */ + "wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */ + "1: \n\t" + "wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */ + "wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), 
%%mm1 \n\t" */ + "wmulsl wr0, wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */ + "wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */ +/* "movq (%0, %3), %%mm2 \n\t" */ +/* "movq 8(%0, %3), %%mm3 \n\t" */ + "wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */ + "wcmpgtsh wr3, wr4, wr3 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 (must test wr3: wr2 already holds the first half's mask) */ + "wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */ + "wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */ + "waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */ + "waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */ + "wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */ + "wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */ + "wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */ + "wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 */ + "wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */ + "wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */ + "wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */ + "wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */ + "add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */ + "subs %[i], %[i], #1 \n\t" + "bne 1b \n\t" /* "jng 1b \n\t" */ + :[block]"+r"(block), [i]"+r"(nCoeffs) + :[qmul]"r"(qmul), [qadd]"r"(qadd) + :"memory", "cc"); + + block_orig[0] = level; +} + +#if 0 +static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int nCoeffs; + + assert(s->block_last_index[n]>=0); + + if(s->ac_pred) + nCoeffs=63; + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + + ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale); +} +#endif + +void MPV_common_init_iwmmxt(MpegEncContext *s) +{ + if (!(mm_flags & FF_MM_IWMMXT)) return; + + s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt; +#if 0 + 
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt; +#endif +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/rdft_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/rdft_neon.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,151 @@ +/* + * ARM NEON optimised RDFT + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + preserve8 + +function ff_rdft_calc_neon, export=1 + push {r4-r8,lr} + + ldr r6, [r0, #4] @ inverse + mov r4, r0 + mov r5, r1 + + lsls r6, r6, #31 + bne 1f + add r0, r4, #20 + bl X(ff_fft_permute_neon) + add r0, r4, #20 + mov r1, r5 + bl X(ff_fft_calc_neon) +1: + ldr r12, [r4, #0] @ nbits + mov r2, #1 + lsl r12, r2, r12 + add r0, r5, #8 + add r1, r5, r12, lsl #2 + lsr r12, r12, #2 + ldr r2, [r4, #12] @ tcos + sub r12, r12, #2 + ldr r3, [r4, #16] @ tsin + mov r7, r0 + sub r1, r1, #8 + mov lr, r1 + mov r8, #-8 + vld1.32 {d0}, [r0,:64]! @ d1[0,1] + vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] + vld1.32 {d4}, [r2,:64]! @ tcos[i] + vld1.32 {d5}, [r3,:64]! 
@ tsin[i] + vmov.f32 d18, #0.5 @ k1 + vdup.32 d19, r6 + pld [r0, #32] + veor d19, d18, d19 @ k2 + vmov.i32 d16, #0 + vmov.i32 d17, #1<<31 + pld [r1, #-32] + vtrn.32 d16, d17 + pld [r2, #32] + vrev64.32 d16, d16 @ d16=1,0 d17=0,1 + pld [r3, #32] +2: + veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] + vld1.32 {d24}, [r0,:64]! @ d1[0,1] + vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] + vld1.32 {d25}, [r1,:64], r8 @ d2[0,1] + vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] + veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1] + pld [r0, #32] + vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re + pld [r1, #-32] + vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1] + vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1] + vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re + veor d7, d21, d16 @ -od.im, od.re + vrev64.32 d3, d21 @ od.re, od.im + veor d6, d20, d17 @ ev.re,-ev.im + veor d2, d3, d16 @ -od.re, od.im + vmla.f32 d20, d3, d4[1] + vmla.f32 d20, d7, d5[1] + vmla.f32 d6, d2, d4[1] + vmla.f32 d6, d21, d5[1] + vld1.32 {d4}, [r2,:64]! @ tcos[i] + veor d7, d23, d16 @ -od.im, od.re + vld1.32 {d5}, [r3,:64]! @ tsin[i] + veor d24, d22, d17 @ ev.re,-ev.im + vrev64.32 d3, d23 @ od.re, od.im + pld [r2, #32] + veor d2, d3, d16 @ -od.re, od.im + pld [r3, #32] + vmla.f32 d22, d3, d4[0] + vmla.f32 d22, d7, d5[0] + vmla.f32 d24, d2, d4[0] + vmla.f32 d24, d23, d5[0] + vld1.32 {d0}, [r0,:64]! @ d1[0,1] + vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] + vst1.32 {d20}, [r7,:64]! + vst1.32 {d6}, [lr,:64], r8 + vst1.32 {d22}, [r7,:64]! 
+ vst1.32 {d24}, [lr,:64], r8 + subs r12, r12, #2 + bgt 2b + + veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] + vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] + vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] + ldr r2, [r4, #8] @ sign_convention + vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re + add r0, r0, #4 + bfc r2, #0, #31 + vld1.32 {d0[0]}, [r0,:32] + veor d7, d21, d16 @ -od.im, od.re + vrev64.32 d3, d21 @ od.re, od.im + veor d6, d20, d17 @ ev.re,-ev.im + vld1.32 {d22}, [r5,:64] + vdup.32 d1, r2 + vmov d23, d22 + veor d2, d3, d16 @ -od.re, od.im + vtrn.32 d22, d23 + veor d0, d0, d1 + veor d23, d23, d17 + vmla.f32 d20, d3, d4[1] + vmla.f32 d20, d7, d5[1] + vmla.f32 d6, d2, d4[1] + vmla.f32 d6, d21, d5[1] + vadd.f32 d22, d22, d23 + vst1.32 {d20}, [r7,:64] + vst1.32 {d6}, [lr,:64] + vst1.32 {d0[0]}, [r0,:32] + vst1.32 {d22}, [r5,:64] + + cmp r6, #0 + popeq {r4-r8,pc} + + vmul.f32 d22, d22, d18 + vst1.32 {d22}, [r5,:64] + add r0, r4, #20 + mov r1, r5 + bl X(ff_fft_permute_neon) + add r0, r4, #20 + mov r1, r5 + pop {r4-r8,lr} + b X(ff_fft_calc_neon) +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/simple_idct_arm.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/simple_idct_arm.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,486 @@ +/* + * simple_idct_arm.S + * Copyright (C) 2002 Frederic 'dilb' Boulay + * + * Author: Frederic Boulay + * + * The function defined in this file is derived from the simple_idct function + * from the libavcodec library part of the FFmpeg project. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +/* useful constants for the algorithm, they are save in __constant_ptr__ at */ +/* the end of the source code.*/ +#define W1 22725 +#define W2 21407 +#define W3 19266 +#define W4 16383 +#define W5 12873 +#define W6 8867 +#define W7 4520 +#define MASK_MSHW 0xFFFF0000 + +/* offsets of the constants in the vector */ +#define offW1 0 +#define offW2 4 +#define offW3 8 +#define offW4 12 +#define offW5 16 +#define offW6 20 +#define offW7 24 +#define offMASK_MSHW 28 + +#define ROW_SHIFT 11 +#define ROW_SHIFT2MSHW (16-11) +#define COL_SHIFT 20 +#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */ +#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */ + + + .text + +function ff_simple_idct_arm, export=1 + @@ void simple_idct_arm(int16_t *block) + @@ save stack for reg needed (take all of them), + @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block + @@ so it must not be overwritten, if it is not saved!! + @@ R12 is another scratch register, so it should not be saved too + @@ save all registers + stmfd sp!, {r4-r11, r14} @ R14 is also called LR + @@ at this point, R0=block, other registers are free. + add r14, r0, #112 @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block. 
+ adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it + @@ add 2 temporary variables in the stack: R0 and R14 + sub sp, sp, #8 @ allow 2 local variables + str r0, [sp, #0] @ save block in sp[0] + @@ stack status + @@ sp+4 free + @@ sp+0 R0 (block) + + + @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free + + +__row_loop: + @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :) + ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer) + ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1] + ldr r3, [r14, #8] @ R3=ROWr32[2] + ldr r4, [r14, #12] @ R4=ROWr32[3] + @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop), + @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row) + @@ else follow the complete algorithm. 
+ @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], + @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free + orr r5, r4, r3 @ R5=R4 | R3 + orr r5, r5, r2 @ R5=R4 | R3 | R2 + orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null) + beq __end_row_loop + mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later) + ldrsh r6, [r14, #0] @ R6=ROWr16[0] + orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7 + beq __almost_empty_row + +__b_evaluation: + @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3], + @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free, + @@ R12=__const_ptr_, R14=&block[n] + @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3 + + @@ MUL16(b0, W1, row[1]); + @@ MUL16(b1, W3, row[1]); + @@ MUL16(b2, W5, row[1]); + @@ MUL16(b3, W7, row[1]); + @@ MAC16(b0, W3, row[3]); + @@ MAC16(b1, -W7, row[3]); + @@ MAC16(b2, -W1, row[3]); + @@ MAC16(b3, -W5, row[3]); + ldr r8, [r12, #offW1] @ R8=W1 + mov r2, r2, asr #16 @ R2=ROWr16[3] + mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r9, [r12, #offW3] @ R9=W3 + ldr r10, [r12, #offW5] @ R10=W5 + mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r11, [r12, #offW7] @ R11=W7 + mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + teq r2, #0 @ if null avoid muls + mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + rsbne r2, r2, #0 @ R2=-ROWr16[3] + mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] 
must be the second arg, to have the possibility to save 1 cycle) + mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + + @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], + @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, + @@ R12=__const_ptr_, R14=&block[n] + @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; + @@ if (temp != 0) {} + orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3] + beq __end_b_evaluation + + @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], + @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, + @@ R12=__const_ptr_, R14=&block[n] + @@ MAC16(b0, W5, row[5]); + @@ MAC16(b2, W7, row[5]); + @@ MAC16(b3, W3, row[5]); + @@ MAC16(b1, -W1, row[5]); + @@ MAC16(b0, W7, row[7]); + @@ MAC16(b2, W3, row[7]); + @@ MAC16(b3, -W1, row[7]); + @@ MAC16(b1, -W5, row[7]); + mov r3, r3, asr #16 @ R3=ROWr16[5] + teq r3, #0 @ if null avoid muls + mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 + mov r4, r4, asr #16 @ R4=ROWr16[7] + mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 + mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 + rsbne r3, r3, #0 @ R3=-ROWr16[5] + mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1 + @@ R3 is free now + teq r4, #0 @ if null avoid muls + mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 + mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 + rsbne r4, r4, #0 @ R4=-ROWr16[7] + mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 + mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 + @@ R4 is free now +__end_b_evaluation: + @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free), + @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + +__a_evaluation: + @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); + @@ a1 = a0 + W6 * row[2]; + @@ a2 = a0 - W6 * row[2]; + @@ a3 = a0 - W2 * row[2]; + @@ a0 = a0 + W2 * row[2]; + ldr r9, [r12, #offW4] @ R9=W4 + mul r6, r9, r6 @ 
R6=W4*ROWr16[0] + ldr r10, [r12, #offW6] @ R10=W6 + ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet) + add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0) + + mul r11, r10, r4 @ R11=W6*ROWr16[2] + ldr r8, [r12, #offW2] @ R8=W2 + sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) + @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; + @@ if (temp != 0) {} + teq r2, #0 + beq __end_bef_a_evaluation + + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) + mul r11, r8, r4 @ R11=W2*ROWr16[2] + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) + + + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + + + @@ a0 += W4*row[4] + @@ a1 -= W4*row[4] + @@ a2 -= W4*row[4] + @@ a3 += W4*row[4] + ldrsh r11, [r14, #8] @ R11=ROWr16[4] + teq r11, #0 @ if null avoid muls + mulne r11, r9, r11 @ R11=W4*ROWr16[4] + @@ R9 is free now + ldrsh r9, [r14, #12] @ R9=ROWr16[6] + addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) + subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) + subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) + addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) + @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead + teq r9, #0 @ if null avoid muls + mulne r11, r10, r9 @ R11=W6*ROWr16[6] + addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) + mulne r10, r8, r9 @ R10=W2*ROWr16[6] + @@ a0 += W6*row[6]; + @@ a3 -= W6*row[6]; + @@ a1 -= W2*row[6]; + @@ a2 += W2*row[6]; + subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) + subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) + addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) + +__end_a_evaluation: + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + @@ row[0] = (a0 + b0) >> ROW_SHIFT; + @@ row[1] = (a1 + b1) >> ROW_SHIFT; + @@ row[2] = (a2 + b2) >> ROW_SHIFT; + @@ row[3] = (a3 + b3) >> ROW_SHIFT; + @@ row[4] = (a3 - b3) >> ROW_SHIFT; + 
@@ row[5] = (a2 - b2) >> ROW_SHIFT; + @@ row[6] = (a1 - b1) >> ROW_SHIFT; + @@ row[7] = (a0 - b0) >> ROW_SHIFT; + add r8, r6, r0 @ R8=a0+b0 + add r9, r2, r1 @ R9=a1+b1 + @@ put 2 16 bits half-words in a 32bits word + @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!) + ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5) + mvn r11, r10 @ R11= NOT R10= 0x0000FFFF + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11) + orr r8, r8, r9 + str r8, [r14, #0] + + add r8, r3, r5 @ R8=a2+b2 + add r9, r4, r7 @ R9=a3+b3 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5) + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11) + orr r8, r8, r9 + str r8, [r14, #4] + + sub r8, r4, r7 @ R8=a3-b3 + sub r9, r3, r5 @ R9=a2-b2 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5) + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11) + orr r8, r8, r9 + str r8, [r14, #8] + + sub r8, r2, r1 @ R8=a1-b1 + sub r9, r6, r0 @ R9=a0-b0 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5) + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11) + orr r8, r8, r9 + str r8, [r14, #12] + + bal __end_row_loop + +__almost_empty_row: + @@ the row was empty, except ROWr16[0], now, management of this special case + @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], + @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1], + @@ R8=0xFFFF (temp), R9-R11 free + mov r8, #0x10000 @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run). + sub r8, r8, #1 @ R8 is now ready. 
+ and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF + orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16) + str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5 + str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5 + str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5 + str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5 + +__end_row_loop: + @@ at this point, R0-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + ldr r0, [sp, #0] @ R0=block + teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished. + sub r14, r14, #16 + bne __row_loop + + + + @@ at this point, R0=block, R1-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block. +__col_loop: + +__b_evaluation2: + @@ at this point, R0=block (temp), R1-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + @@ proceed with b0-b3 first, followed by a0-a3 + @@ MUL16(b0, W1, col[8x1]); + @@ MUL16(b1, W3, col[8x1]); + @@ MUL16(b2, W5, col[8x1]); + @@ MUL16(b3, W7, col[8x1]); + @@ MAC16(b0, W3, col[8x3]); + @@ MAC16(b1, -W7, col[8x3]); + @@ MAC16(b2, -W1, col[8x3]); + @@ MAC16(b3, -W5, col[8x3]); + ldr r8, [r12, #offW1] @ R8=W1 + ldrsh r7, [r14, #16] + mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r9, [r12, #offW3] @ R9=W3 + ldr r10, [r12, #offW5] @ R10=W5 + mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r11, [r12, #offW7] @ R11=W7 + mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldrsh r2, [r14, #48] + mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + teq r2, #0 @ if 0, then avoid muls + mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + rsbne r2, r2, #0 @ R2=-ROWr16[3] + mlane r1, 
r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + + @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), + @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, + @@ R12=__const_ptr_, R14=&block[n] + @@ MAC16(b0, W5, col[5x8]); + @@ MAC16(b2, W7, col[5x8]); + @@ MAC16(b3, W3, col[5x8]); + @@ MAC16(b1, -W1, col[5x8]); + @@ MAC16(b0, W7, col[7x8]); + @@ MAC16(b2, W3, col[7x8]); + @@ MAC16(b3, -W1, col[7x8]); + @@ MAC16(b1, -W5, col[7x8]); + ldrsh r3, [r14, #80] @ R3=COLr16[5x8] + teq r3, #0 @ if 0 then avoid muls + mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 + mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 + mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 + rsbne r3, r3, #0 @ R3=-ROWr16[5x8] + ldrsh r4, [r14, #112] @ R4=COLr16[7x8] + mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1 + @@ R3 is free now + teq r4, #0 @ if 0 then avoid muls + mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 + mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 + rsbne r4, r4, #0 @ R4=-ROWr16[7x8] + mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 + mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 + @@ R4 is free now +__end_b_evaluation2: + @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), + @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + +__a_evaluation2: + @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1)); + @@ a1 = a0 + W6 * row[2]; + @@ a2 = a0 - W6 * row[2]; + @@ a3 = a0 - W2 * row[2]; + @@ a0 = a0 + W2 * row[2]; + ldrsh r6, [r14, #0] + ldr r9, [r12, #offW4] @ R9=W4 + mul r6, r9, r6 @ R6=W4*ROWr16[0] + ldr r10, [r12, #offW6] @ R10=W6 + ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet) + add r6, r6, 
#COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0) + mul r11, r10, r4 @ R11=W6*ROWr16[2] + ldr r8, [r12, #offW2] @ R8=W2 + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) + sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) + mul r11, r8, r4 @ R11=W2*ROWr16[2] + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) + + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + @@ a0 += W4*row[4] + @@ a1 -= W4*row[4] + @@ a2 -= W4*row[4] + @@ a3 += W4*row[4] + ldrsh r11, [r14, #64] @ R11=ROWr16[4] + teq r11, #0 @ if null avoid muls + mulne r11, r9, r11 @ R11=W4*ROWr16[4] + @@ R9 is free now + addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) + subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) + subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) + ldrsh r9, [r14, #96] @ R9=ROWr16[6] + addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) + @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead + teq r9, #0 @ if null avoid muls + mulne r11, r10, r9 @ R11=W6*ROWr16[6] + addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) + mulne r10, r8, r9 @ R10=W2*ROWr16[6] + @@ a0 += W6*row[6]; + @@ a3 -= W6*row[6]; + @@ a1 -= W2*row[6]; + @@ a2 += W2*row[6]; + subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) + subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) + addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) +__end_a_evaluation2: + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + @@ col[0 ] = ((a0 + b0) >> COL_SHIFT); + @@ col[8 ] = ((a1 + b1) >> COL_SHIFT); + @@ col[16] = ((a2 + b2) >> COL_SHIFT); + @@ col[24] = ((a3 + b3) >> COL_SHIFT); + @@ col[32] = ((a3 - b3) >> COL_SHIFT); + @@ col[40] = ((a2 - b2) >> COL_SHIFT); + @@ col[48] = ((a1 - b1) >> COL_SHIFT); + @@ col[56] = ((a0 - b0) >> COL_SHIFT); + @@@@@ no optimization here @@@@@ + add r8, r6, r0 @ R8=a0+b0 + add r9, r2, r1 @ R9=a1+b1 + mov r8, r8, asr 
#COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #0] + strh r9, [r14, #16] + add r8, r3, r5 @ R8=a2+b2 + add r9, r4, r7 @ R9=a3+b3 + mov r8, r8, asr #COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #32] + strh r9, [r14, #48] + sub r8, r4, r7 @ R8=a3-b3 + sub r9, r3, r5 @ R9=a2-b2 + mov r8, r8, asr #COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #64] + strh r9, [r14, #80] + sub r8, r2, r1 @ R8=a1-b1 + sub r9, r6, r0 @ R9=a0-b0 + mov r8, r8, asr #COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #96] + strh r9, [r14, #112] + +__end_col_loop: + @@ at this point, R0-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + ldr r0, [sp, #0] @ R0=block + teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished. + sub r14, r14, #2 + bne __col_loop + + + + +__end_simple_idct_arm: + @@ restore registers to previous status! + add sp, sp, #8 @@ the local variables! + ldmfd sp!, {r4-r11, r15} @@ update PC with LR content. + + + +@@ kind of sub-function, here not to overload the common case. +__end_bef_a_evaluation: + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) + mul r11, r8, r4 @ R11=W2*ROWr16[2] + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) + bal __end_a_evaluation + + +__constant_ptr__: @@ see #defines at the beginning of the source code for values. + .align + .word W1 + .word W2 + .word W3 + .word W4 + .word W5 + .word W6 + .word W7 + .word MASK_MSHW diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/simple_idct_armv5te.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/simple_idct_armv5te.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,703 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer + * Copyright (c) 2006 Mans Rullgard + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5; NOTE: for i=4 this formula gives 16384, the value used here is 16383 */ +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define W13 (W1 | (W3 << 16)) /* W1 in the low halfword, W3 in the high halfword */ +#define W26 (W2 | (W6 << 16)) /* W2 in the low halfword, W6 in the high halfword */ +#define W57 (W5 | (W7 << 16)) /* W5 in the low halfword, W7 in the high halfword */ + + .text + .align +w13: .long W13 +w26: .long W26 +w57: .long W57 + +function idct_row_armv5te + str lr, [sp, #-4]! /* push lr; it is popped straight into pc on return */ 
+ + ldrd v1, [a1, #8] /* v1 = row[5:4], v2 = row[7:6] */ + ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */ + orrs v1, v1, v2 /* Z set when row[4..7] are all zero */ + cmpeq v1, a4 /* ... and row[3:2] is zero */ + cmpeq v1, a3, lsr #16 /* ... and row[1] is zero */ + beq row_dc_only /* only row[0] may be non-zero */ + + mov v1, #(1<<(ROW_SHIFT-1)) + mov ip, #16384 + sub ip, ip, #1 /* ip = W4 */ + smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */ + ldr ip, w26 /* ip = W2 | (W6 << 16) */ + smultb a2, ip, a4 /* a2 = W6*row[2] */ + smulbb lr, ip, a4 /* lr = W2*row[2] */ + add v2, v1, a2 + sub v3, v1, a2 + sub v4, v1, lr + add v1, v1, lr + + ldr ip, w13 /* ip = W1 | (W3 << 16) */ + ldr lr, w57 /* lr = W5 | (W7 << 16) */ + smulbt v5, ip, a3 + smultt v6, lr, a4 + smlatt v5, ip, a4, v5 + smultt a2, ip, a3 + smulbt v7, lr, a3 + sub v6, v6, a2 + smulbt a2, ip, a4 + smultt fp, lr, a3 + sub v7, v7, a2 + smulbt a2, lr, a4 + ldrd a3, [a1, #8] /* a3=row[5:4] a4=row[7:6] */ + sub fp, fp, a2 + + orrs a2, a3, a4 /* row[4..7] all zero? */ + beq 1f /* yes: skip their contribution */ + + smlabt v5, lr, a3, v5 + smlabt v6, ip, a3, v6 + smlatt v5, lr, a4, v5 + smlabt v6, lr, a4, v6 + smlatt v7, lr, a3, v7 + smlatt fp, ip, a3, fp + smulbt a2, ip, a4 + smlatt v7, ip, a4, v7 + sub fp, fp, a2 + + ldr ip, w26 /* ip = W2 | (W6 << 16) */ + mov a2, #16384 + sub a2, a2, #1 /* a2 = W4 */ + smulbb a2, a2, a3 /* a2 = W4*row[4] */ + smultb lr, ip, a4 /* lr = W6*row[6] */ + add v1, v1, a2 /* v1 += W4*row[4] */ + add v1, v1, lr /* v1 += W6*row[6] */ + add v4, v4, a2 /* v4 += W4*row[4] */ + sub v4, v4, lr /* v4 -= W6*row[6] */ + smulbb lr, ip, a4 /* lr = W2*row[6] */ + sub v2, v2, a2 /* v2 -= W4*row[4] */ + sub v2, v2, lr /* v2 -= W2*row[6] */ + sub v3, v3, a2 /* v3 -= W4*row[4] */ + add v3, v3, lr /* v3 += W2*row[6] */ + +1: add a2, v1, v5 + mov a3, a2, lsr #11 /* shift right by ROW_SHIFT (11) */ + bic a3, a3, #0x1f0000 /* clear bits 16-20 so only the low halfword remains */ + sub a2, v2, v6 + mov a2, a2, lsr #11 + add a3, a3, a2, lsl #16 /* second result goes into the high halfword */ + add a2, v3, v7 + mov a4, a2, lsr #11 + bic a4, a4, #0x1f0000 + add a2, v4, fp + mov a2, a2, lsr #11 + add a4, a4, a2, lsl #16 + strd a3, [a1] /* write row[0..3] */ + + sub a2, v4, fp + mov a3, a2, lsr #11 + bic a3, a3, #0x1f0000 + sub a2, v3, v7 + mov a2, a2, lsr #11 + add a3, a3, a2, lsl #16 + add a2, v2, v6 + mov a4, a2, lsr #11 + bic a4, 
a4, #0x1f0000 + sub a2, v1, v5 + mov a2, a2, lsr #11 + add a4, a4, a2, lsl #16 + strd a3, [a1, #8] /* write row[4..7] */ + + ldr pc, [sp], #4 /* return: pop the saved lr into pc */ + +row_dc_only: + orr a3, a3, a3, lsl #16 /* row[1] is zero here, so both halfwords now hold row[0] */ + bic a3, a3, #0xe000 /* clear bits 13-15 so the shift below cannot spill into the high halfword */ + mov a3, a3, lsl #3 /* each halfword = row[0] << 3 */ + mov a4, a3 + strd a3, [a1] + strd a3, [a1, #8] + + ldr pc, [sp], #4 /* return: pop the saved lr into pc */ +endfunc + + .macro idct_col + ldr a4, [a1] /* a4 = col[1:0] */ + mov ip, #16384 + sub ip, ip, #1 /* ip = W4 */ +#if 0 + mov v1, #(1<<(COL_SHIFT-1)) + smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */ + smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */ + ldr a4, [a1, #(16*4)] +#else + mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */ + add v2, v1, a4, asr #16 + rsb v2, v2, v2, lsl #14 /* v2 = v2*16384 - v2 = v2*W4 */ + mov a4, a4, lsl #16 + add v1, v1, a4, asr #16 + ldr a4, [a1, #(16*4)] /* a4 = the word 4 rows (16 bytes each) further on */ + rsb v1, v1, v1, lsl #14 /* v1 = v1*16384 - v1 = v1*W4 */ +#endif + + smulbb lr, ip, a4 + smulbt a3, ip, a4 + sub v3, v1, lr + sub v5, v1, lr + add v7, v1, lr + add v1, v1, lr + sub v4, v2, a3 + sub v6, v2, a3 + add fp, v2, a3 + ldr ip, w26 /* ip = W2 | (W6 << 16) */ + ldr a4, [a1, #(16*2)] + add v2, v2, a3 + + smulbb lr, ip, a4 + smultb a3, ip, a4 + add v1, v1, lr + sub v7, v7, lr + add v3, v3, a3 + sub v5, v5, a3 + smulbt lr, ip, a4 + smultt a3, ip, a4 + add v2, v2, lr + sub fp, fp, lr + add v4, v4, a3 + ldr a4, [a1, #(16*6)] + sub v6, v6, a3 + + smultb lr, ip, a4 + smulbb a3, ip, a4 + add v1, v1, lr + sub v7, v7, lr + sub v3, v3, a3 + add v5, v5, a3 + smultt lr, ip, a4 + smulbt a3, ip, a4 + add v2, v2, lr + sub fp, fp, lr + sub v4, v4, a3 + add v6, v6, a3 + + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp} /* spill the eight sums built so far (32 bytes); the idct_col_* functions pop them in pairs */ + + ldr ip, w13 /* ip = W1 | (W3 << 16) */ + ldr a4, [a1, #(16*1)] + ldr lr, w57 /* lr = W5 | (W7 << 16) */ + smulbb v1, ip, a4 + smultb v3, ip, a4 + smulbb v5, lr, a4 + smultb v7, lr, a4 + smulbt v2, ip, a4 + smultt v4, ip, a4 + smulbt v6, lr, a4 + smultt fp, lr, a4 + rsb v4, v4, #0 + ldr a4, [a1, #(16*3)] + rsb v3, v3, #0 + + smlatb v1, ip, a4, v1 + smlatb v3, lr, a4, v3 + smulbb a3, ip, a4 + smulbb a2, lr, a4 + sub v5, v5, a3 + sub v7, v7, a2 + smlatt v2, ip, a4, v2 + smlatt v4, lr, a4, v4 + smulbt a3, ip, 
a4 + smulbt a2, lr, a4 + sub v6, v6, a3 + ldr a4, [a1, #(16*5)] + sub fp, fp, a2 + + smlabb v1, lr, a4, v1 + smlabb v3, ip, a4, v3 + smlatb v5, lr, a4, v5 + smlatb v7, ip, a4, v7 + smlabt v2, lr, a4, v2 + smlabt v4, ip, a4, v4 + smlatt v6, lr, a4, v6 + ldr a3, [a1, #(16*7)] + smlatt fp, ip, a4, fp + + smlatb v1, lr, a3, v1 + smlabb v3, lr, a3, v3 + smlatb v5, ip, a3, v5 + smulbb a4, ip, a3 + smlatt v2, lr, a3, v2 + sub v7, v7, a4 + smlabt v4, lr, a3, v4 + smulbt a4, ip, a3 + smlatt v6, ip, a3, v6 + sub fp, fp, a4 + .endm + +function idct_col_armv5te + str lr, [sp, #-4]! /* push lr */ + + idct_col + + ldmfd sp!, {a3, a4} /* pop the first pair spilled by idct_col (the old v1, v2) */ + adds a2, a3, v1 + mov a2, a2, lsr #20 /* logical shift right by COL_SHIFT (20): bits 0-11 remain */ + orrmi a2, a2, #0xf000 /* sum was negative: set bits 12-15 to sign-extend within the low halfword */ + add ip, a4, v2 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 /* second column goes into the high halfword */ + str a2, [a1] + subs a3, a3, v1 + mov a2, a3, lsr #20 + orrmi a2, a2, #0xf000 + sub a4, a4, v2 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} /* pop the next spilled pair */ + str a2, [a1, #(16*7)] + + subs a2, a3, v3 + mov a2, a2, lsr #20 + orrmi a2, a2, #0xf000 + sub ip, a4, v4 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*1)] + adds a3, a3, v3 + mov a2, a3, lsr #20 + orrmi a2, a2, #0xf000 + add a4, a4, v4 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} /* pop the next spilled pair */ + str a2, [a1, #(16*6)] + + adds a2, a3, v5 + mov a2, a2, lsr #20 + orrmi a2, a2, #0xf000 + add ip, a4, v6 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*2)] + subs a3, a3, v5 + mov a2, a3, lsr #20 + orrmi a2, a2, #0xf000 + sub a4, a4, v6 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} /* pop the last spilled pair */ + str a2, [a1, #(16*5)] + + adds a2, a3, v7 + mov a2, a2, lsr #20 + orrmi a2, a2, #0xf000 + add ip, a4, fp + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*3)] + subs a3, a3, v7 + mov a2, a3, lsr #20 + orrmi a2, a2, #0xf000 + sub a4, a4, fp + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + str a2, [a1, #(16*4)] + + ldr pc, [sp], #4 /* return: pop the saved lr into pc */ +endfunc + +function idct_col_put_armv5te + str lr, [sp, #-4]! /* push lr */ 
+ + idct_col + + ldmfd sp!, {a3, a4} /* pop the first pair spilled by idct_col */ + ldr lr, [sp, #32] /* second word above this frame: the a2 value pushed by ff_simple_idct_put_armv5te; used below as the row stride */ + add a2, a3, v1 + movs a2, a2, asr #20 /* shift right by COL_SHIFT (20) */ + movmi a2, #0 /* clamp below at 0 */ + cmp a2, #255 + movgt a2, #255 /* clamp above at 255 */ + add ip, a4, v2 + movs ip, ip, asr #20 + movmi ip, #0 + cmp ip, #255 + movgt ip, #255 + orr a2, a2, ip, lsl #8 /* pack the two clamped bytes into one halfword */ + sub a3, a3, v1 + movs a3, a3, asr #20 + movmi a3, #0 + cmp a3, #255 + movgt a3, #255 + sub a4, a4, v2 + movs a4, a4, asr #20 + movmi a4, #0 + cmp a4, #255 + ldr v1, [sp, #28] /* first word above this frame: the a1 value pushed by ff_simple_idct_put_armv5te; used as the output pointer */ + movgt a4, #255 + strh a2, [v1] + add a2, v1, #2 + str a2, [sp, #28] /* advance the saved pointer by 2 bytes for the next call */ + orr a2, a3, a4, lsl #8 + rsb v2, lr, lr, lsl #3 /* v2 = 8*stride - stride = 7*stride */ + ldmfd sp!, {a3, a4} /* pop the next spilled pair */ + strh a2, [v2, v1]! /* store at pointer + 7*stride; v2 is updated to that address */ + + sub a2, a3, v3 + movs a2, a2, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + sub ip, a4, v4 + movs ip, ip, asr #20 + movmi ip, #0 + cmp ip, #255 + movgt ip, #255 + orr a2, a2, ip, lsl #8 + strh a2, [v1, lr]! /* v1 steps forward one stride and is written back */ + add a3, a3, v3 + movs a2, a3, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add a4, a4, v4 + movs a4, a4, asr #20 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a2, a4, lsl #8 + ldmfd sp!, {a3, a4} /* pop the next spilled pair */ + strh a2, [v2, -lr]! /* v2 steps back one stride and is written back */ + + add a2, a3, v5 + movs a2, a2, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add ip, a4, v6 + movs ip, ip, asr #20 + movmi ip, #0 + cmp ip, #255 + movgt ip, #255 + orr a2, a2, ip, lsl #8 + strh a2, [v1, lr]! + sub a3, a3, v5 + movs a2, a3, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + sub a4, a4, v6 + movs a4, a4, asr #20 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a2, a4, lsl #8 + ldmfd sp!, {a3, a4} /* pop the last spilled pair */ + strh a2, [v2, -lr]! 
+ + add a2, a3, v7 + movs a2, a2, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add ip, a4, fp + movs ip, ip, asr #20 + movmi ip, #0 + cmp ip, #255 + movgt ip, #255 + orr a2, a2, ip, lsl #8 + strh a2, [v1, lr] /* last forward store: no writeback */ + sub a3, a3, v7 + movs a2, a3, asr #20 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + sub a4, a4, fp + movs a4, a4, asr #20 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a2, a4, lsl #8 + strh a2, [v2, -lr] /* last backward store: no writeback */ + + ldr pc, [sp], #4 /* return: pop the saved lr into pc */ +endfunc + +function idct_col_add_armv5te + str lr, [sp, #-4]! /* push lr */ + + idct_col + + ldr lr, [sp, #36] /* first word above this frame (8 spilled words + saved lr below it): the a1 value pushed by ff_simple_idct_add_armv5te; pointer to the bytes to update */ + + ldmfd sp!, {a3, a4} /* pop the first pair spilled by idct_col */ + ldrh ip, [lr] /* ip = two existing bytes */ + add a2, a3, v1 + mov a2, a2, asr #20 /* shift right by COL_SHIFT (20) */ + sub a3, a3, v1 + and v1, ip, #255 /* low existing byte */ + adds a2, a2, v1 + movmi a2, #0 /* clamp below at 0 */ + cmp a2, #255 + movgt a2, #255 /* clamp above at 255 */ + add v1, a4, v2 + mov v1, v1, asr #20 + adds v1, v1, ip, lsr #8 /* add the high existing byte */ + movmi v1, #0 + cmp v1, #255 + movgt v1, #255 + orr a2, a2, v1, lsl #8 + ldr v1, [sp, #32] /* second word above this frame: the a2 value pushed by ff_simple_idct_add_armv5te; used as the row stride */ + sub a4, a4, v2 + rsb v2, v1, v1, lsl #3 /* v2 = 7*stride */ + ldrh ip, [v2, lr]! /* load from pointer + 7*stride; v2 is updated to that address */ + strh a2, [lr] + mov a3, a3, asr #20 + and a2, ip, #255 + adds a3, a3, a2 + movmi a3, #0 + cmp a3, #255 + movgt a3, #255 + mov a4, a4, asr #20 + adds a4, a4, ip, lsr #8 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + add a2, lr, #2 + str a2, [sp, #28] /* advance the saved pointer by 2 bytes for the next call */ + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} /* pop the next spilled pair */ + ldrh ip, [lr, v1]! /* lr steps forward one stride and is written back */ + sub a2, a3, v3 + mov a2, a2, asr #20 + add a3, a3, v3 + and v3, ip, #255 + adds a2, a2, v3 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + sub v3, a4, v4 + mov v3, v3, asr #20 + adds v3, v3, ip, lsr #8 + movmi v3, #0 + cmp v3, #255 + movgt v3, #255 + orr a2, a2, v3, lsl #8 + add a4, a4, v4 + ldrh ip, [v2, -v1]! /* v2 steps back one stride and is written back */ + strh a2, [lr] + mov a3, a3, asr #20 + and a2, ip, #255 + adds a3, a3, a2 + movmi a3, #0 + cmp a3, #255 + movgt a3, #255 + mov a4, a4, asr #20 + adds a4, a4, ip, lsr #8 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} /* pop the next spilled pair */ + ldrh ip, [lr, v1]! 
+ add a2, a3, v5 + mov a2, a2, asr #20 + sub a3, a3, v5 + and v3, ip, #255 + adds a2, a2, v3 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add v3, a4, v6 + mov v3, v3, asr #20 + adds v3, v3, ip, lsr #8 + movmi v3, #0 + cmp v3, #255 + movgt v3, #255 + orr a2, a2, v3, lsl #8 + sub a4, a4, v6 + ldrh ip, [v2, -v1]! + strh a2, [lr] + mov a3, a3, asr #20 + and a2, ip, #255 + adds a3, a3, a2 + movmi a3, #0 + cmp a3, #255 + movgt a3, #255 + mov a4, a4, asr #20 + adds a4, a4, ip, lsr #8 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} /* pop the last spilled pair */ + ldrh ip, [lr, v1]! + add a2, a3, v7 + mov a2, a2, asr #20 + sub a3, a3, v7 + and v3, ip, #255 + adds a2, a2, v3 + movmi a2, #0 + cmp a2, #255 + movgt a2, #255 + add v3, a4, fp + mov v3, v3, asr #20 + adds v3, v3, ip, lsr #8 + movmi v3, #0 + cmp v3, #255 + movgt v3, #255 + orr a2, a2, v3, lsl #8 + sub a4, a4, fp + ldrh ip, [v2, -v1]! + strh a2, [lr] + mov a3, a3, asr #20 + and a2, ip, #255 + adds a3, a3, a2 + movmi a3, #0 + cmp a3, #255 + movgt a3, #255 + mov a4, a4, asr #20 + adds a4, a4, ip, lsr #8 + movmi a4, #0 + cmp a4, #255 + movgt a4, #255 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldr pc, [sp], #4 /* return: pop the saved lr into pc */ +endfunc + +function ff_simple_idct_armv5te, export=1 + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} + + bl idct_row_armv5te + add a1, a1, #16 /* next row: idct_row_armv5te handles 16 bytes per call */ + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) /* back to the first row */ + + bl idct_col_armv5te + add a1, a1, #4 /* next pair of columns: idct_col works on col[1:0] of each row */ + bl idct_col_armv5te + add a1, a1, #4 + bl idct_col_armv5te + add a1, a1, #4 + bl idct_col_armv5te + + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */ +endfunc + +function ff_simple_idct_add_armv5te, export=1 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} /* a1 and a2 stay on the stack; idct_col_add_armv5te reads them sp-relative */ + + mov a1, a3 /* the row and column passes work on the third argument */ + + bl idct_row_armv5te + add 
a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) /* back to the first row */ + + bl idct_col_add_armv5te + add a1, a1, #4 /* next pair of columns */ + bl idct_col_add_armv5te + add a1, a1, #4 + bl idct_col_add_armv5te + add a1, a1, #4 + bl idct_col_add_armv5te + + add sp, sp, #8 /* discard the two words pushed for a1 and a2 */ + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */ +endfunc + +function ff_simple_idct_put_armv5te, export=1 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} /* a1 and a2 stay on the stack; idct_col_put_armv5te reads them sp-relative */ + + mov a1, a3 /* the row and column passes work on the third argument */ + + bl idct_row_armv5te + add a1, a1, #16 /* next row (16 bytes) */ + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) /* back to the first row */ + + bl idct_col_put_armv5te + add a1, a1, #4 /* next pair of columns */ + bl idct_col_put_armv5te + add a1, a1, #4 + bl idct_col_put_armv5te + add a1, a1, #4 + bl idct_col_put_armv5te + + add sp, sp, #8 /* discard the two words pushed for a1 and a2 */ + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */ +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/simple_idct_armv6.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/simple_idct_armv6.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,433 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer + * Copyright (c) 2007 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5; NOTE: for i=4 this formula gives 16384, the value used here is 16383 */ +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define W13 (W1 | (W3 << 16)) /* W1 in the low halfword, W3 in the high halfword */ +#define W26 (W2 | (W6 << 16)) /* W2 in the low halfword, W6 in the high halfword */ +#define W42 (W4 | (W2 << 16)) /* W4 in the low halfword, W2 in the high halfword */ +#define W42n (-W4&0xffff | (-W2 << 16)) /* -W4 masked to 16 bits in the low halfword, -W2 in the high halfword */ +#define W46 (W4 | (W6 << 16)) /* W4 in the low halfword, W6 in the high halfword */ +#define W57 (W5 | (W7 << 16)) /* W5 in the low halfword, W7 in the high halfword */ + + .text + .align +w13: .long W13 +w26: .long W26 +w42: .long W42 +w42n: .long W42n +w46: .long W46 +w57: .long W57 + +/* + Compute partial IDCT of single row. 
+ shift = left-shift amount + r0 = source address + r2 = row[2,0] <= 2 cycles + r3 = row[3,1] + ip = w42 <= 2 cycles + + Output in registers r4--r11 +*/ + .macro idct_row shift + ldr lr, w46 /* lr = W4 | (W6 << 16) */ + mov r1, #(1<<(\shift-1)) /* r1 = rounding constant */ + smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + rounding */ + smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + rounding */ + ldr ip, w13 /* ip = W1 | (W3 << 16) */ + ldr r10,w57 /* r10 = W5 | (W7 << 16) */ + smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + rounding */ + smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + rounding */ + + smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ + smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ + ldr lr, [r0, #12] /* lr = row[7,5] */ + pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */ + pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ + smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ + smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */ + smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ + + ldr r3, w42n /* r3 = -W4 | (-W2 << 16) */ + smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */ + ldr r2, [r0, #4] /* r2 = row[6,4] */ + smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */ + ldr ip, w46 /* ip = W4 | (W6 << 16) */ + smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */ + + smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */ + smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */ + smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */ + smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */ + .endm + +/* + Compute partial IDCT of half row. 
+ shift = left-shift amount + r2 = row[2,0] + r3 = row[3,1] + ip = w42 + + Output in registers r4--r11 +*/ + .macro idct_row4 shift + ldr lr, w46 /* lr = W4 | (W6 << 16) */ + ldr r10,w57 /* r10 = W5 | (W7 << 16) */ + mov r1, #(1<<(\shift-1)) /* r1 = rounding constant */ + smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + rounding */ + smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + rounding */ + ldr ip, w13 /* ip = W1 | (W3 << 16) */ + smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + rounding */ + smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + rounding */ + smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ + smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ + pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */ + pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ + smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ + smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ + .endm + +/* + Compute final part of IDCT single row without shift. + Input in registers r4--r11 + Output in registers ip, r4--r6, lr, r8--r10 +*/ + .macro idct_finish + add ip, r4, r8 /* ip = A0 + B0 */ + sub lr, r4, r8 /* lr = A0 - B0 */ + sub r4, r5, r9 /* r4 = A1 + B1 (r9 holds -B1) */ + add r8, r5, r9 /* r8 = A1 - B1 (r9 holds -B1) */ + add r5, r6, r10 /* r5 = A2 + B2 */ + sub r9, r6, r10 /* r9 = A2 - B2 */ + add r6, r7, r11 /* r6 = A3 + B3 */ + sub r10,r7, r11 /* r10 = A3 - B3 */ + .endm + +/* + Compute final part of IDCT single row. + shift = right-shift amount + Input/output in registers r4--r11 +*/ + .macro idct_finish_shift shift + add r3, r4, r8 /* r3 = A0 + B0 */ + sub r2, r4, r8 /* r2 = A0 - B0 */ + mov r4, r3, asr #\shift + mov r8, r2, asr #\shift + + sub r3, r5, r9 /* r3 = A1 + B1 (r9 holds -B1) */ + add r2, r5, r9 /* r2 = A1 - B1 (r9 holds -B1) */ + mov r5, r3, asr #\shift + mov r9, r2, asr #\shift + + add r3, r6, r10 /* r3 = A2 + B2 */ + sub r2, r6, r10 /* r2 = A2 - B2 */ + mov r6, r3, asr #\shift + mov r10,r2, asr #\shift + + add r3, r7, r11 /* r3 = A3 + B3 */ + sub r2, r7, r11 /* r2 = A3 - B3 */ + mov r7, r3, asr #\shift + mov r11,r2, asr #\shift + .endm + +/* + Compute final part of IDCT single row, saturating results at 8 bits. 
+ shift = right-shift amount + Input/output in registers r4--r11 +*/ + .macro idct_finish_shift_sat shift + add r3, r4, r8 /* r3 = A0 + B0 */ + sub ip, r4, r8 /* ip = A0 - B0 */ + usat r4, #8, r3, asr #\shift /* shift right, then saturate to 0..255 */ + usat r8, #8, ip, asr #\shift + + sub r3, r5, r9 /* r3 = A1 + B1 */ + add ip, r5, r9 /* ip = A1 - B1 */ + usat r5, #8, r3, asr #\shift + usat r9, #8, ip, asr #\shift + + add r3, r6, r10 /* r3 = A2 + B2 */ + sub ip, r6, r10 /* ip = A2 - B2 */ + usat r6, #8, r3, asr #\shift + usat r10,#8, ip, asr #\shift + + add r3, r7, r11 /* r3 = A3 + B3 */ + sub ip, r7, r11 /* ip = A3 - B3 */ + usat r7, #8, r3, asr #\shift + usat r11,#8, ip, asr #\shift + .endm + +/* + Compute IDCT of single row, storing as column. + r0 = source + r1 = dest +*/ +function idct_row_armv6 + push {lr} + + ldr lr, [r0, #12] /* lr = row[7,5] */ + ldr ip, [r0, #4] /* ip = row[6,4] */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + ldr r2, [r0] /* r2 = row[2,0] */ + orrs lr, lr, ip /* lr = row[7,5] | row[6,4]; Z set when row[4..7] are all zero */ + cmpeq lr, r3 /* ... and row[3,1] is zero */ + cmpeq lr, r2, lsr #16 /* ... and row[2] is zero */ + beq 1f /* only row[0] may be non-zero */ + push {r1} /* the idct_row / idct_row4 macros overwrite r1 */ + ldr ip, w42 /* ip = W4 | (W2 << 16) */ + cmp lr, #0 + beq 2f /* row[4..7] all zero: use the half-row macro */ + + idct_row ROW_SHIFT + b 3f + +2: idct_row4 ROW_SHIFT + +3: pop {r1} /* restore dest */ + idct_finish_shift ROW_SHIFT + + strh r4, [r1] /* A0 + B0 */ + strh r5, [r1, #(16*2)] /* A1 + B1 */ + strh r6, [r1, #(16*4)] /* A2 + B2 */ + strh r7, [r1, #(16*6)] /* A3 + B3 */ + strh r11,[r1, #(16*1)] /* A3 - B3 */ + strh r10,[r1, #(16*3)] /* A2 - B2 */ + strh r9, [r1, #(16*5)] /* A1 - B1 */ + strh r8, [r1, #(16*7)] /* A0 - B0 */ + + pop {pc} /* return */ + +1: mov r2, r2, lsl #3 /* DC only: every output is row[0] << 3 (row[2] is zero here) */ + strh r2, [r1] + strh r2, [r1, #(16*2)] + strh r2, [r1, #(16*4)] + strh r2, [r1, #(16*6)] + strh r2, [r1, #(16*1)] + strh r2, [r1, #(16*3)] + strh r2, [r1, #(16*5)] + strh r2, [r1, #(16*7)] + pop {pc} /* return */ +endfunc + +/* + Compute IDCT of single column, read as row. 
+ r0 = source + r1 = dest +*/ +function idct_col_armv6 + push {r1, lr} /* r1 is saved because the idct_row macro overwrites it */ + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, w42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1} /* restore dest */ + idct_finish_shift COL_SHIFT + + strh r4, [r1] /* A0 + B0 */ + strh r5, [r1, #(16*1)] /* A1 + B1 */ + strh r6, [r1, #(16*2)] /* A2 + B2 */ + strh r7, [r1, #(16*3)] /* A3 + B3 */ + strh r11,[r1, #(16*4)] /* A3 - B3 */ + strh r10,[r1, #(16*5)] /* A2 - B2 */ + strh r9, [r1, #(16*6)] /* A1 - B1 */ + strh r8, [r1, #(16*7)] /* A0 - B0 */ + + pop {pc} /* return */ +endfunc + +/* + Compute IDCT of single column, read as row, store saturated 8-bit. + r0 = source + r1 = dest + r2 = line size +*/ +function idct_col_put_armv6 + push {r1, r2, lr} /* r1 and r2 are saved because the idct_row macro overwrites them */ + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, w42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1, r2} /* restore dest and line size */ + idct_finish_shift_sat COL_SHIFT + + strb r4, [r1], r2 /* store one byte, then advance dest by the line size */ + strb r5, [r1], r2 + strb r6, [r1], r2 + strb r7, [r1], r2 + strb r11,[r1], r2 + strb r10,[r1], r2 + strb r9, [r1], r2 + strb r8, [r1], r2 + + sub r1, r1, r2, lsl #3 /* undo the eight post-increments: r1 = dest again */ + + pop {pc} /* return */ +endfunc + +/* + Compute IDCT of single column, read as row, add/store saturated 8-bit. 
+ r0 = source + r1 = dest + r2 = line size +*/ +function idct_col_add_armv6 + push {r1, r2, lr} /* r1 and r2 are saved because the idct_row macro overwrites them */ + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, w42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1, r2} /* restore dest and line size */ + idct_finish + + ldrb r3, [r1] /* existing byte of line 0 */ + ldrb r7, [r1, r2] /* existing byte of line 1 */ + ldrb r11,[r1, r2, lsl #2] /* NOTE(review): r11 is loaded again below before it is read, so this load looks redundant - confirm */ + add ip, r3, ip, asr #COL_SHIFT /* line 0 byte + ((A0 + B0) >> COL_SHIFT) */ + usat ip, #8, ip /* saturate to 0..255 */ + add r4, r7, r4, asr #COL_SHIFT + strb ip, [r1], r2 /* store line 0, r1 moves to line 1 */ + ldrb ip, [r1, r2] /* existing byte of line 2 */ + usat r4, #8, r4 + ldrb r11,[r1, r2, lsl #2] /* existing byte of line 5 (r1 is at line 1) */ + add r5, ip, r5, asr #COL_SHIFT + usat r5, #8, r5 + strb r4, [r1], r2 /* store line 1, r1 moves to line 2 */ + ldrb r3, [r1, r2] /* existing byte of line 3 */ + ldrb ip, [r1, r2, lsl #2] /* existing byte of line 6 */ + strb r5, [r1], r2 /* store line 2, r1 moves to line 3 */ + ldrb r7, [r1, r2] /* existing byte of line 4 */ + ldrb r4, [r1, r2, lsl #2] /* existing byte of line 7 */ + add r6, r3, r6, asr #COL_SHIFT + usat r6, #8, r6 + add r10,r7, r10,asr #COL_SHIFT + usat r10,#8, r10 + add r9, r11,r9, asr #COL_SHIFT + usat r9, #8, r9 + add r8, ip, r8, asr #COL_SHIFT + usat r8, #8, r8 + add lr, r4, lr, asr #COL_SHIFT + usat lr, #8, lr + strb r6, [r1], r2 /* lines 3..7 */ + strb r10,[r1], r2 + strb r9, [r1], r2 + strb r8, [r1], r2 + strb lr, [r1], r2 + + sub r1, r1, r2, lsl #3 /* undo the eight post-increments: r1 = dest again */ + + pop {pc} /* return */ +endfunc + +/* + Compute 8 IDCT row transforms. 
+ func = IDCT row->col function + width = width of columns in bytes +*/ + .macro idct_rows func width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + sub r0, r0, #(16*5) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + + sub r0, r0, #(16*7) + .endm + +/* void ff_simple_idct_armv6(DCTELEM *data); */ +function ff_simple_idct_armv6, export=1 + push {r4-r11, lr} + sub sp, sp, #128 + + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r1, r0 + mov r0, sp + idct_rows idct_col_armv6, 2 + + add sp, sp, #128 + pop {r4-r11, pc} +endfunc + +/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ +function ff_simple_idct_add_armv6, export=1 + push {r0, r1, r4-r11, lr} + sub sp, sp, #128 + + mov r0, r2 + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r0, sp + ldr r1, [sp, #128] + ldr r2, [sp, #(128+4)] + idct_rows idct_col_add_armv6, 1 + + add sp, sp, #(128+8) + pop {r4-r11, pc} +endfunc + +/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ +function ff_simple_idct_put_armv6, export=1 + push {r0, r1, r4-r11, lr} + sub sp, sp, #128 + + mov r0, r2 + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r0, sp + ldr r1, [sp, #128] + ldr r2, [sp, #(128+4)] + idct_rows idct_col_put_armv6, 1 + + add sp, sp, #(128+8) + pop {r4-r11, pc} +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/simple_idct_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/simple_idct_neon.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,373 @@ +/* + * ARM NEON IDCT + * + * Copyright (c) 2008 Mans Rullgard + * + * Based on Simple IDCT + * Copyright (c) 2001 Michael Niedermayer + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4c ((1<<(COL_SHIFT-1))/W4) +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define w1 d0[0] +#define w2 d0[1] +#define w3 d0[2] +#define w4 d0[3] +#define w5 d1[0] +#define w6 d1[1] +#define w7 d1[2] +#define w4c d1[3] + + .macro idct_col4_top + vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */ + vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */ + vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ + vadd.i32 q11, q15, q7 + vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ + vadd.i32 q12, q15, q8 + vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ + vsub.i32 q13, q15, q8 + vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ + vsub.i32 q14, q15, q7 + + vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ + vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ + vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ + vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ + .endm + + .text 
+ .align 6 + +function idct_row4_pld_neon + pld [r0] + add r3, r0, r1, lsl #2 + pld [r0, r1] + pld [r0, r1, lsl #1] + pld [r3, -r1] + pld [r3] + pld [r3, r1] + add r3, r3, r1, lsl #1 + pld [r3] + pld [r3, r1] +endfunc + +function idct_row4_neon + vmov.i32 q15, #(1<<(ROW_SHIFT-1)) + vld1.64 {d2-d5}, [r2,:128]! + vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ + vld1.64 {d6,d7}, [r2,:128]! + vorr d10, d3, d5 + vld1.64 {d8,d9}, [r2,:128]! + add r2, r2, #-64 + + vorr d11, d7, d9 + vorr d10, d10, d11 + vmov r3, r4, d10 + + idct_col4_top + + orrs r3, r3, r4 + beq 1f + + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q7 + vsub.i32 q13, q13, q7 + vadd.i32 q14, q14, q7 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ + vmlal.s16 q9, d9, w7 + vmlsl.s16 q10, d9, w5 + vmlal.s16 q5, d9, w3 + vmlsl.s16 q6, d9, w1 + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q8 + vadd.i32 q13, q13, q8 + vsub.i32 q14, q14, q7 + +1: vadd.i32 q3, q11, q9 + vadd.i32 q4, q12, q10 + vshrn.i32 d2, q3, #ROW_SHIFT + vshrn.i32 d4, q4, #ROW_SHIFT + vadd.i32 q7, q13, q5 + vadd.i32 q8, q14, q6 + vtrn.16 d2, d4 + vshrn.i32 d6, q7, #ROW_SHIFT + vshrn.i32 d8, q8, #ROW_SHIFT + vsub.i32 q14, q14, q6 + vsub.i32 q11, q11, q9 + vtrn.16 d6, d8 + vsub.i32 q13, q13, q5 + vshrn.i32 d3, q14, #ROW_SHIFT + vtrn.32 d2, d6 + vsub.i32 q12, q12, q10 + vtrn.32 d4, d8 + vshrn.i32 d5, q13, #ROW_SHIFT + vshrn.i32 d7, q12, #ROW_SHIFT + vshrn.i32 d9, q11, #ROW_SHIFT + + vtrn.16 d3, d5 + vtrn.16 d7, d9 + vtrn.32 d3, d7 + vtrn.32 d5, d9 + + vst1.64 {d2-d5}, [r2,:128]! + vst1.64 {d6-d9}, [r2,:128]! 
+ + bx lr +endfunc + +function idct_col4_neon + mov ip, #16 + vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ + vdup.16 d30, w4c + vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */ + vadd.i16 d30, d30, d2 + vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */ + vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1< + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + preserve8 + +function ff_synth_filter_float_neon, export=1 + push {r3-r11,lr} + + ldr r4, [r2] @ synth_buf_offset + add r1, r1, r4, lsl #2 @ synth_buf + sub r12, r4, #32 + bfc r12, #9, #23 + bic r4, r4, #63 + str r12, [r2] + + ldr r2, [sp, #12*4] @ in + mov r9, r1 @ synth_buf + +VFP vpush {d0} + bl ff_imdct_half_neon +VFP vpop {d0} + pop {r3} + + ldr r5, [sp, #9*4] @ window + ldr r2, [sp, #10*4] @ out +NOVFP vldr d0, [sp, #12*4] @ scale, bias + add r8, r9, #12*4 + + mov lr, #64*4 + mov r1, #4 +1: + add r10, r9, #16*4 @ synth_buf + add r11, r8, #16*4 + add r0, r5, #16*4 @ window + add r6, r5, #32*4 + add r7, r5, #48*4 + + vld1.32 {q10}, [r3,:128] @ a + add r3, r3, #16*4 + vld1.32 {q1}, [r3,:128] @ b + vmov.f32 q2, #0.0 @ c + vmov.f32 q3, #0.0 @ d + + mov r12, #512 +2: + vld1.32 {q9}, [r8, :128], lr + vrev64.32 q9, q9 + vld1.32 {q8}, [r5, :128], lr + vmls.f32 d20, d16, d19 + vld1.32 {q11}, 
[r0, :128], lr + vmls.f32 d21, d17, d18 + vld1.32 {q12}, [r9, :128], lr + vmla.f32 d2, d22, d24 + vld1.32 {q8}, [r6, :128], lr + vmla.f32 d3, d23, d25 + vld1.32 {q9}, [r10,:128], lr + vmla.f32 d4, d16, d18 + vld1.32 {q12}, [r11,:128], lr + vmla.f32 d5, d17, d19 + vrev64.32 q12, q12 + vld1.32 {q11}, [r7, :128], lr + vmla.f32 d6, d22, d25 + vmla.f32 d7, d23, d24 + subs r12, r12, #64 + beq 3f + cmp r12, r4 + bne 2b + sub r8, r8, #512*4 + sub r9, r9, #512*4 + sub r10, r10, #512*4 + sub r11, r11, #512*4 + b 2b +3: + vdup.32 q8, d0[1] + vdup.32 q9, d0[1] + vmla.f32 q8, q10, d0[0] + vmla.f32 q9, q1, d0[0] + vst1.32 {q3}, [r3,:128] + sub r3, r3, #16*4 + vst1.32 {q2}, [r3,:128] + vst1.32 {q8}, [r2,:128] + add r2, r2, #16*4 + vst1.32 {q9}, [r2,:128] + + subs r1, r1, #1 + popeq {r4-r11,pc} + + cmp r4, #0 + subeq r8, r8, #512*4 + subeq r9, r9, #512*4 + sub r5, r5, #512*4 + sub r2, r2, #12*4 @ out + add r3, r3, #4*4 @ synth_buf2 + add r5, r5, #4*4 @ window + add r9, r9, #4*4 @ synth_buf + sub r8, r8, #4*4 @ synth_buf + b 1b +endfunc diff -r 11d15c47beaf -r 897f711a7157 libavcodec/arm/vp3dsp_neon.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/arm/vp3dsp_neon.S Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2009 David Conrad + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +.section .rodata +.align 4 + +vp3_idct_constants: +.short 64277, 60547, 54491, 46341, 36410, 25080, 12785 + +#define xC1S7 d0[0] +#define xC2S6 d0[1] +#define xC3S5 d0[2] +#define xC4S4 d0[3] +#define xC5S3 d1[0] +#define xC6S2 d1[1] +#define xC7S1 d1[2] + +.text + +.macro vp3_loop_filter + vsubl.u8 q3, d18, d17 + vsubl.u8 q2, d16, d19 + vadd.i16 q1, q3, q3 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q1, q2 + vrshr.s16 q0, q0, #3 + vmovl.u8 q9, d18 + vdup.u16 q15, r2 + + vabs.s16 q1, q0 + vshr.s16 q0, q0, #15 + vqsub.u16 q2, q15, q1 + vqsub.u16 q3, q2, q1 + vsub.i16 q1, q2, q3 + veor q1, q1, q0 + vsub.i16 q0, q1, q0 + + vaddw.u8 q2, q0, d17 + vsub.i16 q3, q9, q0 + vqmovun.s16 d0, q2 + vqmovun.s16 d1, q3 +.endm + +function ff_vp3_v_loop_filter_neon, export=1 + sub ip, r0, r1 + sub r0, r0, r1, lsl #1 + vld1.64 {d16}, [r0,:64], r1 + vld1.64 {d17}, [r0,:64], r1 + vld1.64 {d18}, [r0,:64], r1 + vld1.64 {d19}, [r0,:64], r1 + ldrb r2, [r2, #129*4] + + vp3_loop_filter + + vst1.64 {d0}, [ip,:64], r1 + vst1.64 {d1}, [ip,:64], r1 + bx lr +endfunc + +function ff_vp3_h_loop_filter_neon, export=1 + sub ip, r0, #1 + sub r0, r0, #2 + vld1.32 {d16[]}, [r0], r1 + vld1.32 {d17[]}, [r0], r1 + vld1.32 {d18[]}, [r0], r1 + vld1.32 {d19[]}, [r0], r1 + vld1.32 {d16[1]}, [r0], r1 + vld1.32 {d17[1]}, [r0], r1 + vld1.32 {d18[1]}, [r0], r1 + vld1.32 {d19[1]}, [r0], r1 + ldrb r2, [r2, #129*4] + + vtrn.8 d16, d17 + vtrn.8 d18, d19 + vtrn.16 d16, d18 + vtrn.16 d17, d19 + + vp3_loop_filter + + vtrn.8 d0, d1 + + vst1.16 {d0[0]}, [ip], r1 + vst1.16 {d1[0]}, [ip], r1 + vst1.16 {d0[1]}, [ip], r1 + vst1.16 {d1[1]}, [ip], r1 + vst1.16 {d0[2]}, [ip], r1 + vst1.16 {d1[2]}, [ip], r1 + vst1.16 {d0[3]}, [ip], r1 + vst1.16 {d1[3]}, [ip], r1 + bx lr +endfunc + + +function 
vp3_idct_start_neon + vpush {d8-d15} + movrel r3, vp3_idct_constants + vld1.64 {d0-d1}, [r3,:128] + vld1.64 {d16-d19}, [r2,:128]! + vld1.64 {d20-d23}, [r2,:128]! + vld1.64 {d24-d27}, [r2,:128]! + vadd.s16 q1, q8, q12 + vsub.s16 q8, q8, q12 + vld1.64 {d28-d31}, [r2,:128]! +endfunc + +function vp3_idct_core_neon + vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16 + vmull.s16 q3, d19, xC1S7 + vmull.s16 q4, d2, xC4S4 // ((ip[0] + ip[4]) * C4) << 16 + vmull.s16 q5, d3, xC4S4 + vmull.s16 q6, d16, xC4S4 // ((ip[0] - ip[4]) * C4) << 16 + vmull.s16 q7, d17, xC4S4 + vshrn.s32 d4, q2, #16 + vshrn.s32 d5, q3, #16 + vshrn.s32 d6, q4, #16 + vshrn.s32 d7, q5, #16 + vshrn.s32 d8, q6, #16 + vshrn.s32 d9, q7, #16 + vadd.s16 q12, q1, q3 // E = (ip[0] + ip[4]) * C4 + vadd.s16 q8, q8, q4 // F = (ip[0] - ip[4]) * C4 + vadd.s16 q1, q2, q9 // ip[1] * C1 + + vmull.s16 q2, d30, xC1S7 // (ip[7] * C1) << 16 + vmull.s16 q3, d31, xC1S7 + vmull.s16 q4, d30, xC7S1 // (ip[7] * C7) << 16 + vmull.s16 q5, d31, xC7S1 + vmull.s16 q6, d18, xC7S1 // (ip[1] * C7) << 16 + vmull.s16 q7, d19, xC7S1 + vshrn.s32 d4, q2, #16 + vshrn.s32 d5, q3, #16 + vshrn.s32 d6, q4, #16 // ip[7] * C7 + vshrn.s32 d7, q5, #16 + vshrn.s32 d8, q6, #16 // ip[1] * C7 + vshrn.s32 d9, q7, #16 + vadd.s16 q2, q2, q15 // ip[7] * C1 + vadd.s16 q9, q1, q3 // A = ip[1] * C1 + ip[7] * C7 + vsub.s16 q15, q4, q2 // B = ip[1] * C7 - ip[7] * C1 + + vmull.s16 q2, d22, xC5S3 // (ip[3] * C5) << 16 + vmull.s16 q3, d23, xC5S3 + vmull.s16 q4, d22, xC3S5 // (ip[3] * C3) << 16 + vmull.s16 q5, d23, xC3S5 + vmull.s16 q6, d26, xC5S3 // (ip[5] * C5) << 16 + vmull.s16 q7, d27, xC5S3 + vshrn.s32 d4, q2, #16 + vshrn.s32 d5, q3, #16 + vshrn.s32 d6, q4, #16 + vshrn.s32 d7, q5, #16 + vshrn.s32 d8, q6, #16 + vshrn.s32 d9, q7, #16 + vadd.s16 q3, q3, q11 // ip[3] * C3 + vadd.s16 q4, q4, q13 // ip[5] * C5 + vadd.s16 q1, q2, q11 // ip[3] * C5 + vadd.s16 q11, q3, q4 // C = ip[3] * C3 + ip[5] * C5 + + vmull.s16 q2, d26, xC3S5 // (ip[5] * C3) << 16 + vmull.s16 q3, d27, 
xC3S5 + vmull.s16 q4, d20, xC2S6 // (ip[2] * C2) << 16 + vmull.s16 q5, d21, xC2S6 + vmull.s16 q6, d28, xC6S2 // (ip[6] * C6) << 16 + vmull.s16 q7, d29, xC6S2 + vshrn.s32 d4, q2, #16 + vshrn.s32 d5, q3, #16 + vshrn.s32 d6, q4, #16 + vshrn.s32 d7, q5, #16 + vshrn.s32 d8, q6, #16 // ip[6] * C6 + vshrn.s32 d9, q7, #16 + vadd.s16 q2, q2, q13 // ip[5] * C3 + vadd.s16 q3, q3, q10 // ip[2] * C2 + vsub.s16 q13, q2, q1 // D = ip[5] * C3 - ip[3] * C5 + vsub.s16 q1, q9, q11 // (A - C) + vadd.s16 q11, q9, q11 // Cd = A + C + vsub.s16 q9, q15, q13 // (B - D) + vadd.s16 q13, q15, q13 // Dd = B + D + vadd.s16 q15, q3, q4 // G = ip[2] * C2 + ip[6] * C6 + + vmull.s16 q2, d2, xC4S4 // ((A - C) * C4) << 16 + vmull.s16 q3, d3, xC4S4 + vmull.s16 q4, d28, xC2S6 // (ip[6] * C2) << 16 + vmull.s16 q5, d29, xC2S6 + vmull.s16 q6, d20, xC6S2 // (ip[2] * C6) << 16 + vmull.s16 q7, d21, xC6S2 + vshrn.s32 d4, q2, #16 + vshrn.s32 d5, q3, #16 + vshrn.s32 d6, q4, #16 + vshrn.s32 d7, q5, #16 + vshrn.s32 d8, q6, #16 // ip[2] * C6 + vmull.s16 q5, d18, xC4S4 // ((B - D) * C4) << 16 + vmull.s16 q6, d19, xC4S4 + vshrn.s32 d9, q7, #16 + vadd.s16 q3, q3, q14 // ip[6] * C2 + vadd.s16 q10, q1, q2 // Ad = (A - C) * C4 + vsub.s16 q14, q4, q3 // H = ip[2] * C6 - ip[6] * C2 + bx lr +endfunc + +.macro VP3_IDCT_END type +function vp3_idct_end_\type\()_neon +.ifc \type, col + vdup.16 q0, r3 + vadd.s16 q12, q12, q0 + vadd.s16 q8, q8, q0 +.endif + + vshrn.s32 d2, q5, #16 + vshrn.s32 d3, q6, #16 + vadd.s16 q2, q12, q15 // Gd = E + G + vadd.s16 q9, q1, q9 // (B - D) * C4 + vsub.s16 q12, q12, q15 // Ed = E - G + vsub.s16 q3, q8, q10 // Fd = F - Ad + vadd.s16 q10, q8, q10 // Add = F + Ad + vadd.s16 q4, q9, q14 // Hd = Bd + H + vsub.s16 q14, q9, q14 // Bdd = Bd - H + vadd.s16 q8, q2, q11 // [0] = Gd + Cd + vsub.s16 q15, q2, q11 // [7] = Gd - Cd + vadd.s16 q9, q10, q4 // [1] = Add + Hd + vsub.s16 q10, q10, q4 // [2] = Add - Hd + vadd.s16 q11, q12, q13 // [3] = Ed + Dd + vsub.s16 q12, q12, q13 // [4] = Ed - Dd +.ifc \type, 
row + vtrn.16 q8, q9 +.endif + vadd.s16 q13, q3, q14 // [5] = Fd + Bdd + vsub.s16 q14, q3, q14 // [6] = Fd - Bdd + +.ifc \type, row + // 8x8 transpose + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vswp d17, d24 + vswp d19, d26 + vadd.s16 q1, q8, q12 + vswp d21, d28 + vsub.s16 q8, q8, q12 + vswp d23, d30 +.endif + bx lr +endfunc +.endm + +VP3_IDCT_END row +VP3_IDCT_END col + +function ff_vp3_idct_neon, export=1 + mov ip, lr + mov r2, r0 + bl vp3_idct_start_neon + bl vp3_idct_end_row_neon + mov r3, #8 + bl vp3_idct_core_neon + bl vp3_idct_end_col_neon + mov lr, ip + vpop {d8-d15} + + vshr.s16 q8, q8, #4 + vshr.s16 q9, q9, #4 + vshr.s16 q10, q10, #4 + vshr.s16 q11, q11, #4 + vshr.s16 q12, q12, #4 + vst1.64 {d16-d19}, [r0,:128]! + vshr.s16 q13, q13, #4 + vshr.s16 q14, q14, #4 + vst1.64 {d20-d23}, [r0,:128]! + vshr.s16 q15, q15, #4 + vst1.64 {d24-d27}, [r0,:128]! + vst1.64 {d28-d31}, [r0,:128]! + bx lr +endfunc + +function ff_vp3_idct_put_neon, export=1 + mov ip, lr + bl vp3_idct_start_neon + bl vp3_idct_end_row_neon + mov r3, #8 + add r3, r3, #2048 // convert signed pixel to unsigned + bl vp3_idct_core_neon + bl vp3_idct_end_col_neon + mov lr, ip + vpop {d8-d15} + + vqshrun.s16 d0, q8, #4 + vqshrun.s16 d1, q9, #4 + vqshrun.s16 d2, q10, #4 + vqshrun.s16 d3, q11, #4 + vst1.64 {d0}, [r0,:64], r1 + vqshrun.s16 d4, q12, #4 + vst1.64 {d1}, [r0,:64], r1 + vqshrun.s16 d5, q13, #4 + vst1.64 {d2}, [r0,:64], r1 + vqshrun.s16 d6, q14, #4 + vst1.64 {d3}, [r0,:64], r1 + vqshrun.s16 d7, q15, #4 + vst1.64 {d4}, [r0,:64], r1 + vst1.64 {d5}, [r0,:64], r1 + vst1.64 {d6}, [r0,:64], r1 + vst1.64 {d7}, [r0,:64], r1 + bx lr +endfunc + +function ff_vp3_idct_add_neon, export=1 + mov ip, lr + bl vp3_idct_start_neon + bl vp3_idct_end_row_neon + mov r3, #8 + bl vp3_idct_core_neon + bl vp3_idct_end_col_neon + mov lr, ip + vpop {d8-d15} + mov r2, r0 + + vld1.64 {d0}, [r0,:64], r1 + vshr.s16 q8, q8, #4 + vld1.64 
{d1}, [r0,:64], r1 + vshr.s16 q9, q9, #4 + vld1.64 {d2}, [r0,:64], r1 + vaddw.u8 q8, q8, d0 + vld1.64 {d3}, [r0,:64], r1 + vaddw.u8 q9, q9, d1 + vld1.64 {d4}, [r0,:64], r1 + vshr.s16 q10, q10, #4 + vld1.64 {d5}, [r0,:64], r1 + vshr.s16 q11, q11, #4 + vld1.64 {d6}, [r0,:64], r1 + vqmovun.s16 d0, q8 + vld1.64 {d7}, [r0,:64], r1 + vqmovun.s16 d1, q9 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vshr.s16 q12, q12, #4 + vshr.s16 q13, q13, #4 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vshr.s16 q14, q14, #4 + vshr.s16 q15, q15, #4 + vst1.64 {d0}, [r2,:64], r1 + vqmovun.s16 d4, q12 + vst1.64 {d1}, [r2,:64], r1 + vqmovun.s16 d5, q13 + vst1.64 {d2}, [r2,:64], r1 + vaddw.u8 q14, q14, d6 + vst1.64 {d3}, [r2,:64], r1 + vaddw.u8 q15, q15, d7 + vst1.64 {d4}, [r2,:64], r1 + vqmovun.s16 d6, q14 + vst1.64 {d5}, [r2,:64], r1 + vqmovun.s16 d7, q15 + vst1.64 {d6}, [r2,:64], r1 + vst1.64 {d7}, [r2,:64], r1 + bx lr +endfunc + +function ff_vp3_idct_dc_add_neon, export=1 + ldrsh r2, [r2] + movw r3, #46341 + mul r2, r3, r2 + smulwt r2, r3, r2 + mov r3, r0 + vdup.16 q15, r2 + vrshr.s16 q15, q15, #4 + + vld1.8 {d0}, [r0,:64], r1 + vld1.8 {d1}, [r0,:64], r1 + vld1.8 {d2}, [r0,:64], r1 + vaddw.u8 q8, q15, d0 + vld1.8 {d3}, [r0,:64], r1 + vaddw.u8 q9, q15, d1 + vld1.8 {d4}, [r0,:64], r1 + vaddw.u8 q10, q15, d2 + vld1.8 {d5}, [r0,:64], r1 + vaddw.u8 q11, q15, d3 + vld1.8 {d6}, [r0,:64], r1 + vaddw.u8 q12, q15, d4 + vld1.8 {d7}, [r0,:64], r1 + vaddw.u8 q13, q15, d5 + vqmovun.s16 d0, q8 + vaddw.u8 q14, q15, d6 + vqmovun.s16 d1, q9 + vaddw.u8 q15, q15, d7 + vqmovun.s16 d2, q10 + vst1.8 {d0}, [r3,:64], r1 + vqmovun.s16 d3, q11 + vst1.8 {d1}, [r3,:64], r1 + vqmovun.s16 d4, q12 + vst1.8 {d2}, [r3,:64], r1 + vqmovun.s16 d5, q13 + vst1.8 {d3}, [r3,:64], r1 + vqmovun.s16 d6, q14 + vst1.8 {d4}, [r3,:64], r1 + vqmovun.s16 d7, q15 + vst1.8 {d5}, [r3,:64], r1 + vst1.8 {d6}, [r3,:64], r1 + vst1.8 {d7}, [r3,:64], r1 + bx lr +endfunc diff -r 11d15c47beaf -r 
897f711a7157 libavcodec/avcodec.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/avcodec.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,407 @@ +#ifndef AVCODEC_AVCODEC_H +#define AVCODEC_AVCODEC_H + +#include +#include +#include "config.h" + +#include "libavutil/mem.h" + +#define MAX_SPS_COUNT 32 +#define MAX_PPS_COUNT 256 + + +#ifndef CABAC +#define CABAC h->pps.cabac +#endif + +#define EXTENDED_SAR 255 + +#define MB_TYPE_REF0 MB_TYPE_ACPRED //dirty but it fits in 16 bit +#define MB_TYPE_8x8DCT 0x01000000 +#define IS_REF0(a) ((a) & MB_TYPE_REF0) +#define IS_8x8DCT(a) ((a) & MB_TYPE_8x8DCT) + +#define LIST_NOT_USED -1 +#define PART_NOT_AVAILABLE -2 + +/* dct code */ +typedef short DCTELEM; + +/** +* Required number of additionally allocated bytes at the end of the input bitstream for decoding. +* This is mainly needed because some optimized bitstream readers read +* 32 or 64 bit at once and could read over the end.
+* Note: If the first 23 bits of the additional bytes are not 0, then damaged +* MPEG bitstreams could cause overread and segfault. +*/ +#define FF_INPUT_BUFFER_PADDING_SIZE 8 + +enum AVColorPrimaries{ + AVCOL_PRI_BT709 =1, ///< also ITU-R BT1361 / IEC 61966-2-4 / SMPTE RP177 Annex B + AVCOL_PRI_UNSPECIFIED=2, + AVCOL_PRI_BT470M =4, + AVCOL_PRI_BT470BG =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM + AVCOL_PRI_SMPTE170M =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC + AVCOL_PRI_SMPTE240M =7, ///< functionally identical to above + AVCOL_PRI_FILM =8, + AVCOL_PRI_NB , ///< Not part of ABI +}; + +enum AVColorTransferCharacteristic{ + AVCOL_TRC_BT709 =1, ///< also ITU-R BT1361 + AVCOL_TRC_UNSPECIFIED=2, + AVCOL_TRC_GAMMA22 =4, ///< also ITU-R BT470M / ITU-R BT1700 625 PAL & SECAM + AVCOL_TRC_GAMMA28 =5, ///< also ITU-R BT470BG + AVCOL_TRC_NB , ///< Not part of ABI +}; + +enum AVColorSpace{ + AVCOL_SPC_RGB =0, + AVCOL_SPC_BT709 =1, ///< also ITU-R BT1361 / IEC 61966-2-4 xvYCC709 / SMPTE RP177 Annex B + AVCOL_SPC_UNSPECIFIED=2, + AVCOL_SPC_FCC =4, + AVCOL_SPC_BT470BG =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM / IEC 61966-2-4 xvYCC601 + AVCOL_SPC_SMPTE170M =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC / functionally identical to above + AVCOL_SPC_SMPTE240M =7, + AVCOL_SPC_NB , ///< Not part of ABI +}; + +enum AVColorRange{ + AVCOL_RANGE_UNSPECIFIED=0, + AVCOL_RANGE_MPEG =1, ///< the normal 219*2^(n-8) "MPEG" YUV ranges + AVCOL_RANGE_JPEG =2, ///< the normal 2^n-1 "JPEG" YUV ranges + AVCOL_RANGE_NB , ///< Not part of ABI +}; + +#define MAX_MMCO_COUNT 66 +/** +* Memory management control operation opcode. 
+*/ +typedef enum MMCOOpcode{ + MMCO_END=0, + MMCO_SHORT2UNUSED, + MMCO_LONG2UNUSED, + MMCO_SHORT2LONG, + MMCO_SET_MAX_LONG, + MMCO_RESET, + MMCO_LONG, +} MMCOOpcode; + +/* NAL unit types */ +enum { + NAL_SLICE=1, + NAL_DPA, + NAL_DPB, + NAL_DPC, + NAL_IDR_SLICE, + NAL_SEI, + NAL_SPS, + NAL_PPS, + NAL_AUD, + NAL_END_SEQUENCE, + NAL_END_STREAM, + NAL_FILLER_DATA, + NAL_SPS_EXT, + NAL_AUXILIARY_SLICE=19 +}; + +/** +* SEI message types +*/ +typedef enum { + SEI_BUFFERING_PERIOD = 0, ///< buffering period (H.264, D.1.1) + SEI_TYPE_PIC_TIMING = 1, ///< picture timing + SEI_TYPE_USER_DATA_UNREGISTERED = 5, ///< unregistered user data + SEI_TYPE_RECOVERY_POINT = 6 ///< recovery point (frame # to decoder sync) +} SEI_Type; + +/** +* pic_struct in picture timing SEI message +*/ +typedef enum { + SEI_PIC_STRUCT_FRAME = 0, ///< 0: %frame + SEI_PIC_STRUCT_TOP_FIELD = 1, ///< 1: top field + SEI_PIC_STRUCT_BOTTOM_FIELD = 2, ///< 2: bottom field + SEI_PIC_STRUCT_TOP_BOTTOM = 3, ///< 3: top field, bottom field, in that order + SEI_PIC_STRUCT_BOTTOM_TOP = 4, ///< 4: bottom field, top field, in that order + SEI_PIC_STRUCT_TOP_BOTTOM_TOP = 5, ///< 5: top field, bottom field, top field repeated, in that order + SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM = 6, ///< 6: bottom field, top field, bottom field repeated, in that order + SEI_PIC_STRUCT_FRAME_DOUBLING = 7, ///< 7: %frame doubling + SEI_PIC_STRUCT_FRAME_TRIPLING = 8 ///< 8: %frame tripling +} SEI_PicStructType; + +#define FF_MAX_B_FRAMES 16 + + +//The following defines may change, don't expect compatibility if you use them. 
+#define MB_TYPE_INTRA4x4 0x0001 +#define MB_TYPE_INTRA16x16 0x0002 //FIXME H.264-specific +#define MB_TYPE_INTRA_PCM 0x0004 //FIXME H.264-specific +#define MB_TYPE_16x16 0x0008 +#define MB_TYPE_16x8 0x0010 +#define MB_TYPE_8x16 0x0020 +#define MB_TYPE_8x8 0x0040 +#define MB_TYPE_INTERLACED 0x0080 +#define MB_TYPE_DIRECT2 0x0100 //FIXME +#define MB_TYPE_ACPRED 0x0200 +#define MB_TYPE_GMC 0x0400 +#define MB_TYPE_SKIP 0x0800 +#define MB_TYPE_P0L0 0x1000 +#define MB_TYPE_P1L0 0x2000 +#define MB_TYPE_P0L1 0x4000 +#define MB_TYPE_P1L1 0x8000 +#define MB_TYPE_L0 (MB_TYPE_P0L0 | MB_TYPE_P1L0) +#define MB_TYPE_L1 (MB_TYPE_P0L1 | MB_TYPE_P1L1) +#define MB_TYPE_L0L1 (MB_TYPE_L0 | MB_TYPE_L1) +#define MB_TYPE_QUANT 0x00010000 +#define MB_TYPE_CBP 0x00020000 +//Note bits 24-31 are reserved for codec specific use (h264 ref0, mpeg1 0mv, ...) + +#define FF_BUFFER_TYPE_INTERNAL 1 +#define FF_BUFFER_TYPE_USER 2 ///< direct rendering buffers (image is (de)allocated by user) +#define FF_BUFFER_TYPE_SHARED 4 ///< Buffer from somewhere else; don't deallocate image (data/base), all other tables are not shared. +#define FF_BUFFER_TYPE_COPY 8 ///< Just a (modified) copy of some other buffer, don't deallocate anything. 
+ + +#define FF_I_TYPE 1 ///< Intra +#define FF_P_TYPE 2 ///< Predicted +#define FF_B_TYPE 3 ///< Bi-dir predicted +#define FF_S_TYPE 4 ///< S(GMC)-VOP MPEG4 +#define FF_SI_TYPE 5 ///< Switching Intra +#define FF_SP_TYPE 6 ///< Switching Predicted +#define FF_BI_TYPE 7 + +#define MB_TYPE_INTRA MB_TYPE_INTRA4x4 //default mb_type if there is just one type +#define IS_INTRA4x4(a) ((a)&MB_TYPE_INTRA4x4) +#define IS_INTRA16x16(a) ((a)&MB_TYPE_INTRA16x16) +#define IS_PCM(a) ((a)&MB_TYPE_INTRA_PCM) +#define IS_INTRA(a) ((a)&7) +#define IS_INTER(a) ((a)&(MB_TYPE_16x16|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8)) +#define IS_SKIP(a) ((a)&MB_TYPE_SKIP) +#define IS_INTRA_PCM(a) ((a)&MB_TYPE_INTRA_PCM) +#define IS_INTERLACED(a) ((a)&MB_TYPE_INTERLACED) +#define IS_DIRECT(a) ((a)&MB_TYPE_DIRECT2) +#define IS_GMC(a) ((a)&MB_TYPE_GMC) +#define IS_16X16(a) ((a)&MB_TYPE_16x16) +#define IS_16X8(a) ((a)&MB_TYPE_16x8) +#define IS_8X16(a) ((a)&MB_TYPE_8x16) +#define IS_8X8(a) ((a)&MB_TYPE_8x8) +#define IS_SUB_8X8(a) ((a)&MB_TYPE_16x16) //note reused +#define IS_SUB_8X4(a) ((a)&MB_TYPE_16x8) //note reused +#define IS_SUB_4X8(a) ((a)&MB_TYPE_8x16) //note reused +#define IS_SUB_4X4(a) ((a)&MB_TYPE_8x8) //note reused +#define IS_ACPRED(a) ((a)&MB_TYPE_ACPRED) +#define IS_QUANT(a) ((a)&MB_TYPE_QUANT) +#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list)))) +#define USES_LIST(a, list) ((a) & ((MB_TYPE_P0L0|MB_TYPE_P1L0)<<(2*(list)))) ///< does this mb use listX, note does not work if subMBs +#define HAS_CBP(a) ((a)&MB_TYPE_CBP) + + +#define FF_MM_FORCE 0x80000000 /* Force usage of selected flags (OR) */ + /* lower 16 bits - CPU features */ +#define FF_MM_MMX 0x0001 ///< standard MMX +#define FF_MM_3DNOW 0x0004 ///< AMD 3DNOW +#define FF_MM_MMX2 0x0002 ///< SSE integer functions or AMD MMX ext +#define FF_MM_SSE 0x0008 ///< SSE functions +#define FF_MM_SSE2 0x0010 ///< PIV SSE2 functions +#define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt +#define FF_MM_SSE3 0x0040 ///< Prescott SSE3 
functions +#define FF_MM_SSSE3 0x0080 ///< Conroe SSSE3 functions +#define FF_MM_SSE4 0x0100 ///< Penryn SSE4.1 functions +#define FF_MM_SSE42 0x0200 ///< Nehalem SSE4.2 functions +#define FF_MM_IWMMXT 0x0100 ///< XScale IWMMXT +#define FF_MM_ALTIVEC 0x0001 ///< standard AltiVec + + +/** +* Sequence parameter set +*/ +typedef struct SPS{ + + int profile_idc; + int level_idc; + int chroma_format_idc; + int transform_bypass; ///< qpprime_y_zero_transform_bypass_flag + int log2_max_frame_num; ///< log2_max_frame_num_minus4 + 4 + int poc_type; ///< pic_order_cnt_type + int log2_max_poc_lsb; ///< log2_max_pic_order_cnt_lsb_minus4 + int delta_pic_order_always_zero_flag; + int offset_for_non_ref_pic; + int offset_for_top_to_bottom_field; + int poc_cycle_length; ///< num_ref_frames_in_pic_order_cnt_cycle + int ref_frame_count; ///< num_ref_frames + int gaps_in_frame_num_allowed_flag; + int mb_width; ///< pic_width_in_mbs_minus1 + 1 + int mb_height; ///< pic_height_in_map_units_minus1 + 1 + int frame_mbs_only_flag; + int mb_aff; /// free, 1 -> needs to be displayed, 2 -> needed for reference, 3 -> 1 && 2 + int key_frame; + int mmco_reset; ///< h264 MMCO_RESET set this 1. Reordering code must not mix pictures before and after MMCO_RESET. + +} DecodedPicture; + + +#endif /* AVCODEC_AVCODEC_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cabac.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cabac.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,242 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Context Adaptive Binary Arithmetic Coder. + */ + +#include + +#include "libavutil/common.h" +//#include "get_bits.h" +#include "cabac.h" + +static const uint8_t lps_range[64][4]= { +{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205}, +{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166}, +{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135}, +{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110}, +{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89}, +{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72}, +{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59}, +{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48}, +{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39}, +{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31}, +{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25}, +{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21}, +{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17}, +{ 10, 12, 14, 16}, { 9, 11, 13, 15}, { 9, 11, 12, 14}, { 8, 10, 12, 14}, +{ 8, 9, 11, 13}, { 7, 9, 11, 12}, { 7, 9, 10, 12}, { 7, 8, 10, 11}, +{ 6, 8, 9, 11}, { 6, 7, 9, 10}, { 6, 7, 8, 9}, { 2, 2, 2, 2}, +}; + +uint8_t ff_h264_mlps_state[4*64]; +uint8_t ff_h264_lps_range[4*2*64]; +uint8_t ff_h264_lps_state[2*64]; 
+uint8_t ff_h264_mps_state[2*64]; + +static const uint8_t mps_state[64]= { + 1, 2, 3, 4, 5, 6, 7, 8, + 9,10,11,12,13,14,15,16, + 17,18,19,20,21,22,23,24, + 25,26,27,28,29,30,31,32, + 33,34,35,36,37,38,39,40, + 41,42,43,44,45,46,47,48, + 49,50,51,52,53,54,55,56, + 57,58,59,60,61,62,62,63, +}; + +static const uint8_t lps_state[64]= { + 0, 0, 1, 2, 2, 4, 4, 5, + 6, 7, 8, 9, 9,11,11,12, + 13,13,15,15,16,16,18,18, + 19,19,21,21,22,22,23,24, + 24,25,26,26,27,27,28,29, + 29,30,30,30,31,32,32,33, + 33,33,34,34,35,35,35,36, + 36,36,37,37,37,38,38,63, +}; + +const uint8_t ff_h264_norm_shift[512]= { + 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5, + 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + +/** + * + * @param buf_size size of buf in bits + */ +void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){ + c->bytestream_start= + c->bytestream= buf; + c->bytestream_end= buf + buf_size; + +#if CABAC_BITS == 16 + c->low = (*c->bytestream++)<<18; + c->low+= (*c->bytestream++)<<10; 
+#else + c->low = (*c->bytestream++)<<10; +#endif + c->low+= ((*c->bytestream++)<<2) + 2; + c->range= 0x1FE; +} + +void ff_init_cabac_states(){ + int i, j; + + for(i=0; i<64; i++){ + for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save + ff_h264_lps_range[j*2*64+2*i+0]= + ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j]; + } + + ff_h264_mlps_state[128+2*i+0]= + ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0; + ff_h264_mlps_state[128+2*i+1]= + ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1; + + if( i ){ +#ifdef BRANCHLESS_CABAC_DECODER + ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0; + ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1; + }else{ + ff_h264_mlps_state[128-2*i-1]= 1; + ff_h264_mlps_state[128-2*i-2]= 0; +#else + ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0; + ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1; + }else{ + ff_h264_lps_state[2*i+0]= 1; + ff_h264_lps_state[2*i+1]= 0; +#endif + } + } +} + +#ifdef TEST +#define SIZE 10240 +#define START_TIMER +#define STOP_TIMER(...) +#define av_log(...) +// #include "libavutil/lfg.h" +#include "avcodec.h" +#include "cabac.h" + +int main(void){ + CABACContext c; + uint8_t b[9*SIZE]; + uint8_t r[9*SIZE]; + int i; + uint8_t state[10]= {0}; +// AVLFG prng; + +// // av_lfg_init(&prng, 1); +// ff_init_cabac_encoder(&c, b, SIZE); +// ff_init_cabac_states(); +// +// for(i=0; i + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Context Adaptive Binary Arithmetic Coder. + */ + +#ifndef AVCODEC_CABAC_H +#define AVCODEC_CABAC_H + +//#undef NDEBUG +#include +#include "libavutil/x86_cpu.h" +#include "libavutil/attributes.h" + +#define CABAC_BITS 16 +#define CABAC_MASK ((1<low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); +#else + c->low+= c->bytestream[0]<<1; +#endif + c->low -= CABAC_MASK; + c->bytestream+= CABAC_BITS/8; +} + +static void refill2(CABACContext *c){ + int i, x; + + x= c->low ^ (c->low-1); + i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; + + x= -CABAC_MASK; + +#if CABAC_BITS == 16 + x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); +#else + x+= c->bytestream[0]<<1; +#endif + + c->low += x<bytestream+= CABAC_BITS/8; +} + +static inline void renorm_cabac_decoder(CABACContext *c){ + while(c->range < 0x100){ + c->range+= c->range; + c->low+= c->low; + if(!(c->low & CABAC_MASK)) + refill(c); + } +} + +static inline void renorm_cabac_decoder_once(CABACContext *c){ + + int shift= (uint32_t)(c->range - 0x100)>>31; + c->range<<= shift; + c->low <<= shift; + + if(!(c->low & CABAC_MASK)) + refill(c); +} + +static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){ + + int s = *state; + int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; + int bit, lps_mask av_unused; + + c->range -= RangeLPS; +#ifndef BRANCHLESS_CABAC_DECODER + if(c->low < (c->range<<(CABAC_BITS+1))){ + bit= s&1; + *state= ff_h264_mps_state[s]; + renorm_cabac_decoder_once(c); + }else{ + bit= ff_h264_norm_shift[RangeLPS]; + c->low -= (c->range<<(CABAC_BITS+1)); + *state= ff_h264_lps_state[s]; + c->range = RangeLPS<low <<= bit; + bit= (s&1)^1; + + if(!(c->low & CABAC_MASK)){ + refill2(c); + } + } +#else /* BRANCHLESS_CABAC_DECODER */ + lps_mask= 
((c->range<<(CABAC_BITS+1)) - c->low)>>31; + + c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask; + c->range += (RangeLPS - c->range) & lps_mask; + + s^=lps_mask; + *state= (ff_h264_mlps_state+128)[s]; + bit= s&1; + + lps_mask= ff_h264_norm_shift[c->range]; + c->range<<= lps_mask; + c->low <<= lps_mask; + if(!(c->low & CABAC_MASK)) + refill2(c); +#endif /* BRANCHLESS_CABAC_DECODER */ + + return bit; +} + +static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){ + return get_cabac_inline(c, state); +} + +static int av_unused get_cabac(CABACContext *c, uint8_t * const state){ + return get_cabac_inline(c, state); +} + +static int av_unused get_cabac_bypass(CABACContext *c){ + + int range; + c->low += c->low; + + if(!(c->low & CABAC_MASK)) + refill(c); + + range= c->range<<(CABAC_BITS+1); + if(c->low < range){ + return 0; + }else{ + c->low -= range; + return 1; + } +} + +static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ + int range, mask; + c->low += c->low; + + if(!(c->low & CABAC_MASK)) + refill(c); + + range= c->range<<(CABAC_BITS+1); + c->low -= range; + mask= c->low >> 31; + range &= mask; + c->low += range; + return (val^mask)-mask; +} + +/** + * + * @return the number of bytes read or 0 if no end + */ +static int av_unused get_cabac_terminate(CABACContext *c){ + c->range -= 2; + if(c->low < c->range<<(CABAC_BITS+1)){ + renorm_cabac_decoder_once(c); + return 0; + }else{ + return c->bytestream - c->bytestream_start; + } +} + +#endif /* AVCODEC_CABAC_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/cabac_spu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/cabac_spu.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,140 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Context Adaptive Binary Arithmetic Coder. + */ + +#include + +#include "libavutil/common.h" +//#include "get_bits.h" +#include "cabac_spu.h" +#define av_log(...) + +int bytecount =0; +static const uint8_t lps_range[64][4]= { +{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205}, +{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166}, +{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135}, +{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110}, +{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89}, +{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72}, +{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59}, +{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48}, +{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39}, +{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31}, +{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25}, +{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21}, +{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17}, +{ 10, 12, 
14, 16}, { 9, 11, 13, 15}, { 9, 11, 12, 14}, { 8, 10, 12, 14}, +{ 8, 9, 11, 13}, { 7, 9, 11, 12}, { 7, 9, 10, 12}, { 7, 8, 10, 11}, +{ 6, 8, 9, 11}, { 6, 7, 9, 10}, { 6, 7, 8, 9}, { 2, 2, 2, 2}, +}; + +uint8_t ff_h264_mlps_state[4*64]; +uint8_t ff_h264_lps_range[4*2*64]; +uint8_t ff_h264_lps_state[2*64]; +uint8_t ff_h264_mps_state[2*64]; + +static const uint8_t mps_state[64]= { + 1, 2, 3, 4, 5, 6, 7, 8, + 9,10,11,12,13,14,15,16, + 17,18,19,20,21,22,23,24, + 25,26,27,28,29,30,31,32, + 33,34,35,36,37,38,39,40, + 41,42,43,44,45,46,47,48, + 49,50,51,52,53,54,55,56, + 57,58,59,60,61,62,62,63, +}; + +static const uint8_t lps_state[64]= { + 0, 0, 1, 2, 2, 4, 4, 5, + 6, 7, 8, 9, 9,11,11,12, + 13,13,15,15,16,16,18,18, + 19,19,21,21,22,22,23,24, + 24,25,26,26,27,27,28,29, + 29,30,30,30,31,32,32,33, + 33,33,34,34,35,35,35,36, + 36,36,37,37,37,38,38,63, +}; + +const uint8_t ff_h264_norm_shift[512]= { + 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5, + 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + +/** + 
* Fill the runtime tables ff_h264_lps_range, ff_h264_mps_state and ff_h264_mlps_state[128..255] from lps_range and mps_state; + * the LPS transitions go to ff_h264_mlps_state[0..127] if BRANCHLESS_CABAC_DECODER is defined, else to ff_h264_lps_state. + */ + +void ff_init_cabac_states(){ + int i, j; + + for(i=0; i<64; i++){ + for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save + ff_h264_lps_range[j*2*64+2*i+0]= + ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j]; + } + + ff_h264_mlps_state[128+2*i+0]= + ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0; + ff_h264_mlps_state[128+2*i+1]= + ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1; + + if( i ){ +#ifdef BRANCHLESS_CABAC_DECODER + ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0; + ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1; + }else{ + ff_h264_mlps_state[128-2*i-1]= 1; + ff_h264_mlps_state[128-2*i-2]= 0; +#else + ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0; + ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1; + }else{ + ff_h264_lps_state[2*i+0]= 1; + ff_h264_lps_state[2*i+1]= 0; +#endif + } + } +} + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/cabac_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/cabac_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,233 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Context Adaptive Binary Arithmetic Coder. 
+ */ + +#ifndef AVCODEC_CABAC_H +#define AVCODEC_CABAC_H + +//#undef NDEBUG +#include +#include "h264_dma.h" +#include "libavutil/x86_cpu.h" +#include "libavutil/attributes.h" + +#define CABAC_BITS 16 +#define CABAC_MASK ((1<bytestream == c->bytestream_end){ + if (c->bufsize>0){ + int size = (c->bufsize > sizeof(bytestream_ls)) ? sizeof(bytestream_ls) : c->bufsize; + int align = size &0xF; + int dma_size = size + (align? 16-align : 0); + + spu_dma_get(bytestream_ls, (unsigned) c->bytestream_ea, dma_size, ED_raw); + wait_dma_id(ED_raw); + c->bytestream = bytestream_ls; + c->bytestream_end = &bytestream_ls[size]; + c->bytestream_ea += dma_size; + c->bufsize -= size; + } + bytecount =0; + }else if((unsigned)c->bytestream > (unsigned)c->bytestream_end +2){ + //fprintf(stderr, "Read beyond end of frame %d\n", c->bufsize); + bytecount =0; + } +} + +static void refill(CABACContext *c){ + dma_cabac(c); + + c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); + + c->low -= CABAC_MASK; + c->bytestream+= CABAC_BITS/8; +} + +static void refill2(CABACContext *c){ + int i, x; + + dma_cabac(c); + + x= c->low ^ (c->low-1); + i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; + + x= -CABAC_MASK; + + x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); + + c->low += x<bytestream+= CABAC_BITS/8; +} + +static inline void renorm_cabac_decoder(CABACContext *c){ + while(c->range < 0x100){ + c->range+= c->range; + c->low+= c->low; + if(!(c->low & CABAC_MASK)) + refill(c); + } +} + +static inline void renorm_cabac_decoder_once(CABACContext *c){ + + int shift= (uint32_t)(c->range - 0x100)>>31; + c->range<<= shift; + c->low <<= shift; + + if(!(c->low & CABAC_MASK)) + refill(c); +} + +static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){ + + int s = *state; + int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; + int bit, lps_mask av_unused; + + c->range -= RangeLPS; +#ifndef BRANCHLESS_CABAC_DECODER + if(c->low < (c->range<<(CABAC_BITS+1))){ + bit= s&1; + 
*state= ff_h264_mps_state[s]; + renorm_cabac_decoder_once(c); + }else{ + bit= ff_h264_norm_shift[RangeLPS]; + c->low -= (c->range<<(CABAC_BITS+1)); + *state= ff_h264_lps_state[s]; + c->range = RangeLPS<low <<= bit; + bit= (s&1)^1; + + if(!(c->low & CABAC_MASK)){ + refill2(c); + } + } +#else /* BRANCHLESS_CABAC_DECODER */ + lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31; + + c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask; + c->range += (RangeLPS - c->range) & lps_mask; + + s^=lps_mask; + *state= (ff_h264_mlps_state+128)[s]; + bit= s&1; + + lps_mask= ff_h264_norm_shift[c->range]; + c->range<<= lps_mask; + c->low <<= lps_mask; + if(!(c->low & CABAC_MASK)) + refill2(c); +#endif /* BRANCHLESS_CABAC_DECODER */ + + return bit; +} + +static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){ + return get_cabac_inline(c, state); +} + +static int av_unused get_cabac(CABACContext *c, uint8_t * const state){ + return get_cabac_inline(c, state); +} + +static int av_unused get_cabac_bypass(CABACContext *c){ + + int range; + c->low += c->low; + + if(!(c->low & CABAC_MASK)) + refill(c); + + range= c->range<<(CABAC_BITS+1); + if(c->low < range){ + return 0; + }else{ + c->low -= range; + return 1; + } +} + +static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ + int range, mask; + c->low += c->low; + + if(!(c->low & CABAC_MASK)) + refill(c); + + range= c->range<<(CABAC_BITS+1); + c->low -= range; + mask= c->low >> 31; + range &= mask; + c->low += range; + return (val^mask)-mask; +} + +/** + * + * @return the number of bytes read or 0 if no end + */ +static int av_unused get_cabac_terminate(CABACContext *c){ + c->range -= 2; + if(c->low < c->range<<(CABAC_BITS+1)){ + renorm_cabac_decoder_once(c); + return 0; + }else{ + return c->bytestream - c->bytestream_start; + } +} + +#endif /* AVCODEC_CABAC_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/dsputil_spu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/libavcodec/cell/dsputil_spu.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,1147 @@ +/* + * Copyright (c) 2009 TUDelft + * + * Cell Parallel SPU - 2DWave Macroblock Decoding. + */ + +/** + * @file libavcodec/cell/dsputil_spu.c + * Cell Parallel SPU - 2DWave Macroblock Decoding + * @author C C Chi + * + * SIMD SPU kernels + * H.264/AVC motion compensation + * @author Mauricio Alvarez + * @author Albert Paradis + */ + + +#include "dsputil_spu.h" +#include "h264_idct_spu.h" +#include "h264_deblock_spu.h" +#include "types_spu.h" +#include "libavutil/intreadwrite.h" + +#include +#include +#include +#include + +//Luma interpolation +#define PUT_OP_U8_SPU(d, s, dst) (void) dst; d = s +#define AVG_OP_U8_SPU(d, s, dst) d = spu_avg(dst, s) + +#define OP_U8_SPU PUT_OP_U8_SPU +#define PREFIX_h264_qpel16_h_lowpass_spu put_h264_qpel16_h_lowpass_spu +#define PREFIX_h264_qpel16_v_lowpass_spu put_h264_qpel16_v_lowpass_spu +#define PREFIX_h264_qpel16_hv_lowpass_spu put_h264_qpel16_hv_lowpass_spu +#define PREFIX_h264_qpel8_h_lowpass_spu put_h264_qpel8_h_lowpass_spu +#define PREFIX_h264_qpel8_v_lowpass_spu put_h264_qpel8_v_lowpass_spu +#define PREFIX_h264_qpel8_hv_lowpass_spu put_h264_qpel8_hv_lowpass_spu +#define PREFIX_h264_qpel4_h_lowpass_spu put_h264_qpel4_h_lowpass_spu +#define PREFIX_h264_qpel4_v_lowpass_spu put_h264_qpel4_v_lowpass_spu +#define PREFIX_h264_qpel4_hv_lowpass_spu put_h264_qpel4_hv_lowpass_spu +#include "h264_luma_template_spu.c" +#undef OP_U8_SPU +#undef PREFIX_h264_qpel16_h_lowpass_spu +#undef PREFIX_h264_qpel16_v_lowpass_spu +#undef PREFIX_h264_qpel16_hv_lowpass_spu +#undef PREFIX_h264_qpel8_h_lowpass_spu +#undef PREFIX_h264_qpel8_v_lowpass_spu +#undef PREFIX_h264_qpel8_hv_lowpass_spu +#undef PREFIX_h264_qpel4_h_lowpass_spu +#undef PREFIX_h264_qpel4_v_lowpass_spu +#undef PREFIX_h264_qpel4_hv_lowpass_spu + +#define OP_U8_SPU AVG_OP_U8_SPU +#define PREFIX_h264_qpel16_h_lowpass_spu avg_h264_qpel16_h_lowpass_spu +#define PREFIX_h264_qpel16_v_lowpass_spu 
avg_h264_qpel16_v_lowpass_spu +#define PREFIX_h264_qpel16_hv_lowpass_spu avg_h264_qpel16_hv_lowpass_spu +#define PREFIX_h264_qpel8_h_lowpass_spu avg_h264_qpel8_h_lowpass_spu +#define PREFIX_h264_qpel8_v_lowpass_spu avg_h264_qpel8_v_lowpass_spu +#define PREFIX_h264_qpel8_hv_lowpass_spu avg_h264_qpel8_hv_lowpass_spu +#define PREFIX_h264_qpel4_h_lowpass_spu avg_h264_qpel4_h_lowpass_spu +#define PREFIX_h264_qpel4_v_lowpass_spu avg_h264_qpel4_v_lowpass_spu +#define PREFIX_h264_qpel4_hv_lowpass_spu avg_h264_qpel4_hv_lowpass_spu +#include "h264_luma_template_spu.c" +#undef OP_U8_SPU +#undef PREFIX_h264_qpel16_h_lowpass_spu +#undef PREFIX_h264_qpel16_v_lowpass_spu +#undef PREFIX_h264_qpel16_hv_lowpass_spu +#undef PREFIX_h264_qpel8_h_lowpass_spu +#undef PREFIX_h264_qpel8_v_lowpass_spu +#undef PREFIX_h264_qpel8_hv_lowpass_spu +#undef PREFIX_h264_qpel4_h_lowpass_spu +#undef PREFIX_h264_qpel4_v_lowpass_spu +#undef PREFIX_h264_qpel4_hv_lowpass_spu + +#define H264_MC(OPNAME, SIZE, CODETYPE) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, dst_stride, STRIDE_Y, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){ \ + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\ + OPNAME ## pixels ## SIZE ## 
_l2_ ## CODETYPE(dst, src+1, half, dst_stride, STRIDE_Y, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+STRIDE_Y, half, dst_stride, STRIDE_Y, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 
+ DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, dst_stride, 16, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ + OPNAME ## pixels ## SIZE ## _l2_ ## 
CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\ +}\ + + +/**************************/ +/* put pixels functions */ +/*************************/ + +static void put_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1, + const uint8_t * src2, int dst_stride, + int src_stride1, int h) +{ + int i; + + const int perm_src1 = (unsigned int) src1 & 15; + + for (i=0; i> log2_denom ) +#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) +#define H264_WEIGHT(W,H) \ +static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \ + int y; \ + offset <<= log2_denom; \ + if(log2_denom) offset += 1<<(log2_denom-1); \ + for(y=0; y> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); + tc++; + } + if( FFABS( q2 - q0 ) < beta ) { + pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); + tc++; + } + + i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) 
+ 4) >> 3, -tc, tc ); + pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */ + pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ + } + pix += ystride; + } + } +} +static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0); +} +static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0); +} + +static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) +{ + int d; + for( d = 0; d < 16; d++ ) { + const int p2 = pix[-3*xstride]; + const int p1 = pix[-2*xstride]; + const int p0 = pix[-1*xstride]; + + const int q0 = pix[ 0*xstride]; + const int q1 = pix[ 1*xstride]; + const int q2 = pix[ 2*xstride]; + + if( FFABS( p0 - q0 ) < alpha && + FFABS( p1 - p0 ) < beta && + FFABS( q1 - q0 ) < beta ) { + + if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ + if( FFABS( p2 - p0 ) < beta) + { + const int p3 = pix[-4*xstride]; + /* p0', p1', p2' */ + pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; + pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; + pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; + } else { + /* p0' */ + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; + } + if( FFABS( q2 - q0 ) < beta) + { + const int q3 = pix[3*xstride]; + /* q0', q1', q2' */ + pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; + pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; + pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; + } else { + /* q0' */ + pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; + } + }else{ + /* p0', q0' */ + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; + pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; + } + } + pix += ystride; + } +} +static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +{ + h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta); 
+} +static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +{ + h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta); +} + +static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) +{ + int i, d; + for( i = 0; i < 4; i++ ) { + const int tc = tc0[i]; + if( tc <= 0 ) { + pix += 2*ystride; + continue; + } + for( d = 0; d < 2; d++ ) { + const int p0 = pix[-1*xstride]; + const int p1 = pix[-2*xstride]; + const int q0 = pix[0]; + const int q1 = pix[1*xstride]; + + if( FFABS( p0 - q0 ) < alpha && + FFABS( p1 - p0 ) < beta && + FFABS( q1 - q0 ) < beta ) { + + int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + + pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */ + pix[0] = av_clip_uint8( q0 - delta ); /* q0' */ + } + pix += ystride; + } + } +} +static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0); +} +static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0); +} + +static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) +{ + int d; + for( d = 0; d < 8; d++ ) { + const int p0 = pix[-1*xstride]; + const int p1 = pix[-2*xstride]; + const int q0 = pix[0]; + const int q1 = pix[1*xstride]; + + if( FFABS( p0 - q0 ) < alpha && + FFABS( p1 - p0 ) < beta && + FFABS( q1 - q0 ) < beta ) { + + pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ + pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ + } + pix += ystride; + } +} +static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +{ + h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta); +} +static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +{ + 
h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta); +} + + +void dsputil_h264_init_cell(DSPContext_spu* c) { + + c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; + c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c; + c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c; + c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c; + c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c; + c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; + c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; + c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; + + c->h264_idct_add[0] = h264_idct8_add_spu; + c->h264_idct_add[1] = h264_idct4_add_spu; + + + c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_spu; + c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_spu; + c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_spu; + c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_spu; + c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_spu; + c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_spu; + + c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; + c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; + c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c; + c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c; + c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c; + c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c; + c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c; + c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c; + c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c; + c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c; + c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; + c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c; + c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c; + c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c; + c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c; + 
c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c; + c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c; + c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c; + c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c; + c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c; + + +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_spu; \ + c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_spu; \ + c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_spu; \ + c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_spu; \ + c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_spu; \ + c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_spu; \ + c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_spu; \ + c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_spu; \ + c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_spu; \ + c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_spu; \ + c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_spu; \ + c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_spu; \ + c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_spu; \ + c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_spu; \ + c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_spu; \ + c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_spu + + dspfunc(put_h264_qpel, 0, 16); + dspfunc(put_h264_qpel, 1, 8); + dspfunc(put_h264_qpel, 2, 4); + + dspfunc(avg_h264_qpel, 0, 16); + dspfunc(avg_h264_qpel, 1, 8); + dspfunc(avg_h264_qpel, 2, 4); + +#undef dspfunc + + +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/dsputil_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/dsputil_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,34 @@ +#ifndef DSPUTIL_CELL_H +#define DSPUTIL_CELL_H + +#include "types_spu.h" + +typedef struct DSPContext_spu { + + void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); + void 
(*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0); + /* v/h_loop_filter_luma_intra: align 16 */ + void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); + void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); + void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0); + void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0); + void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); + void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); + + qpel_mc_func put_h264_qpel_pixels_tab[3][16]; /* first index 0, 1, 2 = 16-, 8-, 4-wide routines; second index = X + 4*Y for the routine named mcXY (as assigned by dsputil_h264_init_cell) */ + qpel_mc_func avg_h264_qpel_pixels_tab[3][16]; /* same layout as put_h264_qpel_pixels_tab */ + + h264_chroma_mc_func put_h264_chroma_pixels_tab[3]; /* index 0, 1, 2 = mc8, mc4, mc2 routines */ + h264_chroma_mc_func avg_h264_chroma_pixels_tab[3]; /* same layout as put_h264_chroma_pixels_tab */ + + h264_idct_func h264_idct_add[2]; /* [0] = idct8 add, [1] = idct4 add */ + + h264_weight_func weight_h264_pixels_tab[10]; /* index 0..9 = 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4, 4x2, 2x4, 2x2 */ + h264_biweight_func biweight_h264_pixels_tab[10]; /* same size order as weight_h264_pixels_tab */ + +} DSPContext_spu; + + +void dsputil_h264_init_cell(DSPContext_spu* c); /* assigns the function pointers of *c */ + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_cabac_spu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_cabac_spu.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,2633 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 cabac decoding. + * @author Michael Niedermayer + */ +#define CELL_SPE +#include +#include +#include "libavutil/intreadwrite.h" +#include "libavutil/mem.h" +#include "libavcodec/avcodec.h" +#include "h264_deblock_spu.h" +#include "h264_pred_spu.h" +#include "h264_direct_spu.h" +#include "h264_tables.h" +#include "mathops_spu.h" +//#include "libavcodec/h264_data.h" +#include "cabac_spu.h" +#include "rectangle_spu.h" +#include "libavutil/log.h" + +//#undef NDEBUG +#include +#define INT_BIT (sizeof(int) * 8) +/* Cabac pre state table */ +typedef struct IMbInfo{ + uint16_t type; + uint8_t pred_mode; + uint8_t cbp; +} IMbInfo; + +extern int bytecount; + +static const IMbInfo i_mb_type_info[26]={ +{MB_TYPE_INTRA4x4 , -1, -1}, +{MB_TYPE_INTRA16x16, 2, 0}, +{MB_TYPE_INTRA16x16, 1, 0}, +{MB_TYPE_INTRA16x16, 0, 0}, +{MB_TYPE_INTRA16x16, 3, 0}, +{MB_TYPE_INTRA16x16, 2, 16}, +{MB_TYPE_INTRA16x16, 1, 16}, +{MB_TYPE_INTRA16x16, 0, 16}, +{MB_TYPE_INTRA16x16, 3, 16}, +{MB_TYPE_INTRA16x16, 2, 32}, +{MB_TYPE_INTRA16x16, 1, 32}, +{MB_TYPE_INTRA16x16, 0, 32}, +{MB_TYPE_INTRA16x16, 3, 32}, +{MB_TYPE_INTRA16x16, 2, 15+0}, +{MB_TYPE_INTRA16x16, 1, 15+0}, +{MB_TYPE_INTRA16x16, 0, 15+0}, +{MB_TYPE_INTRA16x16, 3, 15+0}, +{MB_TYPE_INTRA16x16, 2, 15+16}, +{MB_TYPE_INTRA16x16, 1, 15+16}, +{MB_TYPE_INTRA16x16, 0, 15+16}, +{MB_TYPE_INTRA16x16, 3, 15+16}, +{MB_TYPE_INTRA16x16, 2, 15+32}, +{MB_TYPE_INTRA16x16, 1, 15+32}, +{MB_TYPE_INTRA16x16, 0, 15+32}, +{MB_TYPE_INTRA16x16, 3, 15+32}, +{MB_TYPE_INTRA_PCM , -1, -1}, +}; + +typedef struct PMbInfo{ + uint16_t type; + uint8_t partition_count; +} PMbInfo; + +static const PMbInfo p_mb_type_info[5]={ 
+{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 4}, +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4}, +}; + +static const PMbInfo p_sub_mb_type_info[4]={ +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, +{MB_TYPE_16x8 |MB_TYPE_P0L0 , 2}, +{MB_TYPE_8x16 |MB_TYPE_P0L0 , 2}, +{MB_TYPE_8x8 |MB_TYPE_P0L0 , 4}, +}; + +static const PMbInfo b_mb_type_info[23]={ +{MB_TYPE_DIRECT2|MB_TYPE_L0L1 , 1, }, +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, +{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, +{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, +}; + +static const PMbInfo b_sub_mb_type_info[13]={ +{MB_TYPE_DIRECT2 , 1, }, +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, +{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, +{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0 
|MB_TYPE_P1L0 , 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 4, }, +{MB_TYPE_8x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 4, }, +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, +}; + +static const int8_t cabac_context_init_I[460][2] = /* one {a, b} pair per CABAC context; ff_h264_init_cabac_states selects this table for I slices and evaluates ((a * qscale) >> 4) + b. The range comments below are context indices. */ +{ + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28,127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 }, { 7, 51 }, + + /* 11 - 23 unused for I */ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, + + /* 24- 39 */ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + + /* 40 - 53 */ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, + + /* 54 - 59 */ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, + + /* 60 - 69 */ + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 -> 87 */ + { 0, 11 }, { 1, 55 }, { 0, 69 }, { -17, 127 }, + { -13, 102 },{ 0, 82 }, { -7, 74 }, { -21, 107 }, + { -27, 127 },{ -31, 127 },{ -24, 127 }, { -18, 95 }, + { -27, 127 },{ -21, 114 },{ -30, 127 }, { -17, 123 }, + { -12, 115 },{ -16, 122 }, + + /* 88 -> 104 */ + { -11, 115 },{ -12, 63 }, { -2, 68 }, { -15, 84 }, + { -13, 104 },{ -3, 70 }, { -8, 93 }, { -10, 90 }, + { -30, 127 },{ -1, 74 }, { -6, 97 }, { -7, 91 }, + { -20, 127 },{ -4, 56 }, { -5, 82 }, { -7, 76 }, + { -22, 125 }, + + /* 105 -> 135 */ + { -7, 93 }, { -11, 87
}, { -3, 77 }, { -5, 71 }, + { -4, 63 }, { -4, 68 }, { -12, 84 }, { -7, 62 }, + { -7, 65 }, { 8, 61 }, { 5, 56 }, { -2, 66 }, + { 1, 64 }, { 0, 61 }, { -2, 78 }, { 1, 50 }, + { 7, 52 }, { 10, 35 }, { 0, 44 }, { 11, 38 }, + { 1, 45 }, { 0, 46 }, { 5, 44 }, { 31, 17 }, + { 1, 51 }, { 7, 50 }, { 28, 19 }, { 16, 33 }, + { 14, 62 }, { -13, 108 },{ -15, 100 }, + + /* 136 -> 165 */ + { -13, 101 },{ -13, 91 }, { -12, 94 }, { -10, 88 }, + { -16, 84 }, { -10, 86 }, { -7, 83 }, { -13, 87 }, + { -19, 94 }, { 1, 70 }, { 0, 72 }, { -5, 74 }, + { 18, 59 }, { -8, 102 }, { -15, 100 }, { 0, 95 }, + { -4, 75 }, { 2, 72 }, { -11, 75 }, { -3, 71 }, + { 15, 46 }, { -13, 69 }, { 0, 62 }, { 0, 65 }, + { 21, 37 }, { -15, 72 }, { 9, 57 }, { 16, 54 }, + { 0, 62 }, { 12, 72 }, + + /* 166 -> 196 */ + { 24, 0 }, { 15, 9 }, { 8, 25 }, { 13, 18 }, + { 15, 9 }, { 13, 19 }, { 10, 37 }, { 12, 18 }, + { 6, 29 }, { 20, 33 }, { 15, 30 }, { 4, 45 }, + { 1, 58 }, { 0, 62 }, { 7, 61 }, { 12, 38 }, + { 11, 45 }, { 15, 39 }, { 11, 42 }, { 13, 44 }, + { 16, 45 }, { 12, 41 }, { 10, 49 }, { 30, 34 }, + { 18, 42 }, { 10, 55 }, { 17, 51 }, { 17, 46 }, + { 0, 89 }, { 26, -19 }, { 22, -17 }, + + /* 197 -> 226 */ + { 26, -17 }, { 30, -25 }, { 28, -20 }, { 33, -23 }, + { 37, -27 }, { 33, -23 }, { 40, -28 }, { 38, -17 }, + { 33, -11 }, { 40, -15 }, { 41, -6 }, { 38, 1 }, + { 41, 17 }, { 30, -6 }, { 27, 3 }, { 26, 22 }, + { 37, -16 }, { 35, -4 }, { 38, -8 }, { 38, -3 }, + { 37, 3 }, { 38, 5 }, { 42, 0 }, { 35, 16 }, + { 39, 22 }, { 14, 48 }, { 27, 37 }, { 21, 60 }, + { 12, 68 }, { 2, 97 }, + + /* 227 -> 251 */ + { -3, 71 }, { -6, 42 }, { -5, 50 }, { -3, 54 }, + { -2, 62 }, { 0, 58 }, { 1, 63 }, { -2, 72 }, + { -1, 74 }, { -9, 91 }, { -5, 67 }, { -5, 27 }, + { -3, 39 }, { -2, 44 }, { 0, 46 }, { -16, 64 }, + { -8, 68 }, { -10, 78 }, { -6, 77 }, { -10, 86 }, + { -12, 92 }, { -15, 55 }, { -10, 60 }, { -6, 62 }, + { -4, 65 }, + + /* 252 -> 275 */ + { -12, 73 }, { -8, 76 }, { -7, 80 }, { -9, 88 }, + { -17, 110 },{ -11, 97 
}, { -20, 84 }, { -11, 79 }, + { -6, 73 }, { -4, 74 }, { -13, 86 }, { -13, 96 }, + { -11, 97 }, { -19, 117 },{ -8, 78 }, { -5, 33 }, + { -4, 48 }, { -2, 53 }, { -3, 62 }, { -13, 71 }, + { -10, 79 }, { -12, 86 }, { -13, 90 }, { -14, 97 }, + + /* 276 a bit special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 -> 307 */ + { -6, 93 }, { -6, 84 }, { -8, 79 }, { 0, 66 }, + { -1, 71 }, { 0, 62 }, { -2, 60 }, { -2, 59 }, + { -5, 75 }, { -3, 62 }, { -4, 58 }, { -9, 66 }, + { -1, 79 }, { 0, 71 }, { 3, 68 }, { 10, 44 }, + { -7, 62 }, { 15, 36 }, { 14, 40 }, { 16, 27 }, + { 12, 29 }, { 1, 44 }, { 20, 36 }, { 18, 32 }, + { 5, 42 }, { 1, 48 }, { 10, 62 }, { 17, 46 }, + { 9, 64 }, { -12, 104 },{ -11, 97 }, + + /* 308 -> 337 */ + { -16, 96 }, { -7, 88 }, { -8, 85 }, { -7, 85 }, + { -9, 85 }, { -13, 88 }, { 4, 66 }, { -3, 77 }, + { -3, 76 }, { -6, 76 }, { 10, 58 }, { -1, 76 }, + { -1, 83 }, { -7, 99 }, { -14, 95 }, { 2, 95 }, + { 0, 76 }, { -5, 74 }, { 0, 70 }, { -11, 75 }, + { 1, 68 }, { 0, 65 }, { -14, 73 }, { 3, 62 }, + { 4, 62 }, { -1, 68 }, { -13, 75 }, { 11, 55 }, + { 5, 64 }, { 12, 70 }, + + /* 338 -> 368 */ + { 15, 6 }, { 6, 19 }, { 7, 16 }, { 12, 14 }, + { 18, 13 }, { 13, 11 }, { 13, 15 }, { 15, 16 }, + { 12, 23 }, { 13, 23 }, { 15, 20 }, { 14, 26 }, + { 14, 44 }, { 17, 40 }, { 17, 47 }, { 24, 17 }, + { 21, 21 }, { 25, 22 }, { 31, 27 }, { 22, 29 }, + { 19, 35 }, { 14, 50 }, { 10, 57 }, { 7, 63 }, + { -2, 77 }, { -4, 82 }, { -3, 94 }, { 9, 69 }, + { -12, 109 },{ 36, -35 }, { 36, -34 }, + + /* 369 -> 398 */ + { 32, -26 }, { 37, -30 }, { 44, -32 }, { 34, -18 }, + { 34, -15 }, { 40, -15 }, { 33, -7 }, { 35, -5 }, + { 33, 0 }, { 38, 2 }, { 33, 13 }, { 23, 35 }, + { 13, 58 }, { 29, -3 }, { 26, 0 }, { 22, 30 }, + { 31, -7 }, { 35, -15 }, { 34, -3 }, { 34, 3 }, + { 36, -1 }, { 34, 5 }, { 32, 11 }, { 35, 5 }, + { 34, 12 }, { 39, 11 }, { 30, 29 }, { 34, 26 }, + { 29, 39 }, { 19, 66 }, + + /* 399 -> 435 */ + { 31, 21 }, { 31, 31 }, { 25, 50 }, + { -17, 120 }, { -20, 
112 }, { -18, 114 }, { -11, 85 }, + { -15, 92 }, { -14, 89 }, { -26, 71 }, { -15, 81 }, + { -14, 80 }, { 0, 68 }, { -14, 70 }, { -24, 56 }, + { -23, 68 }, { -24, 50 }, { -11, 74 }, { 23, -13 }, + { 26, -13 }, { 40, -15 }, { 49, -14 }, { 44, 3 }, + { 45, 6 }, { 44, 34 }, { 33, 54 }, { 19, 82 }, + { -3, 75 }, { -1, 23 }, { 1, 34 }, { 1, 43 }, + { 0, 54 }, { -2, 55 }, { 0, 61 }, { 1, 64 }, + { 0, 68 }, { -9, 92 }, + + /* 436 -> 459 */ + { -14, 106 }, { -13, 97 }, { -15, 90 }, { -12, 90 }, + { -18, 88 }, { -10, 73 }, { -9, 79 }, { -14, 86 }, + { -10, 73 }, { -10, 70 }, { -10, 69 }, { -5, 66 }, + { -9, 64 }, { -5, 58 }, { 2, 59 }, { 21, -10 }, + { 24, -11 }, { 28, -8 }, { 28, -1 }, { 29, 3 }, + { 29, 9 }, { 35, 20 }, { 29, 36 }, { 14, 67 } +}; + +static const int8_t cabac_context_init_PB[3][460][2] = /* one 460-entry table of {a, b} pairs per cabac_init_idc value; ff_h264_init_cabac_states selects cabac_context_init_PB[s->cabac_init_idc] for non-I slices */ +{ + /* i_cabac_init_idc == 0 */ + { + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 }, { 7, 51 }, + + /* 11 - 23 */ + { 23, 33 }, { 23, 2 }, { 21, 0 }, { 1, 9 }, + { 0, 49 }, { -37, 118 }, { 5, 57 }, { -13, 78 }, + { -11, 65 }, { 1, 62 }, { 12, 49 }, { -4, 73 }, + { 17, 50 }, + + /* 24 - 39 */ + { 18, 64 }, { 9, 43 }, { 29, 0 }, { 26, 67 }, + { 16, 90 }, { 9, 104 }, { -46, 127 }, { -20, 104 }, + { 1, 67 }, { -13, 78 }, { -11, 65 }, { 1, 62 }, + { -6, 86 }, { -17, 95 }, { -6, 61 }, { 9, 45 }, + + /* 40 - 53 */ + { -3, 69 }, { -6, 81 }, { -11, 96 }, { 6, 55 }, + { 7, 67 }, { -5, 86 }, { 2, 88 }, { 0, 58 }, + { -3, 76 }, { -10, 94 }, { 5, 54 }, { 4, 69 }, + { -3, 81 }, { 0, 88 }, + + /* 54 - 59 */ + { -7, 67 }, { -5, 74 }, { -4, 74 }, { -5, 80 }, + { -7, 72 }, { 1, 58 }, + + /* 60 - 69 */ + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 - 104 */ + { 0, 45 }, { -4, 78 }, { -3, 96 }, { -27, 126 }, + { -28, 98 }, { -25, 101 }, { -23, 67 }, { -28, 82 }, + { -20, 94 }, { -16, 83 }, { -22, 110 }, { -21, 91 }, + {
-18, 102 }, { -13, 93 }, { -29, 127 }, { -7, 92 }, + { -5, 89 }, { -7, 96 }, { -13, 108 }, { -3, 46 }, + { -1, 65 }, { -1, 57 }, { -9, 93 }, { -3, 74 }, + { -9, 92 }, { -8, 87 }, { -23, 126 }, { 5, 54 }, + { 6, 60 }, { 6, 59 }, { 6, 69 }, { -1, 48 }, + { 0, 68 }, { -4, 69 }, { -8, 88 }, + + /* 105 -> 165 */ + { -2, 85 }, { -6, 78 }, { -1, 75 }, { -7, 77 }, + { 2, 54 }, { 5, 50 }, { -3, 68 }, { 1, 50 }, + { 6, 42 }, { -4, 81 }, { 1, 63 }, { -4, 70 }, + { 0, 67 }, { 2, 57 }, { -2, 76 }, { 11, 35 }, + { 4, 64 }, { 1, 61 }, { 11, 35 }, { 18, 25 }, + { 12, 24 }, { 13, 29 }, { 13, 36 }, { -10, 93 }, + { -7, 73 }, { -2, 73 }, { 13, 46 }, { 9, 49 }, + { -7, 100 }, { 9, 53 }, { 2, 53 }, { 5, 53 }, + { -2, 61 }, { 0, 56 }, { 0, 56 }, { -13, 63 }, + { -5, 60 }, { -1, 62 }, { 4, 57 }, { -6, 69 }, + { 4, 57 }, { 14, 39 }, { 4, 51 }, { 13, 68 }, + { 3, 64 }, { 1, 61 }, { 9, 63 }, { 7, 50 }, + { 16, 39 }, { 5, 44 }, { 4, 52 }, { 11, 48 }, + { -5, 60 }, { -1, 59 }, { 0, 59 }, { 22, 33 }, + { 5, 44 }, { 14, 43 }, { -1, 78 }, { 0, 60 }, + { 9, 69 }, + + /* 166 - 226 */ + { 11, 28 }, { 2, 40 }, { 3, 44 }, { 0, 49 }, + { 0, 46 }, { 2, 44 }, { 2, 51 }, { 0, 47 }, + { 4, 39 }, { 2, 62 }, { 6, 46 }, { 0, 54 }, + { 3, 54 }, { 2, 58 }, { 4, 63 }, { 6, 51 }, + { 6, 57 }, { 7, 53 }, { 6, 52 }, { 6, 55 }, + { 11, 45 }, { 14, 36 }, { 8, 53 }, { -1, 82 }, + { 7, 55 }, { -3, 78 }, { 15, 46 }, { 22, 31 }, + { -1, 84 }, { 25, 7 }, { 30, -7 }, { 28, 3 }, + { 28, 4 }, { 32, 0 }, { 34, -1 }, { 30, 6 }, + { 30, 6 }, { 32, 9 }, { 31, 19 }, { 26, 27 }, + { 26, 30 }, { 37, 20 }, { 28, 34 }, { 17, 70 }, + { 1, 67 }, { 5, 59 }, { 9, 67 }, { 16, 30 }, + { 18, 32 }, { 18, 35 }, { 22, 29 }, { 24, 31 }, + { 23, 38 }, { 18, 43 }, { 20, 41 }, { 11, 63 }, + { 9, 59 }, { 9, 64 }, { -1, 94 }, { -2, 89 }, + { -9, 108 }, + + /* 227 - 275 */ + { -6, 76 }, { -2, 44 }, { 0, 45 }, { 0, 52 }, + { -3, 64 }, { -2, 59 }, { -4, 70 }, { -4, 75 }, + { -8, 82 }, { -17, 102 }, { -9, 77 }, { 3, 24 }, + { 0, 42 }, { 0, 48 }, { 0, 
55 }, { -6, 59 }, + { -7, 71 }, { -12, 83 }, { -11, 87 }, { -30, 119 }, + { 1, 58 }, { -3, 29 }, { -1, 36 }, { 1, 38 }, + { 2, 43 }, { -6, 55 }, { 0, 58 }, { 0, 64 }, + { -3, 74 }, { -10, 90 }, { 0, 70 }, { -4, 29 }, + { 5, 31 }, { 7, 42 }, { 1, 59 }, { -2, 58 }, + { -3, 72 }, { -3, 81 }, { -11, 97 }, { 0, 58 }, + { 8, 5 }, { 10, 14 }, { 14, 18 }, { 13, 27 }, + { 2, 40 }, { 0, 58 }, { -3, 70 }, { -6, 79 }, + { -8, 85 }, + + /* 276 a bit special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 - 337 */ + { -13, 106 }, { -16, 106 }, { -10, 87 }, { -21, 114 }, + { -18, 110 }, { -14, 98 }, { -22, 110 }, { -21, 106 }, + { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 }, + { -10, 96 }, { -12, 95 }, { -5, 91 }, { -9, 93 }, + { -22, 94 }, { -5, 86 }, { 9, 67 }, { -4, 80 }, + { -10, 85 }, { -1, 70 }, { 7, 60 }, { 9, 58 }, + { 5, 61 }, { 12, 50 }, { 15, 50 }, { 18, 49 }, + { 17, 54 }, { 10, 41 }, { 7, 46 }, { -1, 51 }, + { 7, 49 }, { 8, 52 }, { 9, 41 }, { 6, 47 }, + { 2, 55 }, { 13, 41 }, { 10, 44 }, { 6, 50 }, + { 5, 53 }, { 13, 49 }, { 4, 63 }, { 6, 64 }, + { -2, 69 }, { -2, 59 }, { 6, 70 }, { 10, 44 }, + { 9, 31 }, { 12, 43 }, { 3, 53 }, { 14, 34 }, + { 10, 38 }, { -3, 52 }, { 13, 40 }, { 17, 32 }, + { 7, 44 }, { 7, 38 }, { 13, 50 }, { 10, 57 }, + { 26, 43 }, + + /* 338 - 398 */ + { 14, 11 }, { 11, 14 }, { 9, 11 }, { 18, 11 }, + { 21, 9 }, { 23, -2 }, { 32, -15 }, { 32, -15 }, + { 34, -21 }, { 39, -23 }, { 42, -33 }, { 41, -31 }, + { 46, -28 }, { 38, -12 }, { 21, 29 }, { 45, -24 }, + { 53, -45 }, { 48, -26 }, { 65, -43 }, { 43, -19 }, + { 39, -10 }, { 30, 9 }, { 18, 26 }, { 20, 27 }, + { 0, 57 }, { -14, 82 }, { -5, 75 }, { -19, 97 }, + { -35, 125 }, { 27, 0 }, { 28, 0 }, { 31, -4 }, + { 27, 6 }, { 34, 8 }, { 30, 10 }, { 24, 22 }, + { 33, 19 }, { 22, 32 }, { 26, 31 }, { 21, 41 }, + { 26, 44 }, { 23, 47 }, { 16, 65 }, { 14, 71 }, + { 8, 60 }, { 6, 63 }, { 17, 65 }, { 21, 24 }, + { 23, 20 }, { 26, 23 }, { 27, 32 }, { 28, 23 }, + { 28, 24 }, { 23, 40 }, { 24, 
32 }, { 28, 29 }, + { 23, 42 }, { 19, 57 }, { 22, 53 }, { 22, 61 }, + { 11, 86 }, + + /* 399 - 435 */ + { 12, 40 }, { 11, 51 }, { 14, 59 }, + { -4, 79 }, { -7, 71 }, { -5, 69 }, { -9, 70 }, + { -8, 66 }, { -10, 68 }, { -19, 73 }, { -12, 69 }, + { -16, 70 }, { -15, 67 }, { -20, 62 }, { -19, 70 }, + { -16, 66 }, { -22, 65 }, { -20, 63 }, { 9, -2 }, + { 26, -9 }, { 33, -9 }, { 39, -7 }, { 41, -2 }, + { 45, 3 }, { 49, 9 }, { 45, 27 }, { 36, 59 }, + { -6, 66 }, { -7, 35 }, { -7, 42 }, { -8, 45 }, + { -5, 48 }, { -12, 56 }, { -6, 60 }, { -5, 62 }, + { -8, 66 }, { -8, 76 }, + + /* 436 - 459 */ + { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, + { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, + { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, + { -14, 66 }, { 0, 59 }, { 2, 59 }, { 21, -13 }, + { 33, -14 }, { 39, -7 }, { 46, -2 }, { 51, 2 }, + { 60, 6 }, { 61, 17 }, { 55, 34 }, { 42, 62 }, + }, + + /* i_cabac_init_idc == 1 */ + { + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 }, { 7, 51 }, + + /* 11 - 23 */ + { 22, 25 }, { 34, 0 }, { 16, 0 }, { -2, 9 }, + { 4, 41 }, { -29, 118 }, { 2, 65 }, { -6, 71 }, + { -13, 79 }, { 5, 52 }, { 9, 50 }, { -3, 70 }, + { 10, 54 }, + + /* 24 - 39 */ + { 26, 34 }, { 19, 22 }, { 40, 0 }, { 57, 2 }, + { 41, 36 }, { 26, 69 }, { -45, 127 }, { -15, 101 }, + { -4, 76 }, { -6, 71 }, { -13, 79 }, { 5, 52 }, + { 6, 69 }, { -13, 90 }, { 0, 52 }, { 8, 43 }, + + /* 40 - 53 */ + { -2, 69 },{ -5, 82 },{ -10, 96 },{ 2, 59 }, + { 2, 75 },{ -3, 87 },{ -3, 100 },{ 1, 56 }, + { -3, 74 },{ -6, 85 },{ 0, 59 },{ -3, 81 }, + { -7, 86 },{ -5, 95 }, + + /* 54 - 59 */ + { -1, 66 },{ -1, 77 },{ 1, 70 },{ -2, 86 }, + { -5, 72 },{ 0, 61 }, + + /* 60 - 69 */ + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 - 104 */ + { 13, 15 }, { 7, 51 }, { 2, 80 }, { -39, 127 }, + { -18, 91 }, { -17, 96 }, { -26, 
81 }, { -35, 98 }, + { -24, 102 }, { -23, 97 }, { -27, 119 }, { -24, 99 }, + { -21, 110 }, { -18, 102 }, { -36, 127 }, { 0, 80 }, + { -5, 89 }, { -7, 94 }, { -4, 92 }, { 0, 39 }, + { 0, 65 }, { -15, 84 }, { -35, 127 }, { -2, 73 }, + { -12, 104 }, { -9, 91 }, { -31, 127 }, { 3, 55 }, + { 7, 56 }, { 7, 55 }, { 8, 61 }, { -3, 53 }, + { 0, 68 }, { -7, 74 }, { -9, 88 }, + + /* 105 -> 165 */ + { -13, 103 }, { -13, 91 }, { -9, 89 }, { -14, 92 }, + { -8, 76 }, { -12, 87 }, { -23, 110 }, { -24, 105 }, + { -10, 78 }, { -20, 112 }, { -17, 99 }, { -78, 127 }, + { -70, 127 }, { -50, 127 }, { -46, 127 }, { -4, 66 }, + { -5, 78 }, { -4, 71 }, { -8, 72 }, { 2, 59 }, + { -1, 55 }, { -7, 70 }, { -6, 75 }, { -8, 89 }, + { -34, 119 }, { -3, 75 }, { 32, 20 }, { 30, 22 }, + { -44, 127 }, { 0, 54 }, { -5, 61 }, { 0, 58 }, + { -1, 60 }, { -3, 61 }, { -8, 67 }, { -25, 84 }, + { -14, 74 }, { -5, 65 }, { 5, 52 }, { 2, 57 }, + { 0, 61 }, { -9, 69 }, { -11, 70 }, { 18, 55 }, + { -4, 71 }, { 0, 58 }, { 7, 61 }, { 9, 41 }, + { 18, 25 }, { 9, 32 }, { 5, 43 }, { 9, 47 }, + { 0, 44 }, { 0, 51 }, { 2, 46 }, { 19, 38 }, + { -4, 66 }, { 15, 38 }, { 12, 42 }, { 9, 34 }, + { 0, 89 }, + + /* 166 - 226 */ + { 4, 45 }, { 10, 28 }, { 10, 31 }, { 33, -11 }, + { 52, -43 }, { 18, 15 }, { 28, 0 }, { 35, -22 }, + { 38, -25 }, { 34, 0 }, { 39, -18 }, { 32, -12 }, + { 102, -94 }, { 0, 0 }, { 56, -15 }, { 33, -4 }, + { 29, 10 }, { 37, -5 }, { 51, -29 }, { 39, -9 }, + { 52, -34 }, { 69, -58 }, { 67, -63 }, { 44, -5 }, + { 32, 7 }, { 55, -29 }, { 32, 1 }, { 0, 0 }, + { 27, 36 }, { 33, -25 }, { 34, -30 }, { 36, -28 }, + { 38, -28 }, { 38, -27 }, { 34, -18 }, { 35, -16 }, + { 34, -14 }, { 32, -8 }, { 37, -6 }, { 35, 0 }, + { 30, 10 }, { 28, 18 }, { 26, 25 }, { 29, 41 }, + { 0, 75 }, { 2, 72 }, { 8, 77 }, { 14, 35 }, + { 18, 31 }, { 17, 35 }, { 21, 30 }, { 17, 45 }, + { 20, 42 }, { 18, 45 }, { 27, 26 }, { 16, 54 }, + { 7, 66 }, { 16, 56 }, { 11, 73 }, { 10, 67 }, + { -10, 116 }, + + /* 227 - 275 */ + { -23, 112 }, { 
-15, 71 }, { -7, 61 }, { 0, 53 }, + { -5, 66 }, { -11, 77 }, { -9, 80 }, { -9, 84 }, + { -10, 87 }, { -34, 127 }, { -21, 101 }, { -3, 39 }, + { -5, 53 }, { -7, 61 }, { -11, 75 }, { -15, 77 }, + { -17, 91 }, { -25, 107 }, { -25, 111 }, { -28, 122 }, + { -11, 76 }, { -10, 44 }, { -10, 52 }, { -10, 57 }, + { -9, 58 }, { -16, 72 }, { -7, 69 }, { -4, 69 }, + { -5, 74 }, { -9, 86 }, { 2, 66 }, { -9, 34 }, + { 1, 32 }, { 11, 31 }, { 5, 52 }, { -2, 55 }, + { -2, 67 }, { 0, 73 }, { -8, 89 }, { 3, 52 }, + { 7, 4 }, { 10, 8 }, { 17, 8 }, { 16, 19 }, + { 3, 37 }, { -1, 61 }, { -5, 73 }, { -1, 70 }, + { -4, 78 }, + + /* 276 a bit special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 - 337 */ + { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 }, + { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 }, + { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 }, + { -10, 95 }, { -14, 100 }, { -8, 95 }, { -17, 111 }, + { -28, 114 }, { -6, 89 }, { -2, 80 }, { -4, 82 }, + { -9, 85 }, { -8, 81 }, { -1, 72 }, { 5, 64 }, + { 1, 67 }, { 9, 56 }, { 0, 69 }, { 1, 69 }, + { 7, 69 }, { -7, 69 }, { -6, 67 }, { -16, 77 }, + { -2, 64 }, { 2, 61 }, { -6, 67 }, { -3, 64 }, + { 2, 57 }, { -3, 65 }, { -3, 66 }, { 0, 62 }, + { 9, 51 }, { -1, 66 }, { -2, 71 }, { -2, 75 }, + { -1, 70 }, { -9, 72 }, { 14, 60 }, { 16, 37 }, + { 0, 47 }, { 18, 35 }, { 11, 37 }, { 12, 41 }, + { 10, 41 }, { 2, 48 }, { 12, 41 }, { 13, 41 }, + { 0, 59 }, { 3, 50 }, { 19, 40 }, { 3, 66 }, + { 18, 50 }, + + /* 338 - 398 */ + { 19, -6 }, { 18, -6 }, { 14, 0 }, { 26, -12 }, + { 31, -16 }, { 33, -25 }, { 33, -22 }, { 37, -28 }, + { 39, -30 }, { 42, -30 }, { 47, -42 }, { 45, -36 }, + { 49, -34 }, { 41, -17 }, { 32, 9 }, { 69, -71 }, + { 63, -63 }, { 66, -64 }, { 77, -74 }, { 54, -39 }, + { 52, -35 }, { 41, -10 }, { 36, 0 }, { 40, -1 }, + { 30, 14 }, { 28, 26 }, { 23, 37 }, { 12, 55 }, + { 11, 65 }, { 37, -33 }, { 39, -36 }, { 40, -37 }, + { 38, -30 }, { 46, -33 }, { 42, -30 }, { 40, -24 }, + { 49, -29 }, { 38, 
-12 }, { 40, -10 }, { 38, -3 }, + { 46, -5 }, { 31, 20 }, { 29, 30 }, { 25, 44 }, + { 12, 48 }, { 11, 49 }, { 26, 45 }, { 22, 22 }, + { 23, 22 }, { 27, 21 }, { 33, 20 }, { 26, 28 }, + { 30, 24 }, { 27, 34 }, { 18, 42 }, { 25, 39 }, + { 18, 50 }, { 12, 70 }, { 21, 54 }, { 14, 71 }, + { 11, 83 }, + + /* 399 - 435 */ + { 25, 32 }, { 21, 49 }, { 21, 54 }, + { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, + { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, + { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, + { -14, 66 }, { 0, 59 }, { 2, 59 }, { 17, -10 }, + { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, + { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, + { -5, 71 }, { 0, 24 }, { -1, 36 }, { -2, 42 }, + { -2, 52 }, { -9, 57 }, { -6, 63 }, { -4, 65 }, + { -4, 67 }, { -7, 82 }, + + /* 436 - 459 */ + { -3, 81 }, { -3, 76 }, { -7, 72 }, { -6, 78 }, + { -12, 72 }, { -14, 68 }, { -3, 70 }, { -6, 76 }, + { -5, 66 }, { -5, 62 }, { 0, 57 }, { -4, 61 }, + { -9, 60 }, { 1, 54 }, { 2, 58 }, { 17, -10 }, + { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, + { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, + }, + + /* i_cabac_init_idc == 2 */ + { + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 }, { 7, 51 }, + + /* 11 - 23 */ + { 29, 16 }, { 25, 0 }, { 14, 0 }, { -10, 51 }, + { -3, 62 }, { -27, 99 }, { 26, 16 }, { -4, 85 }, + { -24, 102 }, { 5, 57 }, { 6, 57 }, { -17, 73 }, + { 14, 57 }, + + /* 24 - 39 */ + { 20, 40 }, { 20, 10 }, { 29, 0 }, { 54, 0 }, + { 37, 42 }, { 12, 97 }, { -32, 127 }, { -22, 117 }, + { -2, 74 }, { -4, 85 }, { -24, 102 }, { 5, 57 }, + { -6, 93 }, { -14, 88 }, { -6, 44 }, { 4, 55 }, + + /* 40 - 53 */ + { -11, 89 },{ -15, 103 },{ -21, 116 },{ 19, 57 }, + { 20, 58 },{ 4, 84 },{ 6, 96 },{ 1, 63 }, + { -5, 85 },{ -13, 106 },{ 5, 63 },{ 6, 75 }, + { -3, 90 },{ -1, 101 }, + + /* 54 - 59 */ + { 3, 55 },{ -4, 79 },{ -2, 75 },{ -12, 97 }, + { -7, 50 },{ 1, 60 }, + + /* 60 - 69 */ + { 
0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 - 104 */ + { 7, 34 }, { -9, 88 }, { -20, 127 }, { -36, 127 }, + { -17, 91 }, { -14, 95 }, { -25, 84 }, { -25, 86 }, + { -12, 89 }, { -17, 91 }, { -31, 127 }, { -14, 76 }, + { -18, 103 }, { -13, 90 }, { -37, 127 }, { 11, 80 }, + { 5, 76 }, { 2, 84 }, { 5, 78 }, { -6, 55 }, + { 4, 61 }, { -14, 83 }, { -37, 127 }, { -5, 79 }, + { -11, 104 }, { -11, 91 }, { -30, 127 }, { 0, 65 }, + { -2, 79 }, { 0, 72 }, { -4, 92 }, { -6, 56 }, + { 3, 68 }, { -8, 71 }, { -13, 98 }, + + /* 105 -> 165 */ + { -4, 86 }, { -12, 88 }, { -5, 82 }, { -3, 72 }, + { -4, 67 }, { -8, 72 }, { -16, 89 }, { -9, 69 }, + { -1, 59 }, { 5, 66 }, { 4, 57 }, { -4, 71 }, + { -2, 71 }, { 2, 58 }, { -1, 74 }, { -4, 44 }, + { -1, 69 }, { 0, 62 }, { -7, 51 }, { -4, 47 }, + { -6, 42 }, { -3, 41 }, { -6, 53 }, { 8, 76 }, + { -9, 78 }, { -11, 83 }, { 9, 52 }, { 0, 67 }, + { -5, 90 }, { 1, 67 }, { -15, 72 }, { -5, 75 }, + { -8, 80 }, { -21, 83 }, { -21, 64 }, { -13, 31 }, + { -25, 64 }, { -29, 94 }, { 9, 75 }, { 17, 63 }, + { -8, 74 }, { -5, 35 }, { -2, 27 }, { 13, 91 }, + { 3, 65 }, { -7, 69 }, { 8, 77 }, { -10, 66 }, + { 3, 62 }, { -3, 68 }, { -20, 81 }, { 0, 30 }, + { 1, 7 }, { -3, 23 }, { -21, 74 }, { 16, 66 }, + { -23, 124 }, { 17, 37 }, { 44, -18 }, { 50, -34 }, + { -22, 127 }, + + /* 166 - 226 */ + { 4, 39 }, { 0, 42 }, { 7, 34 }, { 11, 29 }, + { 8, 31 }, { 6, 37 }, { 7, 42 }, { 3, 40 }, + { 8, 33 }, { 13, 43 }, { 13, 36 }, { 4, 47 }, + { 3, 55 }, { 2, 58 }, { 6, 60 }, { 8, 44 }, + { 11, 44 }, { 14, 42 }, { 7, 48 }, { 4, 56 }, + { 4, 52 }, { 13, 37 }, { 9, 49 }, { 19, 58 }, + { 10, 48 }, { 12, 45 }, { 0, 69 }, { 20, 33 }, + { 8, 63 }, { 35, -18 }, { 33, -25 }, { 28, -3 }, + { 24, 10 }, { 27, 0 }, { 34, -14 }, { 52, -44 }, + { 39, -24 }, { 19, 17 }, { 31, 25 }, { 36, 29 }, + { 24, 33 }, { 34, 15 }, { 30, 20 }, { 22, 73 }, + { 20, 34 }, { 19, 31 }, { 27, 44 }, { 19, 16 }, + { 15, 36 }, { 
15, 36 }, { 21, 28 }, { 25, 21 }, + { 30, 20 }, { 31, 12 }, { 27, 16 }, { 24, 42 }, + { 0, 93 }, { 14, 56 }, { 15, 57 }, { 26, 38 }, + { -24, 127 }, + + /* 227 - 275 */ + { -24, 115 }, { -22, 82 }, { -9, 62 }, { 0, 53 }, + { 0, 59 }, { -14, 85 }, { -13, 89 }, { -13, 94 }, + { -11, 92 }, { -29, 127 }, { -21, 100 }, { -14, 57 }, + { -12, 67 }, { -11, 71 }, { -10, 77 }, { -21, 85 }, + { -16, 88 }, { -23, 104 }, { -15, 98 }, { -37, 127 }, + { -10, 82 }, { -8, 48 }, { -8, 61 }, { -8, 66 }, + { -7, 70 }, { -14, 75 }, { -10, 79 }, { -9, 83 }, + { -12, 92 }, { -18, 108 }, { -4, 79 }, { -22, 69 }, + { -16, 75 }, { -2, 58 }, { 1, 58 }, { -13, 78 }, + { -9, 83 }, { -4, 81 }, { -13, 99 }, { -13, 81 }, + { -6, 38 }, { -13, 62 }, { -6, 58 }, { -2, 59 }, + { -16, 73 }, { -10, 76 }, { -13, 86 }, { -9, 83 }, + { -10, 87 }, + + /* 276 a bit special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 - 337 */ + { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 }, + { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 }, + { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 }, + { -10, 94 }, { -15, 102 }, { -10, 99 }, { -13, 106 }, + { -50, 127 }, { -5, 92 }, { 17, 57 }, { -5, 86 }, + { -13, 94 }, { -12, 91 }, { -2, 77 }, { 0, 71 }, + { -1, 73 }, { 4, 64 }, { -7, 81 }, { 5, 64 }, + { 15, 57 }, { 1, 67 }, { 0, 68 }, { -10, 67 }, + { 1, 68 }, { 0, 77 }, { 2, 64 }, { 0, 68 }, + { -5, 78 }, { 7, 55 }, { 5, 59 }, { 2, 65 }, + { 14, 54 }, { 15, 44 }, { 5, 60 }, { 2, 70 }, + { -2, 76 }, { -18, 86 }, { 12, 70 }, { 5, 64 }, + { -12, 70 }, { 11, 55 }, { 5, 56 }, { 0, 69 }, + { 2, 65 }, { -6, 74 }, { 5, 54 }, { 7, 54 }, + { -6, 76 }, { -11, 82 }, { -2, 77 }, { -2, 77 }, + { 25, 42 }, + + /* 338 - 398 */ + { 17, -13 }, { 16, -9 }, { 17, -12 }, { 27, -21 }, + { 37, -30 }, { 41, -40 }, { 42, -41 }, { 48, -47 }, + { 39, -32 }, { 46, -40 }, { 52, -51 }, { 46, -41 }, + { 52, -39 }, { 43, -19 }, { 32, 11 }, { 61, -55 }, + { 56, -46 }, { 62, -50 }, { 81, -67 }, { 45, -20 }, + { 35, -2 }, { 
28, 15 }, { 34, 1 }, { 39, 1 }, + { 30, 17 }, { 20, 38 }, { 18, 45 }, { 15, 54 }, + { 0, 79 }, { 36, -16 }, { 37, -14 }, { 37, -17 }, + { 32, 1 }, { 34, 15 }, { 29, 15 }, { 24, 25 }, + { 34, 22 }, { 31, 16 }, { 35, 18 }, { 31, 28 }, + { 33, 41 }, { 36, 28 }, { 27, 47 }, { 21, 62 }, + { 18, 31 }, { 19, 26 }, { 36, 24 }, { 24, 23 }, + { 27, 16 }, { 24, 30 }, { 31, 29 }, { 22, 41 }, + { 22, 42 }, { 16, 60 }, { 15, 52 }, { 14, 60 }, + { 3, 78 }, { -16, 123 }, { 21, 53 }, { 22, 56 }, + { 25, 61 }, + + /* 399 - 435 */ + { 21, 33 }, { 19, 50 }, { 17, 61 }, + { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, + { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, + { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, + { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, + { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, + { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, + { -9, 71 }, { -7, 37 }, { -8, 44 }, { -11, 49 }, + { -10, 56 }, { -12, 59 }, { -8, 63 }, { -9, 67 }, + { -6, 68 }, { -10, 79 }, + + /* 436 - 459 */ + { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, + { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, + { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, + { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, + { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, + { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, + } +}; + +static const uint8_t left_block_options[4][16]={ + {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, + {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, + {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}, + {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8} +}; + +void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c) { /* Writes the 460 entries of c->cabac_state from s->qscale and the init table chosen by s->slice_type_nos and s->cabac_init_idc. */ + int i; + const int8_t (*tab)[2]; + + if( s->slice_type_nos == FF_I_TYPE ) tab = cabac_context_init_I; + else tab = cabac_context_init_PB[s->cabac_init_idc]; + + /* calculate pre-state */ + for( i= 0; i < 460; i++ ) { + int pre =
2*(((tab[i][0] * s->qscale) >>4 ) + tab[i][1]) - 127; + + pre^= pre>>31; /* pre>>31 is -1 for a negative pre (arithmetic shift assumed) and 0 otherwise, so a negative pre becomes ~pre */ + if(pre > 124) + pre= 124 + (pre&1); /* cap at 124 or 125, keeping the low bit of pre */ + + c->cabac_state[i] = pre; + } +} + +static void fill_decode_neighbors(H264Cabac_spu *hc, EDSlice_spu *s){ /* Stores into s->m the mb types read from hc->mb_type_top[mb_x] (top_type) and hc->mb_type[mb_x-1] (left_type). */ + H264Mb *m = s->m; + const int mb_x = m->mb_x; + const int mb_y = m->mb_y; /* not used in this function */ + + m->top_type = hc->mb_type_top[mb_x]; + m->left_type = hc->mb_type[mb_x-1] ; /* NOTE(review): index is -1 when mb_x is 0; presumably hc->mb_type has a valid entry before element 0 - confirm */ + +} + +static void fill_decode_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){ + H264Mb *m = s->m; + int topleft_xy, top_xy, topright_xy, left_xy; + int topleft_type, top_type, topright_type, left_type; + const uint8_t * left_block= left_block_options[0]; + const int mb_x = m->mb_x; + const int mb_y = m->mb_y; + const int b_stride = hc->b_stride; + int i; + + topleft_type = hc->mb_type_top[mb_x-1] ; + top_type = m->top_type ; + topright_type= hc->mb_type_top[mb_x+1] ; + left_type = m->left_type ; + + if (s->slice_type_nos == FF_B_TYPE){ + get_list = get_list_buf; + for(int i=0; i<2; i++){ + get_dma_list(hc->list1_motion_val[i], s->list1.motion_val[i][4*mb_x + 4*mb_y*b_stride], 16, 4, b_stride*2*sizeof(int16_t), ED_get_mv, 0); + } + if (hc->blocking) wait_dma_id(ED_get_mv); + } + + if(!IS_SKIP(mb_type)){ + if(IS_INTRA(mb_type)){ + int type_mask= s->pps.constrained_intra_pred ?
IS_INTRA(-1) : -1; + m->topleft_samples_available= + m->top_samples_available= + m->left_samples_available= 0xFFFF; + m->topright_samples_available= 0xEEEA; + + if(!(top_type & type_mask)){ + m->topleft_samples_available= 0xB3FF; + m->top_samples_available= 0x33FF; + m->topright_samples_available= 0x26EA; + } + if(!(left_type & type_mask)){ + m->topleft_samples_available&= 0xDF5F; + m->left_samples_available&= 0x5F5F; + } + + if(!(topleft_type & type_mask)) + m->topleft_samples_available&= 0x7FFF; + + if(!(topright_type & type_mask)) + m->topright_samples_available&= 0xFBFF; + + if(IS_INTRA4x4(mb_type)){ + if(IS_INTRA4x4(top_type)){ + AV_COPY32(m->intra4x4_pred_mode_cache+4+8*0, &hc->intra4x4_pred_mode_top[8*mb_x]); + }else{ + m->intra4x4_pred_mode_cache[4+8*0]= + m->intra4x4_pred_mode_cache[5+8*0]= + m->intra4x4_pred_mode_cache[6+8*0]= + m->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask); + } + for(i=0; i<2; i++){ + if(IS_INTRA4x4(left_type)){ + int8_t *mode= &hc->intra4x4_pred_mode[8*(mb_x-1)]; + m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= mode[6-left_block[0+2*i]]; + m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= mode[6-left_block[1+2*i]]; + }else{ + m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= + m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= 2 - 3*!(left_type & type_mask); + } + } + } + } + if(top_type){ + AV_COPY32(&m->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]); + m->non_zero_count_cache[1+8*0]= hc->non_zero_count_top[mb_x][1+1*8]; + m->non_zero_count_cache[2+8*0]= hc->non_zero_count_top[mb_x][2+1*8]; + m->non_zero_count_cache[1+8*3]= hc->non_zero_count_top[mb_x][1+2*8]; + m->non_zero_count_cache[2+8*3]= hc->non_zero_count_top[mb_x][2+2*8]; + }else { + m->non_zero_count_cache[1+8*0]= + m->non_zero_count_cache[2+8*0]= + m->non_zero_count_cache[1+8*3]= + m->non_zero_count_cache[2+8*3]= + AV_WN32A(&m->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 
0 : 0x40404040); + } + + for (i=0; i<2; i++) { + if(left_type){ + m->non_zero_count_cache[3+8*1 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+0+2*i]]; + m->non_zero_count_cache[3+8*2 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+1+2*i]]; + m->non_zero_count_cache[0+8*1 + 8*i]= hc->non_zero_count[mb_x-1][left_block[8+4+2*i]]; + m->non_zero_count_cache[0+8*4 + 8*i]= hc->non_zero_count[mb_x-1][left_block[8+5+2*i]]; + }else{ + m->non_zero_count_cache[3+8*1 + 2*8*i]= + m->non_zero_count_cache[3+8*2 + 2*8*i]= + m->non_zero_count_cache[0+8*1 + 8*i]= + m->non_zero_count_cache[0+8*4 + 8*i]= !IS_INTRA(mb_type) ? 0 : 64; + } + } + + + // top_cbp + if(top_type) { + hc->top_cbp = hc->cbp_top[mb_x]; + } else { + hc->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F; + } + // left_cbp + if (left_type) { + hc->left_cbp = (hc->cbp[mb_x-1] & 0x1f0) + | ((hc->cbp[mb_x-1]>>(left_block[0]&(~1)))&2) + | (((hc->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2); + } else { + hc->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F; + } + } + + if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){ + int list; + + m->ref_cache[0][scan8[5 ]+1] = m->ref_cache[0][scan8[7 ]+1] = m->ref_cache[0][scan8[13]+1] = + m->ref_cache[1][scan8[5 ]+1] = m->ref_cache[1][scan8[7 ]+1] = m->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE; + + for(list=0; listlist_count; list++){ + if(!USES_LIST(mb_type, list)){ + continue; + } + assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred)); + + if(USES_LIST(top_type, list)){ + const int b_xy= 4*mb_x + 3*hc->b_stride; + AV_COPY128(m->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]); + m->ref_cache[list][scan8[0] + 0 - 1*8]= + m->ref_cache[list][scan8[0] + 1 - 1*8]= hc->ref_index_top[list][4*mb_x + 2]; + m->ref_cache[list][scan8[0] + 2 - 1*8]= + m->ref_cache[list][scan8[0] + 3 - 1*8]= hc->ref_index_top[list][4*mb_x + 3]; + }else{ + AV_ZERO128(m->mv_cache[list][scan8[0] + 0 - 1*8]); + AV_WN32A(&m->ref_cache[list][scan8[0] + 0 - 1*8], 
((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101); + } + + if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){ + for(i=0; i<2; i++){ + int cache_idx = scan8[0] - 1 + i*2*8; + if(USES_LIST(left_type, list)){ + const int b_xy= 4*(mb_x-1) + 3; + const int b8_x= 4*(mb_x-1) + 1; + AV_COPY32(m->mv_cache[list][cache_idx ], hc->motion_val[list][b_xy + hc->b_stride*left_block[0+i*2]]); + AV_COPY32(m->mv_cache[list][cache_idx+8], hc->motion_val[list][b_xy + hc->b_stride*left_block[1+i*2]]); + m->ref_cache[list][cache_idx ]= hc->ref_index[list][b8_x + (left_block[0+i*2]&~1)]; + m->ref_cache[list][cache_idx+8]= hc->ref_index[list][b8_x + (left_block[1+i*2]&~1)]; + }else{ + AV_ZERO32(m->mv_cache [list][cache_idx ]); + AV_ZERO32(m->mv_cache [list][cache_idx+8]); + m->ref_cache[list][cache_idx ]= + m->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE); + } + } + }else{ + if(USES_LIST(left_type, list)){ + const int b_x = 4*(mb_x-1) + 3; + const int b8_x= 4*(mb_x-1) + 1; + AV_COPY32(m->mv_cache[list][scan8[0] - 1], hc->motion_val[list][b_x + hc->b_stride*left_block[0]]); + m->ref_cache[list][scan8[0] - 1]= hc->ref_index[list][b8_x + (left_block[0]&~1)]; + }else{ + AV_ZERO32(m->mv_cache [list][scan8[0] - 1]); + m->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; + } + } + + if(USES_LIST(topright_type, list)){ + const int b_xy= 4*(mb_x+1) + 3*hc->b_stride; + AV_COPY32(m->mv_cache[list][scan8[0] + 4 - 1*8], hc->motion_val_top[list][b_xy]); + m->ref_cache[list][scan8[0] + 4 - 1*8]= hc->ref_index_top[list][4*(mb_x+1) + 2]; + }else{ + AV_ZERO32(m->mv_cache [list][scan8[0] + 4 - 1*8]); + m->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE; + } + if(m->ref_cache[list][scan8[0] + 4 - 1*8] < 0){ + int topleft_partition= -1; + if(USES_LIST(topleft_type, list)){ + const int b_xy = 4*(mb_x-1) + 3 + hc->b_stride + (topleft_partition & 2*hc->b_stride); + const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2); + AV_COPY32(m->mv_cache[list][scan8[0] - 1 - 1*8], hc->motion_val_top[list][b_xy]); + m->ref_cache[list][scan8[0] - 1 - 1*8]= hc->ref_index_top[list][b8_x]; + }else{ + AV_ZERO32(m->mv_cache[list][scan8[0] - 1 - 1*8]); + m->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; + } + } + + if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) + continue; + + if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) { + m->ref_cache[list][scan8[4 ]] = + m->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE; + AV_ZERO32(m->mv_cache [list][scan8[4 ]]); + AV_ZERO32(m->mv_cache [list][scan8[12]]); + + + /* XXX beurk, Load mvd */ + if(USES_LIST(top_type, list)){ +// const int b_xy= hc->mb2br_top_xy; + AV_COPY64(hc->mvd_cache[list][scan8[0] + 0 - 1*8], hc->mvd_top[list][8*mb_x + 0]); + }else{ + AV_ZERO64(hc->mvd_cache[list][scan8[0] + 0 - 1*8]); + } + if(USES_LIST(left_type, list)){ +// const int b_xy= hc->mb2br_left_xy + 6; + AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 0*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[0]]); + AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 1*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[1]]); + }else{ + AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 0*8]); + AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 1*8]); + } + if(USES_LIST(left_type, list)){ +// const int b_xy= hc->mb2br_left_xy + 6; + AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 2*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[2]]); + AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 3*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[3]]); + }else{ + AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 2*8]); + AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 3*8]); + } + AV_ZERO16(hc->mvd_cache 
[list][scan8[4 ]]); + AV_ZERO16(hc->mvd_cache [list][scan8[12]]); + if(s->slice_type_nos == FF_B_TYPE){ + fill_rectangle(&hc->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1); + + if(IS_DIRECT(top_type)){ + AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1)); + }else if(IS_8X8(top_type)){ + int b8_x = 4*mb_x; + hc->direct_cache[scan8[0] + 0 - 1*8]= hc->direct_top[b8_x + 2]; + hc->direct_cache[scan8[0] + 2 - 1*8]= hc->direct_top[b8_x + 3]; + }else{ + AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1)); + } + + if(IS_DIRECT(left_type)) + hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1; + else if(IS_8X8(left_type)) + hc->direct_cache[scan8[0] - 1 + 0*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)]; + else + hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1; + + if(IS_DIRECT(left_type)) + hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1; + else if(IS_8X8(left_type)) + hc->direct_cache[scan8[0] - 1 + 2*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)]; + else + hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1; + } + } + } + } + hc->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type); + + if (s->slice_type_nos == FF_B_TYPE){ + wait_dma_id(ED_get_mv); + } +} + +static int check_mv(H264Cabac_spu *hc, EDSlice_spu *s, long b_idx, long bn_idx, int mvy_limit){ + int v; + + v= hc->ref_cache[0][b_idx] != hc->ref_cache[0][bn_idx]; + if(!v && hc->ref_cache[0][b_idx]!=-1) + // absolute value >= 7 | ... 
+ v= ((unsigned) (hc->mv_cache[0][b_idx][0] - hc->mv_cache[0][bn_idx][0] + 3) >= 7U) | + ((FFABS( hc->mv_cache[0][b_idx][1] - hc->mv_cache[0][bn_idx][1] )) >= mvy_limit); + + if(s->list_count==2){ + if(!v) + v = (hc->ref_cache[1][b_idx] != hc->ref_cache[1][bn_idx]) | + ((unsigned) (hc->mv_cache[1][b_idx][0] - hc->mv_cache[1][bn_idx][0] + 3) >= 7U) | + ((FFABS( hc->mv_cache[1][b_idx][1] - hc->mv_cache[1][bn_idx][1] )) >= mvy_limit); + + if(v){ + if((hc->ref_cache[0][b_idx] != hc->ref_cache[1][bn_idx]) | + (hc->ref_cache[1][b_idx] != hc->ref_cache[0][bn_idx])) + return 1; + return + ((unsigned) (hc->mv_cache[0][b_idx][0] - hc->mv_cache[1][bn_idx][0] + 3) >= 7U) | + ((FFABS( hc->mv_cache[0][b_idx][1] - hc->mv_cache[1][bn_idx][1] )) >= mvy_limit) | + ((unsigned) (hc->mv_cache[1][b_idx][0] - hc->mv_cache[0][bn_idx][0] + 3) >= 7U) | + ((FFABS( hc->mv_cache[1][b_idx][1] - hc->mv_cache[0][bn_idx][1] )) >= mvy_limit); + } + } + + return v; +} + +static void calc_bS_values(H264Cabac_spu *hc, EDSlice_spu *s, int mvy_limit, int dir) { + H264Mb *m = s->m; + int mb_type = m->mb_type; + int edge; + const int mbm_type = dir == 0 ? m->left_type : m->top_type; + + // how often to recheck mv-based bS when iterating between edges + static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1}, + {0,3,1,1,3,3,3,3}}; + const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7]; + const int edges = mask_edge== 3 && !(m->cbp&15) ? 1 : 4; + // how often to recheck mv-based bS when iterating along each edge + const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); + + m->edges[dir]= edges; + + if(mbm_type){ + int16_t* bS=m->bS[dir][0]; + if( IS_INTRA(mb_type|mbm_type)) { + AV_WN64A(bS, 0x0004000400040004ULL); + } else { + int i; + int mv_done; + if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { + int b_idx= 8 + 4; + int bn_idx= b_idx - (dir ? 
8:1); + + bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, 8 + 4, bn_idx, mvy_limit); + mv_done = 1; + } + else + mv_done = 0; + + for( i = 0; i < 4; i++ ) { + int x = dir == 0 ? 0 : i; + int y = dir == 0 ? i : 0; + int b_idx= 8 + 4 + x + 8*y; + int bn_idx= b_idx - (dir ? 8:1); + + if( hc->non_zero_count_cache[b_idx] | + hc->non_zero_count_cache[bn_idx] ) { + bS[i] = 2; + } + else if(!mv_done) + { + bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit); + } + } + } + } + + /* Calculate bS */ + for( edge = 1; edge < edges; edge++ ) { + int16_t* bS=m->bS[dir][edge]; + + if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) + continue; + + if( IS_INTRA(mb_type)) { + AV_WN64A(bS, 0x0003000300030003ULL); + } else { + int i; + int mv_done; + + if( edge & mask_edge ) { + AV_ZERO64(bS); + mv_done = 1; + } + else if( mask_par0 ) { + int b_idx= 8 + 4 + edge * (dir ? 8:1); + int bn_idx= b_idx - (dir ? 8:1); + + bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, b_idx, bn_idx, mvy_limit); + mv_done = 1; + } + else + mv_done = 0; + + for( i = 0; i < 4; i++ ) { + int x = dir == 0 ? edge : i; + int y = dir == 0 ? i : edge; + int b_idx= 8 + 4 + x + 8*y; + int bn_idx= b_idx - (dir ? 8:1); + + if( hc->non_zero_count_cache[b_idx] | + hc->non_zero_count_cache[bn_idx] ) { + bS[i] = 2; + } + else if(!mv_done) + { + bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit); + } + } + + if(bS[0]+bS[1]+bS[2]+bS[3] == 0) + continue; + } + + } +} + +/** +* +* @return zero if the loop filter can be skiped +*/ +static int fill_filter_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){ + H264Mb *m = s->m; + const int mb_x = m->mb_x; + const int mb_y = m->mb_y; + int top_type, left_type; + int qp, top_qp, left_qp; + int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice + + m->dequant4_coeff_y = hc->dequant4_coeff[0][s->qscale][0]; + m->dequant4_coeff_cb = hc->dequant4_coeff[IS_INTRA(mb_type) ? 
1:4][s->chroma_qp[0]][0]; + m->dequant4_coeff_cr = hc->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][s->chroma_qp[1]][0]; + + m->qscale_mb_xy = qp = hc->qscale[mb_x]; + m->qscale_left_mb_xy = left_qp = hc->qscale[mb_x-1]; + m->qscale_top_mb_xy = top_qp = hc->qscale_top[mb_x]; + + //for sufficiently low qp, filtering wouldn't do anything + //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp + if(qp <= qp_thresh + && (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh) + && ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){ + m->deblock_mb = 0; + return 0; + } + + + m->deblock_mb = 1; + + top_type = hc->mb_type_top[mb_x] ; + left_type = hc->mb_type[mb_x -1]; + + m->top_type = top_type ; + m->left_type = left_type; + + if(IS_INTRA(mb_type)){ + calc_bS_values(hc, s, 4, 0); + calc_bS_values(hc, s, 4, 1); + return 1; + } + + AV_COPY64(&hc->non_zero_count_cache[0+8*1], &hc->non_zero_count[mb_x][ 0]); + AV_COPY64(&hc->non_zero_count_cache[0+8*2], &hc->non_zero_count[mb_x][ 8]); + AV_COPY32(&hc->non_zero_count_cache[0+8*5], &hc->non_zero_count[mb_x][16]); + AV_COPY32(&hc->non_zero_count_cache[4+8*3], &hc->non_zero_count[mb_x][20]); + AV_COPY64(&hc->non_zero_count_cache[0+8*4], &hc->non_zero_count[mb_x][24]); + + m->cbp= hc->cbp[mb_x]; + + { + int list; + for(list=0; listlist_count; list++){ + int8_t *ref; + int y, b_stride; + int16_t (*mv_dst)[2]; + int16_t (*mv_src)[2]; + + if(!USES_LIST(mb_type, list)){ + fill_rectangle( hc->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); + AV_WN32A(&hc->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + AV_WN32A(&hc->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + AV_WN32A(&hc->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + AV_WN32A(&hc->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + continue; + } + + ref = &hc->ref_index[list][4*mb_x]; + { + int (*ref2frm)[64] =(void *) (s->ref2frm[0] + 2); + 
AV_WN32A(&hc->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + AV_WN32A(&hc->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + ref += 2; + AV_WN32A(&hc->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + AV_WN32A(&hc->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + } + b_stride = hc->b_stride; + mv_dst = &hc->mv_cache[list][scan8[0]]; + mv_src = &hc->motion_val[list][4*mb_x]; + for(y=0; y<4; y++){ + AV_COPY128(mv_dst + 8*y, mv_src + y*b_stride); + } + + } + } + + /* + 0 . T T. T T T T + 1 L . .L . . . . + 2 L . .L . . . . + 3 . T TL . . . . + 4 L . .L . . . . + 5 L . .. . . . . + */ + //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) + if(top_type){ + AV_COPY32(&hc->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]); + } + + if(left_type){ + hc->non_zero_count_cache[3+8*1]= hc->non_zero_count[mb_x-1][7+0*8]; + hc->non_zero_count_cache[3+8*2]= hc->non_zero_count[mb_x-1][7+1*8]; + hc->non_zero_count_cache[3+8*3]= hc->non_zero_count[mb_x-1][7+2*8]; + hc->non_zero_count_cache[3+8*4]= hc->non_zero_count[mb_x-1][7+3*8]; + } + + if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){ + int list; + for(list=0; listlist_count; list++){ + if(USES_LIST(top_type, list)){ + const int b_xy= 4*mb_x + 3*hc->b_stride; + const int b8_x= 4*mb_x + 2; + int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); + AV_COPY128(hc->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]); + hc->ref_cache[list][scan8[0] + 0 - 1*8]= + hc->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 0]]; + hc->ref_cache[list][scan8[0] + 2 - 1*8]= + hc->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 1]]; + }else{ + 
AV_ZERO128(hc->mv_cache[list][scan8[0] + 0 - 1*8]); + AV_WN32A(&hc->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u); + } + + if(USES_LIST(left_type, list)){ + const int b_x = 4*(mb_x-1) + 3; + const int b8_x= 4*(mb_x-1) + 1; + int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); + AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 0 ], hc->motion_val[list][b_x + hc->b_stride*0]); + AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 8 ], hc->motion_val[list][b_x + hc->b_stride*1]); + AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +16 ], hc->motion_val[list][b_x + hc->b_stride*2]); + AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +24 ], hc->motion_val[list][b_x + hc->b_stride*3]); + hc->ref_cache[list][scan8[0] - 1 + 0 ]= + hc->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*0]]; + hc->ref_cache[list][scan8[0] - 1 +16 ]= + hc->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*1]]; + }else{ + AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 0 ]); + AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 8 ]); + AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +16 ]); + AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +24 ]); + hc->ref_cache[list][scan8[0] - 1 + 0 ]= + hc->ref_cache[list][scan8[0] - 1 + 8 ]= + hc->ref_cache[list][scan8[0] - 1 + 16 ]= + hc->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED; + } + } + } + calc_bS_values(hc, s, 4, 0); + calc_bS_values(hc, s, 4, 1); + return 1; +} + + +/** +* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks. 
+*/ +static int check_intra4x4_pred_mode(EDSlice_spu *s){ + H264Mb *m = s->m; + static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0}; + static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED}; + int i; + + if(!(m->top_samples_available&0x8000)){ + for(i=0; i<4; i++){ + int status= top[ m->intra4x4_pred_mode_cache[scan8[0] + i] ]; + if(status<0){ + fprintf(stderr, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y); + return -1; + } else if(status){ + m->intra4x4_pred_mode_cache[scan8[0] + i]= status; + } + } + } + + if((m->left_samples_available&0x8888)!=0x8888){ + static const int mask[4]={0x8000,0x2000,0x80,0x20}; + for(i=0; i<4; i++){ + if(!(m->left_samples_available&mask[i])){ + int status= left[ m->intra4x4_pred_mode_cache[scan8[0] + 8*i] ]; + if(status<0){ + fprintf(stderr, "left block unavailable for requested intra4x4 mode %d at %d %d, %x\n", status, m->mb_x, m->mb_y, m->left_samples_available); + return -1; + } else if(status){ + m->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status; + } + } + } + } + return 0; +} + +/** +* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks. 
+* The requested mode is remapped through one table per unavailable
+* neighbour; a negative table entry means the mode cannot be used.
+* @param s    slice context, read through s->m (mb position, sample-availability masks)
+* @param mode requested intra prediction mode, must be in 0..6
+* @return the mode to use (possibly remapped), or -1 after printing a
+*         diagnostic when mode is out of range or needs unavailable samples
+*/
+static int check_intra_pred_mode(EDSlice_spu *s, int mode){
+    H264Mb *m = s->m;
+    /* Replacement mode, indexed by requested mode, used when the top / left
+     * neighbour is unavailable; -1 rejects the mode, unlisted entries are zero. */
+    static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
+    static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
+
+    /* Both tables hold 7 entries, so negative values must be rejected as
+     * well as values above 6 before mode is used as an index. */
+    if(mode < 0 || mode > 6) {
+        fprintf(stderr, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y);
+        return -1;
+    }
+
+    if(!(m->top_samples_available&0x8000)){
+        const int requested = mode; /* keep the pre-remap value for the diagnostic */
+        mode= top[ mode ];
+        if(mode<0){
+            fprintf(stderr, "top block unavailable for requested intra mode %d at %d %d\n", requested, m->mb_x, m->mb_y);
+            return -1;
+        }
+    }
+
+    if((m->left_samples_available&0x8080) != 0x8080){
+        const int requested = mode; /* keep the pre-remap value for the diagnostic */
+        mode= left[ mode ];
+        /* Exactly one of the two 0x8080 bits is set here, i.e. the left
+         * neighbour is only partly usable (original note: MBAFF +
+         * constrained_intra_pred). The table result is replaced by a mode
+         * derived from which half is missing. */
+        if(m->left_samples_available&0x8080){
+            mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(m->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
+        }
+        if(mode<0){
+            fprintf(stderr, "left block unavailable for requested intra mode %d at %d %d\n", requested, m->mb_x, m->mb_y);
+            return -1;
+        }
+    }
+    return mode;
+}
+
+/**
+ * gets the predicted intra4x4 prediction mode.
+ */ +static inline int pred_intra_mode(EDSlice_spu *s, int n){ + H264Mb *m = s->m; + const int index8= scan8[n]; + const int left= m->intra4x4_pred_mode_cache[index8 - 1]; + const int top = m->intra4x4_pred_mode_cache[index8 - 8]; + const int min= FFMIN(left, top); + + if(min<0) return DC_PRED; + else return min; +} + +static void write_back_intra_pred_mode(H264Cabac_spu *hc, EDSlice_spu *s){ + H264Mb *m = s->m; + const int mb_x = m->mb_x; + int8_t *mode= &hc->intra4x4_pred_mode[8*mb_x]; + + AV_COPY32(mode, m->intra4x4_pred_mode_cache + 4 + 8*4); + mode[4]= m->intra4x4_pred_mode_cache[7+8*3]; + mode[5]= m->intra4x4_pred_mode_cache[7+8*2]; + mode[6]= m->intra4x4_pred_mode_cache[7+8*1]; +} + +static inline void write_back_non_zero_count(H264Cabac_spu *hc, EDSlice_spu *s){ + H264Mb *m = s->m; + const int mb_x= m->mb_x; + + AV_COPY64(&hc->non_zero_count[mb_x][ 0], &m->non_zero_count_cache[0+8*1]); + AV_COPY64(&hc->non_zero_count[mb_x][ 8], &m->non_zero_count_cache[0+8*2]); + AV_COPY32(&hc->non_zero_count[mb_x][16], &m->non_zero_count_cache[0+8*5]); + AV_COPY32(&hc->non_zero_count[mb_x][20], &m->non_zero_count_cache[4+8*3]); + AV_COPY64(&hc->non_zero_count[mb_x][24], &m->non_zero_count_cache[0+8*4]); +} + +static inline void write_back_motion(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){ + H264Mb *m = s->m; + const int mb_x = m->mb_x; + int b_stride = hc->b_stride; + const int b_x = 4*m->mb_x; //try mb2b(8)_xy + const int b8_x= 4*m->mb_x; + int list; + + if(!USES_LIST(mb_type, 0)) + fill_rectangle(&hc->ref_index[0][b8_x], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1); + + for(list=0; listlist_count; list++){ + int y; + int16_t (*mv_dst)[2]; + int16_t (*mv_src)[2]; + + if(!USES_LIST(mb_type, list)) + continue; + + mv_dst = &hc->motion_val[list][b_x]; + mv_src = &m->mv_cache[list][scan8[0]]; + for(y=0; y<4; y++){ + AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); + } + { + uint8_t (*mvd_dst)[2] = (void *) hc->mvd[list][8*mb_x]; + uint8_t (*mvd_src)[2] = 
&hc->mvd_cache[list][scan8[0]]; + if(IS_SKIP(mb_type)) + AV_ZERO128(mvd_dst); + else{ + AV_COPY64(mvd_dst, mvd_src + 8*3); + AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0); + AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1); + AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2); + } + } + + { + int8_t *ref_index = &hc->ref_index[list][b8_x]; + ref_index[0+0*2]= m->ref_cache[list][scan8[0]]; + ref_index[1+0*2]= m->ref_cache[list][scan8[4]]; + ref_index[0+1*2]= m->ref_cache[list][scan8[8]]; + ref_index[1+1*2]= m->ref_cache[list][scan8[12]]; + } + } + + if(s->slice_type_nos == FF_B_TYPE){ + if(IS_8X8(mb_type)){ + uint8_t *direct = &hc->direct[4*mb_x]; + direct[1] = m->sub_mb_type[1]>>1; + direct[2] = m->sub_mb_type[2]>>1; + direct[3] = m->sub_mb_type[3]>>1; + } + } +} + +static inline int get_dct8x8_allowed(EDSlice_spu *s){ + H264Mb *m = s->m; + if(s->direct_8x8_inference_flag) + return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL)); + else + return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL)); +} + +static inline int fetch_diagonal_mv(EDSlice_spu *s, const int16_t **C, int i, int list, int part_width){ + H264Mb *m = s->m; + const int topright_ref= m->ref_cache[list][ i - 8 + part_width ]; + + if(topright_ref != PART_NOT_AVAILABLE){ + *C= m->mv_cache[list][ i - 8 + part_width ]; + return topright_ref; + }else{ + *C= m->mv_cache[list][ i - 8 - 1 ]; + return m->ref_cache[list][ i - 8 - 1 ]; + } +} + +/** + * gets the predicted MV. 
+ * @param n the block index + * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4) + * @param mx the x component of the predicted motion vector + * @param my the y component of the predicted motion vector + */ +static inline void pred_motion(EDSlice_spu *s, int n, int part_width, int list, int ref, int * const mx, int * const my){ + H264Mb *m = s->m; + const int index8= scan8[n]; + const int top_ref= m->ref_cache[list][ index8 - 8 ]; + const int left_ref= m->ref_cache[list][ index8 - 1 ]; + const int16_t * const A= m->mv_cache[list][ index8 - 1 ]; + const int16_t * const B= m->mv_cache[list][ index8 - 8 ]; + const int16_t * C; + int diagonal_ref, match_count; + + assert(part_width==1 || part_width==2 || part_width==4); + +/* mv_cache + B . . A T T T T + U . . L . . , . + U . . L . . . . + U . . L . . , . + . . . L . . . . +*/ + + diagonal_ref= fetch_diagonal_mv(s, &C, index8, list, part_width); + match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref); + + if(match_count > 1){ //most common + *mx= mid_pred(A[0], B[0], C[0]); + *my= mid_pred(A[1], B[1], C[1]); + }else if(match_count==1){ + if(left_ref==ref){ + *mx= A[0]; + *my= A[1]; + }else if(top_ref==ref){ + *mx= B[0]; + *my= B[1]; + }else{ + *mx= C[0]; + *my= C[1]; + } + }else{ + if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){ + *mx= A[0]; + *my= A[1]; + }else{ + *mx= mid_pred(A[0], B[0], C[0]); + *my= mid_pred(A[1], B[1], C[1]); + } + } + +} + +/** + * gets the directionally predicted 16x8 MV. 
+ * @param n the block index + * @param mx the x component of the predicted motion vector + * @param my the y component of the predicted motion vector + */ +static inline void pred_16x8_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){ + H264Mb *m = s->m; + if(n==0){ + const int top_ref= m->ref_cache[list][ scan8[0] - 8 ]; + const int16_t * const B= m->mv_cache[list][ scan8[0] - 8 ]; + + if(top_ref == ref){ + *mx= B[0]; + *my= B[1]; + return; + } + }else{ + const int left_ref= m->ref_cache[list][ scan8[8] - 1 ]; + const int16_t * const A= m->mv_cache[list][ scan8[8] - 1 ]; + + if(left_ref == ref){ + *mx= A[0]; + *my= A[1]; + return; + } + } + + //RARE + pred_motion(s, n, 4, list, ref, mx, my); +} + +/** + * gets the directionally predicted 8x16 MV. + * @param n the block index + * @param mx the x component of the predicted motion vector + * @param my the y component of the predicted motion vector + */ +static inline void pred_8x16_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){ + H264Mb *m = s->m; + if(n==0){ + const int left_ref= m->ref_cache[list][ scan8[0] - 1 ]; + const int16_t * const A= m->mv_cache[list][ scan8[0] - 1 ]; + + if(left_ref == ref){ + *mx= A[0]; + *my= A[1]; + return; + } + }else{ + const int16_t * C; + int diagonal_ref; + + diagonal_ref= fetch_diagonal_mv(s, &C, scan8[4], list, 2); + if(diagonal_ref == ref){ + *mx= C[0]; + *my= C[1]; + return; + } + } + + //RARE + pred_motion(s, n, 2, list, ref, mx, my); +} + +static inline void pred_pskip_motion(EDSlice_spu *s, int * const mx, int * const my){ + H264Mb *m = s->m; + const int top_ref = m->ref_cache[0][ scan8[0] - 8 ]; + const int left_ref= m->ref_cache[0][ scan8[0] - 1 ]; + + if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE + || !( top_ref | AV_RN32A(m->mv_cache[0][ scan8[0] - 8 ])) + || !(left_ref | AV_RN32A(m->mv_cache[0][ scan8[0] - 1 ]))){ + + *mx = *my = 0; + return; + } + + pred_motion(s, 0, 4, 0, 0, mx, 
my); + + return; +} + +/** + * decodes a P_SKIP or B_SKIP macroblock + */ +static void decode_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s){ + H264Mb *m = s->m; + const int mb_x = m->mb_x; + int mb_type=0; + + memset(hc->non_zero_count[mb_x], 0, 32); + memset(m->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui + + if( s->slice_type_nos == FF_B_TYPE ) + { + // just for fill_caches. pred_direct_motion will set the real mb_type + mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP; + fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ... + + ff_h264_pred_direct_motion(hc, s, &mb_type); + mb_type|= MB_TYPE_SKIP; + } + else + { + int mx, my; + mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; + + fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ... + pred_pskip_motion(s, &mx, &my); + fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); + fill_rectangle( m->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4); + } + + write_back_motion(hc, s, mb_type); + hc->mb_type[mb_x]= mb_type; + m->mb_type = mb_type; + hc->qscale[mb_x]= s->qscale; + fill_filter_caches(hc, s, mb_type); +} + +static int decode_cabac_intra_mb_type(EDSlice_spu *s, CABACContext *c, int ctx_base, int intra_slice) { + H264Mb *m =s->m; + uint8_t *state= &c->cabac_state[ctx_base]; + int mb_type; + + if(intra_slice){ + int ctx=0; + if( m->left_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) + ctx++; + if( m->top_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) + ctx++; + if( get_cabac_noinline( c, &state[ctx] ) == 0 ) + return 0; /* I4x4 */ + state += 2; + }else{ + if( get_cabac_noinline( c, state ) == 0 ) + return 0; /* I4x4 */ + } + + if( get_cabac_terminate( c ) ) + return 25; /* PCM */ + + mb_type = 1; /* I16x16 */ + mb_type += 12 * get_cabac_noinline( c, &state[1] ); /* cbp_luma != 0 */ + if( get_cabac_noinline(c, &state[2] ) ) /* cbp_chroma */ + mb_type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice] ); + mb_type 
+= 2 * get_cabac_noinline(c, &state[3+intra_slice] ); + mb_type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice] ); + return mb_type; +} + +static int decode_cabac_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s, H264Mb *m, CABACContext *c) { + int ctx = 0; + const int mb_x = m->mb_x; + + if( m->mb_x>0 && !IS_SKIP( hc->mb_type[mb_x-1] )) + ctx++; + if( m->mb_y>0 && !IS_SKIP( hc->mb_type_top[mb_x] )) + ctx++; + + if( s->slice_type_nos == FF_B_TYPE ) + ctx += 13; + return get_cabac_noinline(c, &c->cabac_state[11+ctx] ); +} + +static int decode_cabac_mb_intra4x4_pred_mode( CABACContext *c, int pred_mode ) { + int mode = 0; + + if( get_cabac(c, &c->cabac_state[68] ) ) + return pred_mode; + + mode += 1 * get_cabac(c, &c->cabac_state[69] ); + mode += 2 * get_cabac(c, &c->cabac_state[69] ); + mode += 4 * get_cabac(c, &c->cabac_state[69] ); + + return mode + ( mode >= pred_mode ); +} + +static int decode_cabac_mb_chroma_pre_mode(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) { + H264Mb *m = s->m; + const int mb_x = m->mb_x; + + int ctx = 0; + + /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */ + if( m->left_type && hc->chroma_pred_mode[mb_x-1] != 0 ) + ctx++; + + if( m->top_type && hc->chroma_pred_mode_top[mb_x] != 0 ) + ctx++; + + if( get_cabac_noinline(c, &c->cabac_state[64+ctx] ) == 0 ) + return 0; + + if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) + return 1; + if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) + return 2; + else + return 3; +} + +static int decode_cabac_mb_cbp_luma(H264Cabac_spu *hc, CABACContext *c) { + int cbp_b, cbp_a, ctx, cbp = 0; + + cbp_a = hc->left_cbp; + cbp_b = hc->top_cbp; + + ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04); + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]); + ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08); + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 1; + ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01); + cbp += get_cabac_noinline(c, &c->cabac_state[73 + 
ctx]) << 2; + ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02); + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 3; + return cbp; +} +static int decode_cabac_mb_cbp_chroma(H264Cabac_spu *hc, CABACContext *c) { + int ctx; + int cbp_a, cbp_b; + + cbp_a = (hc->left_cbp>>4)&0x03; + cbp_b = (hc-> top_cbp>>4)&0x03; + + ctx = 0; + if( cbp_a > 0 ) ctx++; + if( cbp_b > 0 ) ctx += 2; + if( get_cabac_noinline(c, &c->cabac_state[77 + ctx] ) == 0 ) + return 0; + + ctx = 4; + if( cbp_a == 2 ) ctx++; + if( cbp_b == 2 ) ctx += 2; + return 1 + get_cabac_noinline(c, &c->cabac_state[77 + ctx] ); +} + +static int decode_cabac_p_mb_sub_type( CABACContext *c) { + if( get_cabac(c, &c->cabac_state[21] ) ) + return 0; /* 8x8 */ + if( !get_cabac(c, &c->cabac_state[22] ) ) + return 1; /* 8x4 */ + if( get_cabac(c, &c->cabac_state[23] ) ) + return 2; /* 4x8 */ + return 3; /* 4x4 */ +} +static int decode_cabac_b_mb_sub_type(CABACContext *c) { + int type; + if( !get_cabac(c, &c->cabac_state[36] ) ) + return 0; /* B_Direct_8x8 */ + if( !get_cabac(c, &c->cabac_state[37] ) ) + return 1 + get_cabac(c, &c->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */ + type = 3; + if( get_cabac(c, &c->cabac_state[38] ) ) { + if( get_cabac(c, &c->cabac_state[39] ) ) + return 11 + get_cabac(c, &c->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */ + type += 4; + } + type += 2*get_cabac(c, &c->cabac_state[39] ); + type += get_cabac(c, &c->cabac_state[39] ); + return type; +} + +static int decode_cabac_mb_ref(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, int list, int n ) { + H264Mb *m = s->m; + int refa = m->ref_cache[list][scan8[n] - 1]; + int refb = m->ref_cache[list][scan8[n] - 8]; + int ref = 0; + int ctx = 0; + + if( s->slice_type_nos == FF_B_TYPE) { + if( refa > 0 && !(hc->direct_cache[scan8[n] - 1]&(MB_TYPE_DIRECT2>>1)) ) + ctx++; + if( refb > 0 && !(hc->direct_cache[scan8[n] - 8]&(MB_TYPE_DIRECT2>>1)) ) + ctx += 2; + } else { + if( refa > 0 ) + ctx++; + if( refb > 0 ) + ctx += 2; + } + + while( get_cabac(c, 
&c->cabac_state[54+ctx] ) ) { + ref++; + ctx = (ctx>>2)+4; + if(ref >= 32 /*h->ref_list[list]*/){ + fprintf(stderr, "refcount %d\n", ref); + return -1; + } + } + return ref; +} + +static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) { + int mvd; + + if(!get_cabac(c, &c->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){ +// if(!get_cabac(&h->cabac, &c->cabac_state[ctxbase+(amvd>2)+(amvd>32)])){ + *mvda= 0; + return 0; + } + + mvd= 1; + ctxbase+= 3; + while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) { + if( mvd < 4 ) + ctxbase++; + mvd++; + } + + if( mvd >= 9 ) { + int k = 3; + while( get_cabac_bypass(c ) ) { + mvd += 1 << k; + k++; + if(k>24){ + fprintf(stderr, "overflow in decode_cabac_mb_mvd\n"); + return INT_MIN; + } + } + while( k-- ) { + mvd += get_cabac_bypass(c )<mvd_cache[list][scan8[n] - 1][0] +\ + hc->mvd_cache[list][scan8[n] - 8][0];\ + int amvd1 = hc->mvd_cache[list][scan8[n] - 1][1] +\ + hc->mvd_cache[list][scan8[n] - 8][1];\ +\ + mx += decode_cabac_mb_mvd( c, 40, amvd0, &mpx );\ + my += decode_cabac_mb_mvd( c, 47, amvd1, &mpy );\ +} + +static av_always_inline int get_cabac_cbf_ctx(H264Cabac_spu *hc, EDSlice_spu *s, int cat, int idx, int is_dc ) { + H264Mb *m = s->m; + int nza, nzb; + int ctx = 0; + + if( is_dc ) { + if( cat == 0 ) { + nza = hc->left_cbp&0x100; + nzb = hc-> top_cbp&0x100; + } else { + nza = (hc->left_cbp>>(6+idx))&0x01; + nzb = (hc-> top_cbp>>(6+idx))&0x01; + } + } else { + assert(cat == 1 || cat == 2 || cat == 4); + nza = m->non_zero_count_cache[scan8[idx] - 1]; + nzb = m->non_zero_count_cache[scan8[idx] - 8]; + } + + if( nza > 0 ) + ctx++; + + if( nzb > 0 ) + ctx += 2; + + return ctx + 4 * cat; +} + + uint8_t last_coeff_flag_offset_8x8[63] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 +}; + +static const int 
significant_coeff_flag_offset[2][6] = { + { 105+0, 105+15, 105+29, 105+44, 105+47, 402 }, + { 277+0, 277+15, 277+29, 277+44, 277+47, 436 } +}; +static const int last_coeff_flag_offset[2][6] = { + { 166+0, 166+15, 166+29, 166+44, 166+47, 417 }, + { 338+0, 338+15, 338+29, 338+44, 338+47, 451 } +}; +static const int coeff_abs_level_m1_offset[6] = { + 227+0, 227+10, 227+20, 227+30, 227+39, 426 +}; +static const uint8_t significant_coeff_flag_offset_8x8[2][63] = { + { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, + 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, + 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11, + 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 }, + { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5, + 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11, + 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9, + 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 } +}; +/* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). +* 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). +* map node ctx => cabac ctx for level=1 */ +static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; +/* map node ctx => cabac ctx for level>1 */ +static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; +static const uint8_t coeff_abs_level_transition[2][8] = { + /* update node ctx after decoding a level=1 */ + { 1, 2, 3, 3, 4, 5, 6, 7 }, + /* update node ctx after decoding a level>1 */ + { 4, 4, 4, 4, 5, 6, 7, 7 } +}; + +static av_always_inline void decode_cabac_residual_internal(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) { + H264Mb *m = s->m; + const int mb_x = m->mb_x; + int index[64]; + + int av_unused last; + int coeff_count = 0; + int node_ctx = 0; + + uint8_t *significant_coeff_ctx_base; + uint8_t *last_coeff_ctx_base; + uint8_t *abs_level_m1_ctx_base; + + /* read coded block flag */ + if( is_dc || cat != 5 ) { + if( get_cabac( c, 
&c->cabac_state[85 + get_cabac_cbf_ctx( hc, s, cat, n, is_dc ) ] ) == 0 ) { + if( !is_dc ) + m->non_zero_count_cache[scan8[n]] = 0; + return; + } + } + + significant_coeff_ctx_base = c->cabac_state + + significant_coeff_flag_offset[0][cat]; + last_coeff_ctx_base = c->cabac_state + + last_coeff_flag_offset[0][cat]; + abs_level_m1_ctx_base = c->cabac_state + + coeff_abs_level_m1_offset[cat]; + + if( !is_dc && cat == 5 ) { +#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \ + for(last= 0; last < coefs; last++) { \ + uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \ + if( get_cabac( c, sig_ctx )) { \ + uint8_t *last_ctx = last_coeff_ctx_base + last_off; \ + index[coeff_count++] = last; \ + if( get_cabac( c, last_ctx ) ) { \ + last= max_coeff; \ + break; \ + } \ + } \ + }\ + if( last == max_coeff -1 ) {\ + index[coeff_count++] = last;\ + }\ + + const uint8_t *sig_off = significant_coeff_flag_offset_8x8[0]; + DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] ); + } else { + DECODE_SIGNIFICANCE( max_coeff - 1, last, last ); + } + assert(coeff_count > 0); + + if( is_dc ) { + if( cat == 0 ) + hc->cbp[mb_x] |= 0x100; + else + hc->cbp[mb_x] |= 0x40 << n; + } else { + if( cat == 5 ) + fill_rectangle(&m->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1); + else { + assert( cat == 1 || cat == 2 || cat == 4 ); + m->non_zero_count_cache[scan8[n]] = coeff_count; + } + } + + do { + uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base; + int j= scantable[index[--coeff_count]]; + + if( get_cabac( c, ctx ) == 0 ) { + node_ctx = coeff_abs_level_transition[0][node_ctx]; + if( is_dc ) { + block[j] = get_cabac_bypass_sign( c, -1); + }else{ + block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6; + } + } else { + int coeff_abs = 2; + ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; + node_ctx = coeff_abs_level_transition[1][node_ctx]; + + while( coeff_abs < 15 && get_cabac( c, ctx ) ) { + coeff_abs++; + } + + 
if( coeff_abs >= 15 ) { + int j = 0; + while( get_cabac_bypass( c ) ) { + j++; + } + + coeff_abs=1; + while( j-- ) { + coeff_abs += coeff_abs + get_cabac_bypass( c ); + } + coeff_abs+= 14; + } + + if( is_dc ) { + block[j] = get_cabac_bypass_sign( c, -coeff_abs ); + }else{ + block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6; + } + } + } while( coeff_count ); + +} + +static void decode_cabac_residual_dc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) { + decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, NULL, max_coeff, 1); +} + +static void decode_cabac_residual_nondc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { + decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, qmul, max_coeff, 0); +} + +/** + * decodes a macroblock + * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed + */ +int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) { + H264Mb *m = s->m; + int mb_x = m->mb_x; + int mb_type, partition_count, cbp = 0; + int dct8x8_allowed= s->pps.transform_8x8_mode; + + fill_decode_neighbors(hc, s); + memset(m->mb, 0 , sizeof(m->mb)); + + if( s->slice_type_nos != FF_I_TYPE ) { + int skip; + /* a skipped mb needs the aff flag from the following mb */ + skip = decode_cabac_mb_skip( hc, s, m, c); + + /* read skip flags */ + if( skip ) { + decode_mb_skip(hc, s); + hc->cbp[mb_x] = m->cbp = 0; + hc->chroma_pred_mode[mb_x] = 0; + s->last_qscale_diff = 0; + return 0; + } + } + + if( s->slice_type_nos == FF_B_TYPE ) { + int ctx = 0; + + if( !IS_DIRECT( m->left_type-1 ) ) + ctx++; + if( !IS_DIRECT( m->top_type-1 ) ) + ctx++; + + if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){ + mb_type= 0; /* B_Direct_16x16 */ + }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) { + mb_type= 1 + 
get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */ + }else{ + int bits; + bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3; + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2; + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1; + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ); + if( bits < 8 ){ + mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */ + }else if( bits == 13 ){ + mb_type= decode_cabac_intra_mb_type(s, c, 32, 0); + goto decode_intra_mb; + }else if( bits == 14 ){ + mb_type= 11; /* B_L1_L0_8x16 */ + }else if( bits == 15 ){ + mb_type= 22; /* B_8x8 */ + }else{ + bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] ); + mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */ + } + } + partition_count= b_mb_type_info[mb_type].partition_count; + mb_type= b_mb_type_info[mb_type].type; + } else if( s->slice_type_nos == FF_P_TYPE ) { + if( get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) { + /* P-type */ + if( get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) { + /* P_L0_D16x16, P_8x8 */ + mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] ); + } else { + /* P_L0_D8x16, P_L0_D16x8 */ + mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] ); + } + partition_count= p_mb_type_info[mb_type].partition_count; + mb_type= p_mb_type_info[mb_type].type; + } else { + mb_type= decode_cabac_intra_mb_type(s, c, 17, 0); + goto decode_intra_mb; + } + } else { + mb_type= decode_cabac_intra_mb_type(s ,c, 3, 1); + if(s->slice_type == FF_SI_TYPE && mb_type) + mb_type--; + assert(s->slice_type_nos == FF_I_TYPE); +decode_intra_mb: + partition_count = 0; + cbp= i_mb_type_info[mb_type].cbp; + m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode; + mb_type= i_mb_type_info[mb_type].type; + } + + if(IS_INTRA_PCM(mb_type)) { + uint8_t *ptr; + // We assume these blocks are very rare so we do not optimize it. 
+ // FIXME The two following lines get the bitstream position in the cabac + // decode, I think it should be done by a function in cabac.h (or cabac.c). + ptr=c->bytestream; + if(c->low&0x1) ptr--; + if(CABAC_BITS==16){ + if(c->low&0x1FF) ptr--; + } + if ((unsigned) (ptr + 384) >= (unsigned) c->bytestream_end){ + fprintf(stderr, "Intra PCM mb crossed bytestream buffer\n Known issue."); + } + + // The pixels are stored in the same order as levels in h->mb array. + memcpy(m->mb, ptr, 256); ptr+=256; + memcpy(m->mb+128, ptr, 128); ptr+=128; + + c->bytestream = ptr; + #if CABAC_BITS == 16 + c->low = (*c->bytestream++)<<18; + c->low+= (*c->bytestream++)<<10; + #else + c->low = (*c->bytestream++)<<10; + #endif + c->low+= ((*c->bytestream++)<<2) + 2; + c->range= 0x1FE; + + // All blocks are present + hc->cbp[mb_x] = 0x1ef; + hc->chroma_pred_mode[mb_x] = 0; + // In deblocking, the quantizer is 0 + hc->qscale[mb_x]= 0; + // All coeffs are present + memset(hc->non_zero_count[mb_x], 16, 32); + hc->mb_type[mb_x]= m->mb_type = mb_type; + s->last_qscale_diff = 0; + fill_filter_caches(hc, s, mb_type); + return 0; + } + fill_decode_caches(hc, s, mb_type); + + if( IS_INTRA( mb_type ) ) { + int i, pred_mode; + if( IS_INTRA4x4( mb_type ) ) { + if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] ) ) { + mb_type |= MB_TYPE_8x8DCT; + for( i = 0; i < 16; i+=4 ) { + int pred = pred_intra_mode( s, i ); + int mode = decode_cabac_mb_intra4x4_pred_mode(c, pred ); + fill_rectangle( &m->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 ); + } + } else { + for( i = 0; i < 16; i++ ) { + int pred = pred_intra_mode( s, i ); + m->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode(c, pred ); + } + } + write_back_intra_pred_mode(hc, s); + if( check_intra4x4_pred_mode(s) < 0 ) return -1; + } else { + m->intra16x16_pred_mode= check_intra_pred_mode(s, m->intra16x16_pred_mode ); + if( m->intra16x16_pred_mode < 0 ) return -1; + } + + 
hc->chroma_pred_mode[mb_x] = + pred_mode = decode_cabac_mb_chroma_pre_mode( hc, s, c ); + + pred_mode= check_intra_pred_mode( s, pred_mode ); + if( pred_mode < 0 ) return -1; + m->chroma_pred_mode= pred_mode; + + } else if( partition_count == 4 ) { + int i, j, sub_partition_count[4], list, ref[2][4]; + + if( s->slice_type_nos == FF_B_TYPE ) { + for( i = 0; i < 4; i++ ) { + m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c ); + sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count; + m->sub_mb_type[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].type; + } + if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] | + m->sub_mb_type[2] | m->sub_mb_type[3]) ) { + ff_h264_pred_direct_motion(hc, s, &mb_type); + m->ref_cache[0][scan8[4]] = + m->ref_cache[1][scan8[4]] = + m->ref_cache[0][scan8[12]] = + m->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE; + for( i = 0; i < 4; i++ ) + fill_rectangle( &hc->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 ); + } + } else { + for( i = 0; i < 4; i++ ) { + m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c ); + sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count; + m->sub_mb_type[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].type; + } + } + + for( list = 0; list < s->list_count; list++ ) { + for( i = 0; i < 4; i++ ) { + if(IS_DIRECT(m->sub_mb_type[i])) continue; + if(IS_DIR(m->sub_mb_type[i], 0, list)){ + if( s->ref_count[list] > 1 ){ + ref[list][i] = decode_cabac_mb_ref(hc, s, c, list, 4*i ); + if(ref[list][i] >= s->ref_count[list]){ + fprintf(stderr, "Reference %d >= %d\n", ref[list][i], s->ref_count[list]); + return -1; + } + }else + ref[list][i] = 0; + } else { + ref[list][i] = -1; + } + m->ref_cache[list][ scan8[4*i]+1 ]= + m->ref_cache[list][ scan8[4*i]+8 ]=m->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i]; + } + } + + if(dct8x8_allowed) + dct8x8_allowed = get_dct8x8_allowed(s); + + for(list=0; listlist_count; list++){ + for(i=0; i<4; i++){ + m->ref_cache[list][ 
scan8[4*i] ]=m->ref_cache[list][ scan8[4*i]+1 ]; + if(IS_DIRECT(m->sub_mb_type[i])){ + fill_rectangle(hc->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2); + continue; + } + + if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){ + const int sub_mb_type= m->sub_mb_type[i]; + const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1; + for(j=0; jmv_cache[list][ scan8[index]]; + uint8_t (* mvd_cache)[2]= &hc->mvd_cache[list][ scan8[index]]; + pred_motion(s, index, block_width, list, m->ref_cache[list][ scan8[index] ], &mx, &my); + DECODE_CABAC_MB_MVD( hc, c, list, index) + + if(IS_SUB_8X8(sub_mb_type)){ + mv_cache[ 1 ][0]= + mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx; + mv_cache[ 1 ][1]= + mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my; + + mvd_cache[ 1 ][0]= + mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx; + mvd_cache[ 1 ][1]= + mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy; + }else if(IS_SUB_8X4(sub_mb_type)){ + mv_cache[ 1 ][0]= mx; + mv_cache[ 1 ][1]= my; + + mvd_cache[ 1 ][0]= mpx; + mvd_cache[ 1 ][1]= mpy; + }else if(IS_SUB_4X8(sub_mb_type)){ + mv_cache[ 8 ][0]= mx; + mv_cache[ 8 ][1]= my; + + mvd_cache[ 8 ][0]= mpx; + mvd_cache[ 8 ][1]= mpy; + } + mv_cache[ 0 ][0]= mx; + mv_cache[ 0 ][1]= my; + + mvd_cache[ 0 ][0]= mpx; + mvd_cache[ 0 ][1]= mpy; + } + }else{ + fill_rectangle(m->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4); + fill_rectangle(hc->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2); + } + } + } + } else if( IS_DIRECT(mb_type) ) { + ff_h264_pred_direct_motion(hc, s, &mb_type); + fill_rectangle(hc->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2); + fill_rectangle(hc->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2); + dct8x8_allowed &= s->direct_8x8_inference_flag; + } else { + int list, i; + if(IS_16X16(mb_type)){ + for(list=0; listlist_count; list++){ + if(IS_DIR(mb_type, 0, list)){ + int ref; + if(s->ref_count[list] > 1){ + ref= decode_cabac_mb_ref(hc, s, c, list, 0); + if(ref >= s->ref_count[list]){ + fprintf(stderr, "Reference %d >= %d\n", ref, 
s->ref_count[list]); + return -1; + } + }else + ref=0; + fill_rectangle(&m->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1); + } + } + for(list=0; listlist_count; list++){ + if(IS_DIR(mb_type, 0, list)){ + int mx,my,mpx,mpy; + pred_motion(s, 0, 4, list, m->ref_cache[list][ scan8[0] ], &mx, &my); + DECODE_CABAC_MB_MVD( hc, c, list, 0) + + fill_rectangle(hc->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2); + fill_rectangle(m->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4); + } + + } + } + else if(IS_16X8(mb_type)){ + for(list=0; listlist_count; list++){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ + int ref; + if(s->ref_count[list] > 1){ + ref= decode_cabac_mb_ref(hc, s, c, list, 8*i ); + if(ref >= s->ref_count[list]){ + fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]); + return -1; + } + }else + ref=0; + fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1); + }else + fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1); + } + } + for(list=0; listlist_count; list++){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ + int mx,my,mpx,mpy; + pred_16x8_motion(s, 8*i, list, m->ref_cache[list][scan8[0] + 16*i], &mx, &my); + DECODE_CABAC_MB_MVD( hc, c, list, 8*i) + + fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2); + fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4); + }else{ + fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2); + fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4); + } + } + } + }else{ + assert(IS_8X16(mb_type)); + for(list=0; listlist_count; list++){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ //FIXME optimize + int ref; + if(s->ref_count[list] > 1){ + ref= decode_cabac_mb_ref(hc, s, c, list, 4*i ); + if(ref >= s->ref_count[list]){ + fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]); + return -1; + } + }else + ref=0; + 
fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1); + }else + fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1); + } + } + for(list=0; listlist_count; list++){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ + int mx,my,mpx,mpy; + pred_8x16_motion( s, i*4, list, m->ref_cache[list][ scan8[0] + 2*i ], &mx, &my); + DECODE_CABAC_MB_MVD( hc, c, list, 4*i) + + fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2); + fill_rectangle(m->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4); + }else{ + fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2); + fill_rectangle(m-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4); + } + } + } + } + } + + if( IS_INTER( mb_type ) ) { + hc->chroma_pred_mode[mb_x] = 0; + write_back_motion( hc, s, mb_type ); + } + + if( !IS_INTRA16x16( mb_type ) ) { + cbp = decode_cabac_mb_cbp_luma( hc, c); + cbp |= decode_cabac_mb_cbp_chroma( hc, c ) << 4; + } + + hc->cbp[mb_x] = m->cbp = cbp; + if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) { + mb_type |= MB_TYPE_8x8DCT * get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] ); + } + + if( cbp || IS_INTRA16x16( mb_type ) ) { + const uint8_t *scan, *scan8x8, *dc_scan; + const uint32_t *qmul; + + if (s->transform_bypass && s->qscale){ + scan8x8= ff_zigzag_direct; + scan= zigzag_scan; + }else{ + scan8x8= hc->zigzag_scan8x8; + scan= hc->zigzag_scan; + } + dc_scan= luma_dc_zigzag_scan; + + // decode_cabac_mb_dqp + if(get_cabac_noinline(c, &c->cabac_state[60 + (s->last_qscale_diff != 0)])){ + int val = 1; + int ctx= 2; + + while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) { + ctx= 3; + val++; + if(val > 102){ //prevent infinite loop + fprintf(stderr, "cabac decode of qscale diff failed at %d %d (%d)\n", m->mb_x, m->mb_y, val); + return -1; + } + } + + if( val&0x01 ) + val= (val + 1)>>1 ; + else + val= -((val + 1)>>1); + s->last_qscale_diff = val; + 
s->qscale += val; + if(((unsigned)s->qscale) > 51){ + if(s->qscale<0) s->qscale+= 52; + else s->qscale-= 52; + } + s->chroma_qp[0] = s->pps.chroma_qp_table[0][s->qscale]; + s->chroma_qp[1] = s->pps.chroma_qp_table[1][s->qscale]; + }else + s->last_qscale_diff=0; + + if( IS_INTRA16x16( mb_type ) ) { + int i; + decode_cabac_residual_dc( hc, s, c, m->mb, 0, 0, dc_scan, 16); + + if( cbp&15 ) { + qmul = hc->dequant4_coeff[0][s->qscale]; + for( i = 0; i < 16; i++ ) { + decode_cabac_residual_nondc( hc, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15); + } + } else { + fill_rectangle(&m->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1); + } + } else { + int i8x8, i4x4; + for( i8x8 = 0; i8x8 < 4; i8x8++ ) { + if( cbp & (1<mb + 64*i8x8, 5, 4*i8x8, + scan8x8, hc->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64); + } else { + qmul = hc->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale]; + for( i4x4 = 0; i4x4 < 4; i4x4++ ) { + const int index = 4*i8x8 + i4x4; +//START_TIMER + decode_cabac_residual_nondc(hc, s, c, m->mb + 16*index, 2, index, scan, qmul, 16); +//STOP_TIMER("decode_residual") + } + } + } else { + uint8_t * const nnz= &m->non_zero_count_cache[ scan8[4*i8x8] ]; + nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0; + } + } + } + + if( cbp&0x30 ){ + int i; + for( i = 0; i < 2; i++ ) { + decode_cabac_residual_dc(hc, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4); + } + } + + if( cbp&0x20 ) { + int i, j; + for( i = 0; i < 2; i++ ) { + qmul = hc->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 
0:3)][s->chroma_qp[i]]; + for( j = 0; j < 4; j++ ) { + const int index = 16 + 4 * i + j; + decode_cabac_residual_nondc( hc, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15); + } + } + } else { + uint8_t * const nnz= &m->non_zero_count_cache[0]; + nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = + nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; + } + } else { + uint8_t * const nnz= &m->non_zero_count_cache[0]; + fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1); + nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = + nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; + s->last_qscale_diff = 0; + } + hc->mb_type[mb_x]= m->mb_type = mb_type; + hc->qscale[mb_x]= s->qscale; + write_back_non_zero_count(hc, s); + fill_filter_caches(hc, s, mb_type); + + return 0; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_cabac_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_cabac_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,17 @@ +#ifndef H264_CABAC_H +#define H264_CABAC_H + +#define CELL_SPE +#include "libavcodec/avcodec.h" +#include "h264_types_spu.h" +#include "cabac_spu.h" + + +/** + * decodes a CABAC coded macroblock + * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed + */ +int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c); +void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_chroma_template_spu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_chroma_template_spu.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,355 @@ +static void PREFIX_h264_chroma_mc8_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) { + + register int i; + + const int16_t i32ss= 32; + const int16_t imax = 255; + const int16_t iABCD1 = ((8 - x) * (8 - y)); + 
const int16_t iABCD2 = ((x) * (8 - y)); + const int16_t iABCD3 = ((8 - x) * (y)); + const int16_t iABCD4 = ((x) * (y)); + + const vsint16_t vA = spu_splats(iABCD1); + const vsint16_t vB = spu_splats(iABCD2); + const vsint16_t vC = spu_splats(iABCD3); + const vsint16_t vD = spu_splats(iABCD4); + const vsint32_t vzero = spu_splats(0); + const vsint16_t v32ss = spu_splats(i32ss); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const int shift_src =(unsigned int) src & 15; + const int shift_dst =(unsigned int) dst & 15; + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; + vuint8_t dstmask; + + if(shift_dst==0) + dstmask=dstmask0; + else + dstmask=dstmask8; + + vuint8_t vsrc0uc1; + vuint8_t vsrc0uc2; + vuint8_t vsrc0uc; + vuint8_t vsrc1uc; + vsrc0uc1 = *(vuint8_t *)(src); + vsrc0uc2 = *(vuint8_t *)(src+16); + vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16)); + vsrc1uc = spu_slqwbyte(vsrc0uc, 1); + + vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh); + vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh); + + for (i = 0 ; i < h ; i++) { + + vuint8_t vsrc2uc1; + vuint8_t vsrc2uc2; + vuint8_t vsrc2uc; + vuint8_t vsrc3uc; + vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C); + vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16); + vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16)); + vsrc3uc = spu_slqwbyte(vsrc2uc, 1); + + vsint16_t vsrc2ssH = 
(vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh); + vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh); + + vsint16_t psum; + + vsint32_t psum1 = spu_mule(vsrc0ssH, vA); + vsint32_t psum2 = spu_mulo(vsrc0ssH, vA); + psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + + psum1 = spu_mule(vsrc1ssH, vB); + psum2 = spu_mulo(vsrc1ssH, vB); + vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + psum = spu_add(psum3, psum); + + psum1 = spu_mule(vsrc2ssH, vC); + psum2 = spu_mulo(vsrc2ssH, vC); + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + psum = spu_add(psum3, psum); + + psum1 = spu_mule(vsrc3ssH, vD); + psum2 = spu_mulo(vsrc3ssH, vD); + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + psum = spu_add(psum3, psum); + + psum = spu_add(v32ss, psum); + psum = spu_rlmask(psum, -6); + + //Saturation from 0 to 255 + sat = spu_cmpgt(psum,(vsint16_t)vzero); + psum = spu_and(psum,(vsint16_t)sat); + sat = spu_cmpgt(psum,vmax); + psum = spu_sel(psum,vmax,sat); + + const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu); + + const vuint8_t dst1 = *(vuint8_t *)dst; + + const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask); + vuint8_t fsum; + OP_U8_SPU(fsum, dsum, dst1); + + *(vuint8_t *)dst=fsum; + + vsrc0ssH = vsrc2ssH; + vsrc1ssH = vsrc3ssH; + + dst += dst_stride; + //src += src_stride; + src += STRIDE_C; + } +} + +static void PREFIX_h264_chroma_mc4_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) { + + register int i; + + const int16_t i32ss= 32; + const int16_t imax = 255; + const int16_t iABCD1 = ((8 - x) * (8 - y)); + const int16_t iABCD2 = ((x) * (8 - y)); + const int16_t iABCD3 = ((8 - x) * (y)); + const int16_t iABCD4 = ((x) * (y)); + + const vsint16_t vA = spu_splats(iABCD1); + const vsint16_t vB = spu_splats(iABCD2); + const vsint16_t vC = spu_splats(iABCD3); + const vsint16_t vD = spu_splats(iABCD4); + 
const vsint32_t vzero = spu_splats(0); + const vsint16_t v32ss = spu_splats(i32ss); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + + const int shift_src = (unsigned int) src & 15; + const int shift_dst = (unsigned int) dst & 15; + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; + + switch(shift_dst){ + case 0: dstmask = dstmask0; + break; + case 4: dstmask = dstmask4; + break; + case 8: dstmask = dstmask8; + break; + case 12: dstmask = dstmask12; + break; + } + + vuint8_t vsrc0uc1; + vuint8_t vsrc0uc2; + vuint8_t vsrc0uc; + vuint8_t vsrc1uc; + vsrc0uc1 = *(vuint8_t *)(src); + vsrc0uc2 = *(vuint8_t *)(src+16); + vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16)); + vsrc1uc = spu_slqwbyte(vsrc0uc, 1); + + vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh); + vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh); + + for (i = 0 ; i < h ; i++) { + + vuint8_t vsrc2uc1; + vuint8_t vsrc2uc2; + vuint8_t vsrc2uc; + vuint8_t vsrc3uc; + vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C); + vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16); + vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, 
shift_src-16)); + vsrc3uc = spu_slqwbyte(vsrc2uc, 1); + + vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh); + vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh); + + vsint16_t psum; + + vsint32_t psum1 = spu_mule(vsrc0ssH, vA); + vsint32_t psum2 = spu_mulo(vsrc0ssH, vA); + psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + + psum1 = spu_mule(vsrc1ssH, vB); + psum2 = spu_mulo(vsrc1ssH, vB); + vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + psum = spu_add(psum3, psum); + + psum1 = spu_mule(vsrc2ssH, vC); + psum2 = spu_mulo(vsrc2ssH, vC); + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + psum = spu_add(psum3, psum); + + psum1 = spu_mule(vsrc3ssH, vD); + psum2 = spu_mulo(vsrc3ssH, vD); + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + psum = spu_add(psum3, psum); + + psum = spu_add(v32ss, psum); + psum = spu_rlmask(psum, -6); + + //Saturation from 0 to 255 + sat = spu_cmpgt(psum,(vsint16_t)vzero); + psum = spu_and(psum,(vsint16_t)sat); + sat = spu_cmpgt(psum,vmax); + psum = spu_sel(psum,vmax,sat); + + const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu); + + const vuint8_t dst1 = *(vuint8_t *)dst; + + const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask); + vuint8_t fsum; + OP_U8_SPU(fsum, dsum, dst1); + + *(vuint8_t *)dst=fsum; + + vsrc0ssH = vsrc2ssH; + vsrc1ssH = vsrc3ssH; + + dst += dst_stride; + src += STRIDE_C; + } +} +/* 2-pixel-wide chroma interpolation over h rows. Each output is the weighted sum of the four neighbouring source bytes (weights (8-x)(8-y), x(8-y), (8-x)y and xy for s[0], s[1], s[STRIDE_C] and s[STRIDE_C+1]), plus 32, shifted right by 6 and saturated to 0..255. Source rows advance by the fixed STRIDE_C; only dst uses dst_stride. The 2 result bytes are merged into the 16-byte vector loaded from dst at byte offset (dst & 15); merge masks exist only for the even offsets 0..14, an odd offset leaves dstmask all-zero. OP_U8_SPU is defined outside this chunk and yields the vector that is stored back. */ +static void PREFIX_h264_chroma_mc2_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) { + + register int i; + + const int16_t i32ss= 32; + const int16_t imax = 255; + const int16_t iABCD1 = ((8 - x) * (8 - y)); + const int16_t iABCD2 = ((x) * (8 - y)); + const int16_t iABCD3 = ((8 - x) * (y)); + const int16_t iABCD4 = ((x) * (y)); + + const vsint16_t vA = spu_splats(iABCD1); + const vsint16_t vB = spu_splats(iABCD2); + const vsint16_t vC =
spu_splats(iABCD3); + const vsint16_t vD = spu_splats(iABCD4); + const vsint32_t vzero = spu_splats(0); + const vsint16_t v32ss = spu_splats(i32ss); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + + const int shift_src = (unsigned int) src & 15; + const int shift_dst = (unsigned int) dst & 15; + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const vuint8_t dstmask0= {0x10,0x11,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dstmask2= {0x00,0x01,0x10,0x11,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dstmask6= {0x00,0x01,0x02,0x03,0x04,0x05,0x10,0x11,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dstmask10= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x10,0x11,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x0E,0x0F}; + const vuint8_t dstmask14= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x10,0x11}; + + switch(shift_dst){ + case 0: dstmask = dstmask0; + break; + case 2: dstmask = dstmask2; + break; + case 4: dstmask = dstmask4; + break; + case 6: dstmask = dstmask6; + break; + case 8: dstmask = dstmask8; + break; + case 10: dstmask = dstmask10; + break; + case 12: dstmask = dstmask12; + break; + case 14: dstmask = dstmask14; + break; + } + + vuint8_t vsrc0uc1; + vuint8_t vsrc0uc2; + vuint8_t 
vsrc0uc; + vuint8_t vsrc1uc; + vsrc0uc1 = *(vuint8_t *)(src); + vsrc0uc2 = *(vuint8_t *)(src+16); + vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16)); + vsrc1uc = spu_slqwbyte(vsrc0uc, 1); + + vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh); + vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh); + + for (i = 0 ; i < h ; i++) { + + vuint8_t vsrc2uc1; + vuint8_t vsrc2uc2; + vuint8_t vsrc2uc; + vuint8_t vsrc3uc; + vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C); + vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16); + vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16)); + vsrc3uc = spu_slqwbyte(vsrc2uc, 1); + + vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh); + vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh); + + vsint16_t psum; + + vsint32_t psum1 = spu_mule(vsrc0ssH, vA); + vsint32_t psum2 = spu_mulo(vsrc0ssH, vA); + psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + + psum1 = spu_mule(vsrc1ssH, vB); + psum2 = spu_mulo(vsrc1ssH, vB); + vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + psum = spu_add(psum3, psum); + + psum1 = spu_mule(vsrc2ssH, vC); + psum2 = spu_mulo(vsrc2ssH, vC); + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + psum = spu_add(psum3, psum); + + psum1 = spu_mule(vsrc3ssH, vD); + psum2 = spu_mulo(vsrc3ssH, vD); + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); + psum = spu_add(psum3, psum); + + psum = spu_add(v32ss, psum); + psum = spu_rlmask(psum, -6); + + //Saturation from 0 to 255 + sat = spu_cmpgt(psum,(vsint16_t)vzero); + psum = spu_and(psum,(vsint16_t)sat); + sat = spu_cmpgt(psum,vmax); + psum = spu_sel(psum,vmax,sat); + + const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu); + + const vuint8_t dst1 = *(vuint8_t *)dst; + + const vuint8_t dsum = spu_shuffle(dst1, 
ppsum, dstmask); + vuint8_t fsum; + OP_U8_SPU(fsum, dsum, dst1); + + *(vuint8_t *)dst=fsum; + + vsrc0ssH = vsrc2ssH; + vsrc1ssH = vsrc3ssH; + + dst += dst_stride; + src += STRIDE_C; + } +} + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_deblock_spu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_deblock_spu.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2009 TUDelft + * + * Cell Parallel SPU - 2DWave Macroblock Decoding. + */ + +/** + * @file libavcodec/cell/h264_deblock_spu.c + * Cell Parallel SPU - 2DWave Macroblock Decoding + * @author C C Chi + * + * SIMD kernels + * H.264/AVC motion compensation + * @author Mauricio Alvarez + * @author Albert Paradis + */ + +#include "h264_deblock_spu.h" +#include "h264_decode_mb_spu.h" + +extern int print_debug; +/* Deblock a vertical luma edge (used for dir == 0) through the h264_h_loop_filter_luma* DSP hooks. alpha and beta are looked up from qp plus the slice offsets and nothing is filtered when either is 0. bS[0] < 4 selects the tc0-driven filter, with one tc0 value per entry of bS; otherwise the _intra variant is called. */ +static void filter_mb_edgev( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { + H264slice *s= h->s; + const int index_a = qp + s->slice_alpha_c0_offset; + const int alpha = alpha_table[index_a]; + const int beta = beta_table[qp + s->slice_beta_offset]; + if (alpha ==0 || beta == 0) return; + + if( bS[0] < 4 ) { + int8_t tc[4]; + tc[0] = tc0_table[index_a][bS[0]]; + tc[1] = tc0_table[index_a][bS[1]]; + tc[2] = tc0_table[index_a][bS[2]]; + tc[3] = tc0_table[index_a][bS[3]]; + + h->dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc); + } else { + h->dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta); + } +} +/* Chroma counterpart of filter_mb_edgev: same alpha/beta lookup and early exit, but each tc value is tc0 plus 1 and the h264_h_loop_filter_chroma* DSP hooks are called. */ +static void filter_mb_edgecv( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { + H264slice *s= h->s; + const int index_a = qp + s->slice_alpha_c0_offset; + const int alpha = alpha_table[index_a]; + const int beta = beta_table[qp + s->slice_beta_offset]; + if (alpha ==0 || beta == 0) return; + + if( bS[0] < 4 ) { + int8_t tc[4]; + + tc[0] = tc0_table[index_a][bS[0]]+1; + tc[1] = tc0_table[index_a][bS[1]]+1; + tc[2] = tc0_table[index_a][bS[2]]+1; + tc[3] = tc0_table[index_a][bS[3]]+1; +
h->dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc); + } else { + h->dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta); + } +} +/* Deblock a horizontal luma edge (used for dir == 1): same alpha/beta lookup, early exit and bS[0] < 4 selection as filter_mb_edgev, but through the h264_v_loop_filter_luma* DSP hooks. */ +static void filter_mb_edgeh( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { + H264slice *s= h->s; + const int index_a = qp + s->slice_alpha_c0_offset; + const int alpha = alpha_table[index_a]; + const int beta = beta_table[qp + s->slice_beta_offset]; + if (alpha ==0 || beta == 0) return; + + if( bS[0] < 4 ) { + int8_t tc[4]; + + tc[0] = tc0_table[index_a][bS[0]]; + tc[1] = tc0_table[index_a][bS[1]]; + tc[2] = tc0_table[index_a][bS[2]]; + tc[3] = tc0_table[index_a][bS[3]]; + + h->dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc); + } else { + h->dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta); + } +} +/* Chroma counterpart of filter_mb_edgeh: each tc value is tc0 plus 1 and the h264_v_loop_filter_chroma* DSP hooks are called. */ +static void filter_mb_edgech( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { + H264slice *s= h->s; + const int index_a = qp + s->slice_alpha_c0_offset; + const int alpha = alpha_table[index_a]; + const int beta = beta_table[qp + s->slice_beta_offset]; + if (alpha ==0 || beta == 0) return; + + if( bS[0] < 4 ) { + int8_t tc[4]; + + tc[0] = tc0_table[index_a][bS[0]]+1; + tc[1] = tc0_table[index_a][bS[1]]+1; + tc[2] = tc0_table[index_a][bS[2]]+1; + tc[3] = tc0_table[index_a][bS[3]]+1; + + h->dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc); + } else { + h->dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta); + } +} +/* Filter the current macroblock's edges in one direction: dir 0 handles the left edge and the inner vertical edges, dir 1 the top edge and the inner horizontal edges. Edge 0 is filtered only when the neighbouring macroblock type is non-zero and uses the rounded average of both macroblocks' QPs; inner edges use the current macroblock's QP. Boundary strengths are read from mb->bS; the bS computation kept below is commented out. */ +static void filter_mb_dir(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int dir) { + H264Mb *mb = h->mb; + H264slice *s = h->s; + const int qp_xy= mb->qscale_mb_xy; + const int qp_dir = dir == 0 ? mb->qscale_left_mb_xy : mb->qscale_top_mb_xy; + const int mbm_type = dir == 0 ?
mb->left_type : mb->top_type; + const int mb_type = mb->mb_type; + int edge; + const int edges = mb->edges[dir]; + //int (*ref2frm)[64] = s->ref2frm; + +// int start;//= h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0; +// +// const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP)) +// == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4; +// // how often to recheck mv-based bS when iterating between edges +// const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 : +// (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0; +// // how often to recheck mv-based bS when iterating along each edge +// const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); + +// if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0)) +// start =1; +// else +// start =0; +// +// /* Calculate bS */ +// for( edge = start; edge < edges; edge++ ) { +// const int mbn_type = edge > 0 ? mb_type : mbm_type; +// const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm; +// int (*ref2frmn)[64] = ref2frm;//edge > 0 ? ref2frm : ref2frmm; +// int16_t bS[4]; +// int qp; +// +// if( (edge&1) && IS_8x8DCT(mb_type) ) +// continue; +// +// if( IS_INTRA(mb_type) || +// IS_INTRA(mbn_type) ) { +// int value; +// +// if (edge == 0) { +// value = 4; +// } else { +// value = 3; +// } +// bS[0] = bS[1] = bS[2] = bS[3] = value; +// } else { +// int i, l; +// int mv_done; +// +// if( edge & mask_edge ) { +// +// bS[0] = bS[1] = bS[2] = bS[3] = 0; +// mv_done = 1; +// } +// else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { +// int b_idx= 8 + 4 + edge * (dir ? 8:1); +// int bn_idx= b_idx - (dir ? 
8:1); +// int v = 0; +// +// for( l = 0; !v && l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) { +// v |= ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] || +// FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || +// FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit; +// } +// bS[0] = bS[1] = bS[2] = bS[3] = v; +// +// mv_done = 1; +// } +// else +// mv_done = 0; +// +// for( i = 0; i < 4; i++ ) { +// int x = dir == 0 ? edge : i; +// int y = dir == 0 ? i : edge; +// int b_idx= 8 + 4 + x + 8*y; +// int bn_idx= b_idx - (dir ? 8:1); +// +// if( mb->non_zero_count_cache[b_idx] | +// mb->non_zero_count_cache[bn_idx] ) { +// bS[i] = 2; +// } +// else if(!mv_done) +// { +// bS[i] = 0; +// for( l = 0; l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) { +// if( ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] || +// FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || +// FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) { +// bS[i] = 1; +// break; +// } +// } +// } +// } +// +// if(bS[0]+bS[1]+bS[2]+bS[3] == 0) +// continue; +// } +// qp = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1; + + if(mbm_type){ + int16_t* bS=mb->bS[dir][0]; + /* Filter edge */ + // Do not use s->qscale as luma quantizer because it has not the same + // value in IPCM macroblocks. 
+ if(bS[0]+bS[1]+bS[2]+bS[3]){ + int qp = ( qp_xy + qp_dir + 1 ) >> 1; + if( dir == 0 ) { + filter_mb_edgev(h, &img_y[0], linesize, bS, qp); + { + int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; + filter_mb_edgecv(h, &img_cb[0], uvlinesize, bS, qp); + filter_mb_edgecv(h, &img_cr[0], uvlinesize, bS, ( get_chroma_qp(s, 1, qp_xy) + get_chroma_qp( s, 1, qp_dir) + 1 ) >> 1); /* Cr takes its QP from chroma table 1, as on the inner edges */ + } + } else { + filter_mb_edgeh(h, &img_y[0], linesize, bS, qp); + { + int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; + filter_mb_edgech(h, &img_cb[0], uvlinesize, bS, qp); + filter_mb_edgech(h, &img_cr[0], uvlinesize, bS, ( get_chroma_qp(s, 1, qp_xy) + get_chroma_qp( s, 1, qp_dir) + 1 ) >> 1); /* Cr takes its QP from chroma table 1, as on the inner edges */ + } + } + } + } + /* Inner edges: luma at 4*edge, chroma only on even edges at 2*edge, all with the current macroblock's QP. */ + for( edge = 1; edge < edges; edge++ ) { + int16_t* bS=mb->bS[dir][edge]; + int qp = qp_xy; + + if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) + continue; + + /* Filter edge */ + // Do not use s->qscale as luma quantizer because it has not the same + // value in IPCM macroblocks. + + if(bS[0]+bS[1]+bS[2]+bS[3] == 0) + continue; + + if( dir == 0 ) { + filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp ); + if( (edge&1) == 0 ) { + filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) ); + filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) ); + } + } else { + filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp ); + if( (edge&1) == 0 ) { + filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) ); + filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) ); + } + } + } +} +/* Deblock one macroblock: first the dir 0 pass, then the dir 1 pass of filter_mb_dir. */ +void filter_mb( H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) { + filter_mb_dir(h, img_y, img_cb, img_cr, linesize, uvlinesize, 0); + filter_mb_dir(h, img_y, img_cb, img_cr, linesize, uvlinesize, 1); +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_deblock_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++
b/libavcodec/cell/h264_deblock_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,80 @@ +#ifndef H264_FILTER_SPU_H +#define H264_FILTER_SPU_H + +#include "types_spu.h" +#include "h264_decode_mb_spu.h" + +#define FFABS(a) ((a) >= 0 ? (a) : (-(a))) + +void filter_mb(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize); + +/* Deblocking filter (p153) */ +static const uint8_t alpha_table[52*3] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 4, 5, 6, + 7, 8, 9, 10, 12, 13, 15, 17, 20, 22, + 25, 28, 32, 36, 40, 45, 50, 56, 63, 71, + 80, 90,101,113,127,144,162,182,203,226, + 255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255, +}; + +static const uint8_t beta_table[52*3] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, + 3, 3, 3, 4, 4, 4, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, + 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, + 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, +}; + +static const uint8_t tc0_table[52*3][4] = { + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 
}, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 }, + {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 }, + {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, + {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 }, + {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 }, + {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 }, + {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + 
{-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, +}; +/* Chroma QP for luma QP qscale from the slice's table t; the deblocking code indexes table 0 for Cb and table 1 for Cr. */ +static inline int get_chroma_qp(H264slice *s, int t, int qscale){ + return s->chroma_qp_table[t][qscale]; +} + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_decode_mb_spu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_decode_mb_spu.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2009 TUDelft + * + * Cell Parallel SPU - 2DWave Macroblock Decoding. + */ + +/** + * @file libavcodec/cell/h264_decode_mb_spu.c + * Cell Parallel SPU - 2DWave Macroblock Decoding + * @author C C Chi + * + * SIMD kernels + * H.264/AVC motion compensation + * @author Mauricio Alvarez + * @author Albert Paradis + */ + +#include +#include +#include +//#include "dsputil_cell.h" +#include "types_spu.h" +#include "h264_tables.h" +#include "h264_dma.h" +#include "h264_mc_spu.h" +#include "h264_intra_spu.h" +#include "h264_decode_mb_spu.h" +#include "h264_deblock_spu.h" + +//border buffers +DECLARE_ALIGNED_16(TopBorder, top_ls[240]); +LeftBorder left_ls; + +//mb line buffer - statically allocated for up to 1920 width video +DECLARE_ALIGNED_16(uint8_t, dest_y_ls[2*16*20]); +DECLARE_ALIGNED_16(uint8_t, dest_cb_ls[2*8*10]); +DECLARE_ALIGNED_16(uint8_t, dest_cr_ls[2*8*10]); + +//dma transfer buffer +DECLARE_ALIGNED_16(uint8_t, dma_y_ls [64*(32+20)]); //EDGE_WIDTH = 32 +DECLARE_ALIGNED_16(uint8_t, dma_cb_ls[32*(16+10)]); +DECLARE_ALIGNED_16(uint8_t, dma_cr_ls[32*(16+10)]); + +DECLARE_ALIGNED_16(uint8_t, extra_edge_y [32*(32+20)]); //EDGE_WIDTH = 32 +DECLARE_ALIGNED_16(uint8_t, extra_edge_cr[16*(16+10)]); +DECLARE_ALIGNED_16(uint8_t, extra_edge_cb[16*(16+10)]); + + +// For intra mode +/// for now do the extra copy before dma, but it's better to skip this and do the dma
right away +static void backup_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ + H264Mb* mb= h->mb; + /* Saves the unfiltered samples kept for intra mode: the block's right-most column goes to left_ls (entry 0 is the previous top border's last sample) and its bottom row to top_ls[mb->mb_x], for luma (16 wide) and both chroma planes (8 wide). */ + int i; + uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y; + uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb; + uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr; + + uint8_t* left_border_y = left_ls.unfiltered_y; + uint8_t* left_border_cb = left_ls.unfiltered_cb; + uint8_t* left_border_cr = left_ls.unfiltered_cr; + + src_y -= linesize; + src_cb -= uvlinesize; + src_cr -= uvlinesize; + + // There are two lines saved, the line above the top macroblock of a pair, + // and the line above the bottom macroblock + left_border_y[0] = top_border_y[15]; + for(i=1; i<17; i++){ + left_border_y[i] = src_y[15+i* linesize]; + } + + *(qword*)(top_border_y)= *(qword*)(src_y + 16*linesize); + + left_border_cb[0] = top_border_cb[7]; + left_border_cr[0] = top_border_cr[7]; + for(i=1; i<9; i++){ + left_border_cb[i] = src_cb[7+i*uvlinesize]; + left_border_cr[i] = src_cr[7+i*uvlinesize]; + } + *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize); + *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize); +} +/* Exchanges the samples just left of and above the block (and, for luma, 8 samples above-right when a right neighbour exists) with the saved unfiltered borders in left_ls / top_ls. The picture always receives the saved value; the border buffer gets the picture's old value back only where XCHG's last argument is non-zero (the xchg flag, or the hard-coded 1). Left samples are touched only when mb_x > 0 and top samples only when mb_y > 0. */ +static void xchg_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ + H264Mb* mb= h->mb; + H264slice* s = h->s; + + int temp8, i; + uint64_t temp64; + int deblock_left; + int deblock_top; + + uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y; + uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb; + uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr; + uint8_t* top_border_y_next = top_ls[mb->mb_x +1].unfiltered_y; + + uint8_t* left_border_y = left_ls.unfiltered_y; + uint8_t* left_border_cb = left_ls.unfiltered_cb; + uint8_t* left_border_cr = left_ls.unfiltered_cr; + + deblock_left = (mb->mb_x > 0); + deblock_top = (mb->mb_y > 0); + + src_y -= ( linesize + 1); + src_cb -= (uvlinesize + 1); + src_cr -=
(uvlinesize + 1); + /* XCHG: b always receives a's old value; a receives b's old value only when xchg is non-zero. */ + #define XCHG(a,b,t,xchg)\ + t= a;\ + if(xchg)\ + a= b;\ + b= t; + + if(deblock_left){ + for(i = !deblock_top; i<16; i++){ + XCHG(left_border_y[i], src_y [i* linesize], temp8, xchg); + } + XCHG(left_border_y[i], src_y [i* linesize], temp8, 1); + + for(i = !deblock_top; i<8; i++){ + XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg); + XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg); + } + XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1); + XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1); + } + + if(deblock_top){ + XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg); + XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1); + if(mb->mb_x+1 < s->mb_width){ + XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1); + } + XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1); + XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1); + } +} +/* Copies the rows saved for macroblock column mb_x in top_ls to just above the destination block: 4 luma rows of 16 bytes and 2 rows of 8 bytes per chroma plane. */ +void copy_top_borders(int mb_x, uint8_t *dst_y, uint8_t *dst_cb, uint8_t *dst_cr, int stride_y, int stride_c){ + qword *qsrc_y = (qword *) (top_ls[mb_x].top_borders_y); + dst_y-= 4*stride_y; + + *((qword *) (dst_y + 0*stride_y)) = *qsrc_y++; + *((qword *) (dst_y + 1*stride_y)) = *qsrc_y++; + *((qword *) (dst_y + 2*stride_y)) = *qsrc_y++; + *((qword *) (dst_y + 3*stride_y)) = *qsrc_y++; + + dst_cb-=2*stride_c; + uint64_t *dsrc_cb = (uint64_t *) (top_ls[mb_x].top_borders_cb); + *((uint64_t *) (dst_cb + 0*stride_c)) = *dsrc_cb++; + *((uint64_t *) (dst_cb + 1*stride_c)) = *dsrc_cb++; + + dst_cr-=2*stride_c; + uint64_t *dsrc_cr = (uint64_t *) (top_ls[mb_x].top_borders_cr); + *((uint64_t *) (dst_cr + 0*stride_c)) = *dsrc_cr++; + *((uint64_t *) (dst_cr + 1*stride_c)) = *dsrc_cr++; +} +/* Stores the block's last 4 luma rows and last 2 rows of each chroma plane into top_ls[mb_x], then DMA-puts that TopBorder entry to the address formed from spe->tgt_spe plus the entry's own local address. */ +static void send_top_borders(H264Context_spu *h, int mb_x, uint8_t* dest_y, uint8_t* dest_cb, uint8_t* dest_cr, int stride_y, int stride_c){ + H264spe *spe= &h->spe; + //fill borders (unfiltered borders already filled
in backup_mb_border) + dest_y+= 12*stride_y; + qword *qtop_y = (qword *) top_ls[mb_x].top_borders_y; + for(int i=0; i<4; i++){ + qword *qdest_y = (qword *) dest_y; + *qtop_y++ = *qdest_y; + dest_y+=stride_y; + } + dest_cb+= 6*stride_c; + dest_cr+= 6*stride_c; + uint64_t *dtop_cb = (uint64_t *) top_ls[mb_x].top_borders_cb; + uint64_t *dtop_cr = (uint64_t *) top_ls[mb_x].top_borders_cr; + for(int i=0; i<2; i++){ + uint64_t *ddest_cb = (uint64_t *) dest_cb; + uint64_t *ddest_cr = (uint64_t *) dest_cr; + + *dtop_cb++ = *ddest_cb; + *dtop_cr++ = *ddest_cr; + + dest_cb+=stride_c; + dest_cr+=stride_c; + } + uint8_t* top_border_tgt = spe->tgt_spe + (unsigned) &top_ls[mb_x]; + spu_dma_put(&top_ls[mb_x], (unsigned) top_border_tgt, sizeof(TopBorder), MBD_put); +} + +static void extend_edges_left(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr , int lines, int lines_c){ + for (int i=0; is; + + uint8_t *dma_y; + uint8_t *dma_cb; + uint8_t *dma_cr; + + uint8_t *extra_y = extra_edge_y; + uint8_t *extra_cb = extra_edge_cb; + uint8_t *extra_cr = extra_edge_cr; + + int pos = (mb_x+2) %4; + if (mb_x == 0){ + if (mb_y ==0){ + extend_edges_left(&dma_y_ls[32*64], &dma_cb_ls[16*32], &dma_cr_ls[16*32], 12, 6); + }else if (mb_y == s->mb_height -1){ + extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 20, 10); + }else { + extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 16, 8); + } + }else if (mb_x == s->mb_width-1){ + dma_y = &dma_y_ls [(pos+1)*16]; + dma_cb = &dma_cb_ls[(pos+1)*8]; + dma_cr = &dma_cr_ls[(pos+1)*8]; + if (mb_y ==0){ + dma_y += 32*64; + dma_cb += 16*32; + dma_cr += 16*32; + extra_y = extra_edge_y + 32*32; + extra_cb= extra_edge_cb + 16*16; + extra_cr= extra_edge_cr + 16*16; + + if (pos==2){ + extend_edges_right(dma_y, dma_cb, dma_cr, 12, 6, 1); + extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 12, 6); + }else if (pos==3){ + extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 12, 6); + }else{ + extend_edges_right(dma_y, 
dma_cb, dma_cr, 12, 6, 2); + } + }else if (mb_y == s->mb_height -1){ + if (pos==2){ + extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 1); + extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10); + }else if (pos==3){ + extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10); + }else{ + extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 2); + } + }else { + if (pos==2){ + extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 1); + extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8); + }else if (pos==3){ + extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8); + }else{ + extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 1); + } + } + } + + if (mb_y == 0){ + dma_y = &dma_y_ls [32*64]; + dma_cb = &dma_cb_ls[16*32]; + dma_cr = &dma_cr_ls[16*32]; + extra_y = extra_edge_y + 32*32; + extra_cb= extra_edge_cb + 16*16; + extra_cr= extra_edge_cr + 16*16; + + if (mb_x ==0){ + extend_edges_top (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8); + extend_edges_top (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8); + extend_edges_top (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8); + }else if (mb_x == s->mb_width -1){ + if (pos==2){ + extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); + extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); + extend_extra_edge_top(extra_y, extra_cb, extra_cr); + }else if (pos == 3){ + extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); + extend_extra_edge_top(extra_y, extra_cb, extra_cr); + }else{ + extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); + extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); + extend_edges_top (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8); + } + }else { + extend_edges_top (dma_y + pos*16, dma_cb + pos*8, dma_cr + pos*8); + } + }else if (mb_y == s->mb_height -1){ + dma_y = &dma_y_ls [19*64]; + dma_cb = &dma_cb_ls[9*32]; + dma_cr 
= &dma_cr_ls[9*32]; + extra_y = extra_edge_y + 19*32; + extra_cb= extra_edge_cb + 9*16; + extra_cr= extra_edge_cr + 9*16; + + if (mb_x ==0){ + extend_edges_bottom (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8); + extend_edges_bottom (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8); + extend_edges_bottom (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8); + }else if (mb_x == s->mb_width -1){ + if (pos==2){ + extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); + extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); + extend_extra_edge_bottom(extra_y, extra_cb, extra_cr); + }else if (pos == 3){ + extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); + extend_extra_edge_bottom(extra_y, extra_cb, extra_cr); + }else{ + extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); + extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); + extend_edges_bottom (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8); + } + }else { + extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); + } + } +} +/* Queues DMA list transfers of the dma_*_ls buffers into the picture, starting pos macroblocks to the left of mb_x. All rows but the first start 4 luma / 2 chroma lines above the block; the first row starts 32 / 16 lines above it. Line counts: first row 12+32 / 6+16, last row 20+32 / 10+16, otherwise 16 / 8. For the last macroblock column with pos > 1 the extra_edge_* buffers are sent as well, 64 luma / 32 chroma bytes further right. */ +static void send_pic_data(H264Context_spu *h, int mb_x, int mb_y, int pos, int stride_y, int stride_c){ + H264slice *s = h->s; + int lines, lines_c; + int linesize = s->linesize; + int uvlinesize = s->uvlinesize; + + uint8_t* dst_y = s->dst_y + (mb_x-pos)*16 + (mb_y*16)*linesize; + uint8_t* dst_cb = s->dst_cb +(mb_x-pos)*8 + (mb_y*8)*uvlinesize; + uint8_t* dst_cr = s->dst_cr +(mb_x-pos)*8 + (mb_y*8)*uvlinesize; + + if (mb_y == 0){ + dst_y -= 32 *linesize; + dst_cb-= 16 *uvlinesize; + dst_cr-= 16 *uvlinesize; + }else { + dst_y -= 4 *linesize; + dst_cb-= 2 *uvlinesize; + dst_cr-= 2 *uvlinesize; + } + + if (mb_y == 0){ + lines = 12+32; lines_c=6+16; + }else if (mb_y == s->mb_height-1){ + lines = 20+32; lines_c=10+16; + }else{ + lines = 16; lines_c=8; + } + + put_list = put_list_buf; + put_dma_list(dma_y_ls, dst_y, stride_y, lines, linesize, MBD_pic); + put_dma_list(dma_cb_ls, dst_cb, stride_c,
lines_c, uvlinesize, MBD_pic); + put_dma_list(dma_cr_ls, dst_cr, stride_c, lines_c, uvlinesize, MBD_pic); + + if (mb_x == s->mb_width-1 && pos>1){ + put_dma_list(extra_edge_y, dst_y+64, 32, lines, linesize, MBD_pic); + put_dma_list(extra_edge_cb, dst_cb+32, 16, lines_c, uvlinesize, MBD_pic); + put_dma_list(extra_edge_cr, dst_cr+32, 16, lines_c, uvlinesize, MBD_pic); + } +} + +void copy_data_and_send(H264Context_spu *h, int mb_x, int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ + H264slice *s = h->s; + int lines, lines_c; + int pos = (mb_x+2)%4; //4 slots in our 64 byte wide transfer buffer. Offset 2 for edge emulation + uint8_t *dma_y = &dma_y_ls[pos*16]; + uint8_t *dma_cb = &dma_cb_ls[pos*8]; + uint8_t *dma_cr = &dma_cr_ls[pos*8]; + + if (mb_y == 0){ + dma_y += 32*64; + dma_cb+= 16*32; + dma_cr+= 16*32; + }else{ + dest_y -= 4*stride_y; + dest_cb-= 2*stride_c; + dest_cr-= 2*stride_c; + } + + if (mb_y == 0){ + lines = 12; lines_c=6; + }else if (mb_y == s->mb_height-1){ + lines = 20; lines_c=10; + }else{ + lines = 16; lines_c=8; + } + + for(int i=0; imb_width-1){ + send_pic_data(h, mb_x, mb_y, pos, 64, 32); + } +} + +static void shift_left(int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ + int lines, lines_c; + if (mb_y > 0){ + lines =20; + lines_c=10; + dest_y -= 4*stride_y; + dest_cb -= 2*stride_c; + dest_cr -= 2*stride_c; + }else { + lines =16; + lines_c= 8; + } + + for (int i=0; is; + H264Mb *mb = h->mb; + const int mb_x= mb->mb_x; + const int mb_y= mb->mb_y; + const int mb_type= mb->mb_type; + + uint8_t *dest_y, *dest_cb, *dest_cr; //ls ptrs (abstracts the fact it is operating in a ls buffer) + + int i; + + void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); + void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); + + dest_y = dest_y_ls + 16 + 4*stride_y; + dest_cb = dest_cb_ls + 8 + 2*stride_c; + dest_cr = dest_cr_ls + 8 + 2*stride_c; + + 
if(IS_8x8DCT(mb_type)){ + idct_dc_add = ff_idct8_dc_add; + idct_add = h->dsp.h264_idct_add[0]; + } + else{ + idct_dc_add = ff_idct_dc_add; + idct_add = h->dsp.h264_idct_add[1]; + } + + if (mb_y>0){ + copy_top_borders(mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c); + } + + if(IS_INTRA(mb_type)){ + xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 1); + + h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cb, stride_c); + h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cr, stride_c); + + if(IS_INTRA4x4(mb_type)){ + if(IS_8x8DCT(mb_type)){ + + for(i=0; i<16; i+=4){ + uint8_t * const ptr= dest_y + block_offset[i]; + const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ]; + const int nnz = mb->non_zero_count_cache[ scan8[i] ]; + h->hpc.pred8x8l[ dir ](ptr, (mb->topleft_samples_available<topright_samples_available<mb[i*16]) + idct_dc_add(ptr, mb->mb + i*16, stride_y); + else{ + idct_add (ptr, mb->mb + i*16, stride_y); + } + } + } + }else{ + for(i=0; i<16; i++){ + uint8_t * const ptr= dest_y + block_offset[i]; + const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ]; + + uint8_t *topright; + int nnz, tr; + if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ + const int topright_avail= (mb->topright_samples_available<hpc.pred4x4[ dir ](ptr, topright, stride_y); + nnz = mb->non_zero_count_cache[ scan8[i] ]; + if(nnz){ + if(nnz == 1 && mb->mb[i*16]) + idct_dc_add(ptr, mb->mb + i*16, stride_y); + else + idct_add (ptr, mb->mb + i*16, stride_y); + } + } + } + + }else{ + h->hpc.pred16x16[ mb->intra16x16_pred_mode ](dest_y , stride_y); + h264_luma_dc_dequant_idct_c(mb->mb, mb->dequant4_coeff_y); + } + xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 0); + + }else { + hl_motion(h, dest_y, dest_cb, dest_cr, stride_y, stride_c); + } + + if(!IS_INTRA4x4(mb_type)){ + if(IS_INTRA16x16(mb_type)){ + for(i=0; i<16; i++){ + if(mb->non_zero_count_cache[ scan8[i] ]) + idct_add(dest_y + block_offset[i], mb->mb + i*16, stride_y); + else if(mb->mb[i*16]) + 
idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y); + } + }else if(mb->cbp&15){ + const int incr = IS_8x8DCT(mb_type) ? 4 : 1; + for(i=0; i<16; i+=incr){ + int nnz = mb->non_zero_count_cache[ scan8[i] ]; + if(nnz){ + if(nnz==1 && mb->mb[i*16]) + idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y); + else + idct_add(dest_y + block_offset[i], mb->mb + i*16, stride_y); + } + } + } + } + + if(mb->cbp&0x30){ + uint8_t *dest[2] = {dest_cb, dest_cr}; + chroma_dc_dequant_idct_c(mb->mb + 16*16, mb->dequant4_coeff_cb); + chroma_dc_dequant_idct_c(mb->mb + 16*16+4*16, mb->dequant4_coeff_cr); + + idct_add = h->dsp.h264_idct_add[1]; + idct_dc_add = ff_idct_dc_add; + for(i=16; i<16+8; i++){ + if(mb->non_zero_count_cache[ scan8[i] ]) + idct_add (dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c); + else if(mb->mb[i*16]) + idct_dc_add(dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c); + } + } + + // save unfiltered borders + backup_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c); + if (mb->deblock_mb){ + filter_mb( h, dest_y, dest_cb, dest_cr, stride_y, stride_c); + } + + if (mb_y < s->mb_height-1){ + if(mb_x>0){ + send_top_borders(h, mb_x-1, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c); + } + if (mb_x == s->mb_width-1){ + send_top_borders(h, mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c); + } + } + update_tgt_spe_dep(h, 0); + + if (h->blocking){ + if (mb_x>0){ + copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c); + wait_dma_id(MBD_pic); + } + if (mb_x == s->mb_width-1){ + copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c); + wait_dma_id(MBD_pic); + } + + }else{ + if (mb_x>0){ + wait_dma_id(MBD_pic); + copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c); + } + if (mb_x == s->mb_width-1){ + wait_dma_id(MBD_pic); + copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c); + } + } + + if (mb_x < 
s->mb_width) + shift_left(mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c); + +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_decode_mb_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_decode_mb_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2009 TUDelft + * + * Cell Parallel SPU - 2DWave Macroblock Decoding. + */ + +/** + * @file libavcodec/cell/h264_decode_mb_spu.h + * Cell Parallel SPU - 2DWave Macroblock Decoding + * @author C C Chi + * + * SIMD kernels + * H.264/AVC motion compensation + * @author Mauricio Alvarez + * @author Albert Paradis + */ + +#ifndef H264_DECODE_MB_SPU_H +#define H264_DECODE_MB_SPU_H + +#define CELL_SPE +#include "libavcodec/avcodec.h" +#include "types_spu.h" +#include "h264_types_spu.h" +#include "h264_mc_spu.h" +#include "h264_dma.h" +#include "dsputil_spu.h" +#include "h264_intra_spu.h" + +/** + * SPU-side H.264 decoding context: parameter, macroblock, slice and motion-compensation argument buffers, plus the indices and pointers that select the current entries. + */ +typedef struct H264Context_spu{ + DECLARE_ALIGNED_16(H264spe, spe); // contains simple type parameters that do not change + DECLARE_ALIGNED_16(H264Mb, mb_buf[3]); // contains simple type parameters that change per macroblock + DECLARE_ALIGNED_16(H264slice, slice_buf[2]); // contains simple type parameters that change per slice + + DSPContext_spu dsp; // struct that contains pointers to mc interpolation functions + H264PredContext_spu hpc; // struct that contains pointers to intra prediction functions + + H264slice *s; + int sl_idx; + int frames; + //mc arg buffer + H264mc mc_buf[2]; + H264mc *mc; //mc ptr to current decoded mb + int mc_idx; + int n_mc; //next mb_id to mc + int mb_proc; + int mb_total; + int curr_line; + + H264Mb* mb; //mb ptr to current decoded mb + int mb_id; //next mb_id to dma + int mb_dec; //mb_buf index - decoded mb + int mb_mc; //mb_buf index - prebuffer motion data + int mb_dma; //mb_buf index - target for dma mb data + int next_mb_idx; +/*// for deblocking filter + int edges[2]; + int start[2]; + int bS[2][4][4]; // dir, 
edge, bS; + int qp[2][4]; // dir, edge; + int chroma_qp[2][2][4]; // cb/cr, dir, edge; +*/ + int blocking; /* non-zero: the macroblock decode routine waits on MBD_pic right after each copy_data_and_send; zero: it waits just before issuing the next one */ +}H264Context_spu; + +void print_output(H264Context_spu* h, const char* msg); +void hl_decode_mb_internal(H264Context_spu *h, int stride_y, int stride_c); +void update_tgt_spe_dep(H264Context_spu *h, int end); + +// IDCT functions. NOTE(review): idct_add and idct_dc_add below are declared without extern, so every file that includes this header gets a tentative definition of these global pointers; the macroblock decode routine in h264_decode_mb_spu.c also declares locals with the same names -- confirm whether the globals are needed +void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); +void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); + +void ff_idct_dc_add(uint8_t *dst, DCTELEM *block, int stride); +void ff_idct8_dc_add(uint8_t *dst, DCTELEM *block, int stride); + +void ff_cropTbl_init(); +void add_pixels8_c(uint8_t *pixels, DCTELEM *block, int line_size); +void add_pixels4_c(uint8_t *pixels, DCTELEM *block, int line_size); +void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul); +void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul); +// Filter functions +//void calculate_bS_qp(H264Context_spu *h); + +// Motion compensation functions +void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc); +void calc_mc_params(H264Mb *mb, H264mc *mc); +void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c); + + +// Function to get traces +void trace_event_SPU(int event, int id); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_direct_spu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_direct_spu.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,332 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 direct mb/block decoding. + * @author Michael Niedermayer + */ +#define CELL_SPE +#include "libavcodec/avcodec.h" +#include "dsputil_spu.h" +#include "h264_tables.h" +#include "h264_types_spu.h" +#include "libavutil/common.h" +#include "libavutil/intreadwrite.h" +#include "mathops_spu.h" +#include "rectangle_spu.h" + +//#undef NDEBUG +#include <assert.h> +static void pred_spatial_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){ /* Spatial direct prediction for macroblock s->m. Per list, ref[] and mv[] are derived from the left, top and top-right (or, if unavailable, top-left) neighbours in m->ref_cache / m->mv_cache; the result is written back to those caches. hc supplies the list-1 data (list1_mb_type, list1_motion_val, list1_ref_index); *mb_type is updated in place, as is m->sub_mb_type. */ + H264Mb *m = s->m; + int b4_stride = hc->b_stride; /* only referenced by commented-out code in this function */ + const int mb_x = m->mb_x; + int mb_type_col[2]; + const int16_t (*l1mv0)[2], (*l1mv1)[2]; + const int8_t *l1ref0, *l1ref1; + const int is_b8x8 = IS_8X8(*mb_type); + unsigned int sub_mb_type= MB_TYPE_L0L1; + int i8, i4; + int ref[2]; + int mv[2]; + int list; + + //assert(h->ref_list[1][0].reference&3); + +#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM) + + /* ref = min(neighbors) */ + for(list=0; list<2; list++){ + int left_ref = m->ref_cache[list][scan8[0] - 1]; + int top_ref = m->ref_cache[list][scan8[0] - 8]; + int refc = m->ref_cache[list][scan8[0] - 8 + 4]; + const int16_t *C= m->mv_cache[list][ scan8[0] - 8 + 4]; + if(refc == PART_NOT_AVAILABLE){ + refc = m->ref_cache[list][scan8[0] - 8 - 1]; + C = m-> mv_cache[list][scan8[0] - 8 - 1]; + } + ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); /* unsigned compare: negative refs become large values, so the smallest non-negative ref wins if there is one */ + if(ref[list] >= 0){ + //this is just pred_motion() but with the cases removed that cannot 
happen for direct blocks + const int16_t * const A= m->mv_cache[list][ scan8[0] - 1 ]; + const int16_t * const B= m->mv_cache[list][ scan8[0] - 8 ]; + + int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]); + if(match_count > 1){ //most common + mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]), + mid_pred(A[1], B[1], C[1]) ); + }else { + assert(match_count==1); + if(left_ref==ref[list]){ + mv[list]= AV_RN32A(A); + }else if(top_ref==ref[list]){ + mv[list]= AV_RN32A(B); + }else{ + mv[list]= AV_RN32A(C); + } + } + }else{ + int mask= ~(MB_TYPE_L0 << (2*list)); + mv[list] = 0; + ref[list] = -1; + if(!is_b8x8) + *mb_type &= mask; + sub_mb_type &= mask; + } + } + + if(ref[0] < 0 && ref[1] < 0){ + ref[0] = ref[1] = 0; + if(!is_b8x8) + *mb_type |= MB_TYPE_L0L1; + sub_mb_type |= MB_TYPE_L0L1; + } + + if(!(is_b8x8|mv[0]|mv[1])){ + fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); + fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); + fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4); + fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4); + *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; + return; + } + + mb_type_col[0] = + mb_type_col[1] = hc->list1_mb_type[mb_x]; + + sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ + if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ + *mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */ + }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ + *mb_type |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); + }else{ + if(!s->direct_8x8_inference_flag){ + /* FIXME save sub mb types from previous frames (or derive from MVs) + * so we know exactly what block size to use */ + sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */ + } + *mb_type |= MB_TYPE_8x8; + } + +// l1mv0 = (void *) &hc->list1_motion_val[0][4*mb_x]; +// l1mv1 = 
(void *) &hc->list1_motion_val[1][4*mb_x]; + l1mv0 = (void *) hc->list1_motion_val[0]; + l1mv1 = (void *) hc->list1_motion_val[1]; + l1ref0 = &hc->list1_ref_index [0][4*mb_x]; + l1ref1 = &hc->list1_ref_index [1][4*mb_x]; +// if(!b8_stride){ +// if(m->mb_y&1){ +// l1ref0 += 2; +// l1ref1 += 2; +// l1mv0 += 2*b4_stride; +// l1mv1 += 2*b4_stride; +// } +// } + + if(IS_16X16(*mb_type)){ + int a,b; + + fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); + fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); + if(!IS_INTRA(mb_type_col[0]) && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1) + || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1 + ))){ + a=b=0; + if(ref[0] > 0) + a= mv[0]; + if(ref[1] > 0) + b= mv[1]; + }else{ + a= mv[0]; + b= mv[1]; + } + fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, a, 4); + fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, b, 4); + }else{ + int n=0; + for(i8=0; i8<4; i8++){ + const int x8 = i8&1; + const int y8 = i8>>1; + + if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) + continue; + m->sub_mb_type[i8] = sub_mb_type; + + fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4); + fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4); + fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1); + fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1); + + /* col_zero_flag */ + if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 )) + ){ + const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? 
l1mv0 : l1mv1; + if(IS_SUB_8X8(sub_mb_type)){ +// const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; + const int16_t *mv_col = l1mv[x8*3 + y8*3*4]; + if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ + if(ref[0] == 0) + fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); + if(ref[1] == 0) + fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); + n+=4; + } + }else{ + int k=0; + for(i4=0; i4<4; i4++){ + //const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; + const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4]; + if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ + if(ref[0] == 0) + AV_ZERO32(m->mv_cache[0][scan8[i8*4+i4]]); + if(ref[1] == 0) + AV_ZERO32(m->mv_cache[1][scan8[i8*4+i4]]); + k++; + } + } + if(!(k&3)) + m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8; + n+=k; + } + } + } + if(!is_b8x8 && !(n&15)){ + *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; + } + } +} + +static void pred_temp_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){ /* Temporal direct prediction for macroblock s->m. List-0 vectors are the list-1 vectors from hc->list1_motion_val scaled as (scale * mv_col + 128) >> 8 with scale taken from s->dist_scale_factor; list-1 vectors are the scaled vector minus mv_col. Results go to m->ref_cache / m->mv_cache; *mb_type and m->sub_mb_type are updated in place. */ + H264Mb *m = s->m; + const int mb_x = m->mb_x; + int b4_stride = hc->b_stride; /* only referenced by commented-out code in this function */ + int mb_type_col[2]; + const int16_t (*l1mv0)[2], (*l1mv1)[2]; + const int8_t *l1ref0, *l1ref1; + const int is_b8x8 = IS_8X8(*mb_type); + unsigned int sub_mb_type; + int i8, i4; + const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]}; + const int *dist_scale_factor = s->dist_scale_factor; + + mb_type_col[0] = + mb_type_col[1] = hc->list1_mb_type[mb_x]; /* both entries take the list-1 macroblock type stored for column mb_x */ + + sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ + if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ + *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */ + }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ + *mb_type |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); + }else{ + 
if(!s->direct_8x8_inference_flag){ + /* FIXME save sub mb types from previous frames (or derive from MVs) + * so we know exactly what block size to use */ + sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */ + } + *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1; + } + +// l1mv0 = (void *) &hc->list1_motion_val[0][4*mb_x]; +// l1mv1 = (void *) &hc->list1_motion_val[1][4*mb_x]; + l1mv0 = (void *) hc->list1_motion_val[0]; + l1mv1 = (void *) hc->list1_motion_val[1]; + l1ref0 = &hc->list1_ref_index [0][4*mb_x]; + l1ref1 = &hc->list1_ref_index [1][4*mb_x]; + + /* one-to-one mv scaling */ + if(IS_16X16(*mb_type)){ + int ref, mv0, mv1; + + fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1); + if(IS_INTRA(mb_type_col[0])){ + ref=mv0=mv1=0; + }else{ + const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]] + : map_col_to_list0[1][l1ref1[0]]; + const int scale = dist_scale_factor[ref0]; + const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0]; + int mv_l0[2]; + mv_l0[0] = (scale * mv_col[0] + 128) >> 8; + mv_l0[1] = (scale * mv_col[1] + 128) >> 8; + ref= ref0; + mv0= pack16to32(mv_l0[0],mv_l0[1]); + mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]); + } + fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1); + fill_rectangle(&m-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4); + fill_rectangle(&m-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4); + }else{ + for(i8=0; i8<4; i8++){ + const int x8 = i8&1; + const int y8 = i8>>1; + int ref0, scale; + const int16_t (*l1mv)[2]= l1mv0; + + if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) + continue; + m->sub_mb_type[i8] = sub_mb_type; + fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1); + if(IS_INTRA(mb_type_col[0])){ + fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1); + fill_rectangle(&m-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); + fill_rectangle(&m-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); + continue; + } + + ref0 = l1ref0[i8]; + if(ref0 >= 0) + ref0 = 
map_col_to_list0[0][ref0 ]; + else{ + ref0 = map_col_to_list0[1][l1ref1[i8]]; + l1mv= l1mv1; + } + scale = dist_scale_factor[ref0]; + + fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1); + if(IS_SUB_8X8(sub_mb_type)){ +// const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; + const int16_t *mv_col = l1mv[x8*3 + y8*3*4]; + int mx = (scale * mv_col[0] + 128) >> 8; + int my = (scale * mv_col[1] + 128) >> 8; + fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4); + fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4); + }else + for(i4=0; i4<4; i4++){ +// const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; + const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4]; + int16_t *mv_l0 = m->mv_cache[0][scan8[i8*4+i4]]; + mv_l0[0] = (scale * mv_col[0] + 128) >> 8; + mv_l0[1] = (scale * mv_col[1] + 128) >> 8; + AV_WN32A(m->mv_cache[1][scan8[i8*4+i4]], + pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1])); + } + } + } +} + +void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){ /* Entry point for direct-mode prediction: runs the spatial variant when s->direct_spatial_mv_pred is non-zero, the temporal variant otherwise; *mb_type is updated by the variant that runs. */ + if(s->direct_spatial_mv_pred){ + pred_spatial_direct_motion(hc, s, mb_type); + }else{ + pred_temp_direct_motion(hc, s, mb_type); + } +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_direct_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_direct_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,8 @@ +#ifndef H264_DIRECT_H /* NOTE(review): guard name omits the _SPU suffix used by the file name h264_direct_spu.h */ +#define H264_DIRECT_H + +#include "h264_types_spu.h" + +void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type); /* direct-mode motion prediction (spatial or temporal, selected by s->direct_spatial_mv_pred) */ + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_dma.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_dma.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,74 @@ +#include /* NOTE(review): header name missing after #include; the mfc_* / spu_* calls below presumably need <spu_mfcio.h> -- confirm against the original patch */ +#include "h264_dma.h" + +DECLARE_ALIGNED_16(dma_list_elem_t, put_list_buf[2*(52+26+26)]); /* presumably one element per transferred row: up to 52 luma + 26 + 26 chroma rows in send_pic_data, doubled for the extra-edge lists -- TODO confirm */ +dma_list_elem_t* put_list; + +DECLARE_ALIGNED_16(dma_list_elem_t, 
get_list_buf[16*(4+5 + 2*3)]); +dma_list_elem_t* get_list; + +inline void spu_dma_get(void *ls, unsigned ea, int size, int tag){ /* DMA get: main-memory address ea -> local-store address ls, size bytes, tag group tag */ + mfc_get(ls, ea, size, tag, 0, 0); +} + +inline void spu_dma_put(void *ls, unsigned ea, int size, int tag){ /* DMA put: local-store address ls -> main-memory address ea */ + mfc_put(ls, ea, size, tag, 0, 0); +} + +inline void spu_dma_barrier_put(void *ls, unsigned ea, int size, int tag){ /* same as spu_dma_put, but issued with the barrier form of the command (mfc_putb) */ + mfc_putb(ls, ea, size, tag, 0, 0); +} + +// Blocks until the DMA transfers issued with tag id have completed +inline void wait_dma_id(int id){ + spu_writech(MFC_WrTagMask, 1<< id); /* select tag group id in the tag mask */ + (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); /* wait until every selected tag group is done */ +} + +// Functions to get/put a block from/to main memory +void get_dma_list(void *dst, void* ea, unsigned int w, unsigned int h, unsigned int stride, unsigned int tag, int barrier) +{ + unsigned int i = 0; + unsigned int listsize; + unsigned int ea_low; + + dma_list_elem_t* list = get_list; + get_list+=h; /* reserve h elements of the global get list for this call */ + + ea_low=(uint32_t) mfc_ea2l(ea); /* low 32 bits of the effective address */ + + /* Create the list; the size of each list element is the "width" parameter (w) given by the caller */ + for ( i=0; i /* NOTE(review): text is missing from here on -- the rest of get_dma_list, presumably put_dma_list, and the start of the next file's header comment */ + * + * SIMD kernels + * H.264/AVC motion compensation + * @author Mauricio Alvarez + * @author Albert Paradis + */ + + +#include +#include +#include /* NOTE(review): the header names are missing after these three #include directives */ + +#include "h264_filter_spu.h" +#include "h264_decode_mb_spu.h" +// To use scan8 table +#include "h264_mc_spu.h" + + +int get_chroma_qp(H264Context_spu *h, int t, int qscale){ /* looks up chroma_qp_table[t][qscale]. NOTE(review): reads h->slice, but H264Context_spu in h264_decode_mb_spu.h declares H264slice *s and slice_buf[2] and no member named slice -- verify */ + return h->slice.chroma_qp_table[t][qscale]; +} + +static inline int clip(int a, int amin, int amax){ /* clamp a to the range [amin, amax] */ + if (a < amin) + return amin; + else if (a > amax) + return amax; + else + return a; +} + +static inline vsint16_t clip_altivec(vsint16_t a, vsint16_t amin, vsint16_t amax){ /* vector clamp of a to [amin, amax]: compare masks pick amin where amin > a and amax where a > amax */ + vector unsigned short min_mask,max_mask; + min_mask = spu_cmpgt(amin, a); + max_mask = spu_cmpgt(a, amax); + + return spu_sel(spu_sel(a,amin,min_mask),amax,max_mask); +} + +static inline vsint16_t clip_uint8_altivec(vsint16_t a){ /* vector clamp of a to [0, 255] */ + const vsint16_t amax = {255,255,255,255,255,255,255,255}; + const vsint16_t amin = {0, 0, 0, 0, 0, 0, 0, 0}; + vector unsigned short 
min_mask,max_mask; + min_mask = spu_cmpgt(amin, a); + max_mask = spu_cmpgt(a, amax); + + return spu_sel(spu_sel(a,amin,min_mask),amax,max_mask); +} + +static inline void h264_loop_filter_chroma(vsint16_t *pix, int alpha, int beta, int8_t *tc0){ + + short a = (short) tc0[0]; + short b = (short) tc0[1]; + short c = (short) tc0[2]; + short d = (short) tc0[3]; + const vsint16_t vec_tc0 = {a,a,b,b,c,c,d,d}; + const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0}; + vector unsigned short mask_B0; + + mask_B0 = spu_cmpgt(vec_v0, vec_tc0); + + const vsint16_t p0 = pix[-1]; + const vsint16_t p1 = pix[-2]; + const vsint16_t q0 = pix[0]; + const vsint16_t q1 = pix[1]; + + const vsint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha}; + const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta}; + const vsint16_t v_2 = {2,2,2,2,2,2,2,2}; + const vuint16_t v_3 = {3,3,3,3,3,3,3,3}; + const vsint16_t v_4 = {4,4,4,4,4,4,4,4}; + + vsint16_t rp0; + vsint16_t rq0; + vsint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0; + vector unsigned short mask_B1, mask_tmp; + vsint16_t i_delta; + + abs_p0mq0 = (vector signed short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); + abs_p1mp0 = (vector signed short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); + abs_q1mq0 = (vector signed short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); + + mask_B1 = spu_cmpgt(v_alpha, abs_p0mq0); + mask_tmp = spu_cmpgt(v_beta, abs_p1mp0); + mask_B1 = spu_and(mask_B1, mask_tmp); + mask_tmp = spu_cmpgt( v_beta, abs_q1mq0); + mask_B1 = spu_and(mask_B1, mask_tmp); + + + i_delta = clip_altivec(spu_rlmaska(spu_add(spu_sl(spu_sub(q0,p0 ), (vuint16_t)v_2), spu_add(spu_sub(p1,q1),v_4)), (vsint16_t)-v_3), -vec_tc0, vec_tc0); + + 
rp0 = clip_uint8_altivec( spu_add(p0,i_delta)); + rq0 = clip_uint8_altivec( spu_sub(q0,i_delta)); + + pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0); + pix[0] = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0); +} + +static void h264_v_loop_filter_luma_c(vsint16_t *pix, int alpha, int beta, int8_t *tc0, int inc_low2high){ + + short a = (short) tc0[0 + inc_low2high]; + short b = (short) tc0[1 + inc_low2high]; + const vsint16_t vec_tc0 = {a,a,a,a,b,b,b,b}; + const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0}; + vector unsigned short mask_B0; + + mask_B0 = spu_cmpgt(vec_v0, vec_tc0); + const vsint16_t p0 = pix[-1]; + const vsint16_t p1 = pix[-2]; + const vsint16_t p2 = pix[-3]; + const vsint16_t q0 = pix[0]; + const vsint16_t q1 = pix[1]; + const vsint16_t q2 = pix[2]; + + const vuint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha}; + const vuint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta}; + + const vuint16_t v_1 = {1,1,1,1,1,1,1,1}; + const vuint16_t v_2 = {2,2,2,2,2,2,2,2}; + const vuint16_t v_3 = {3,3,3,3,3,3,3,3}; + const vsint16_t v_4 = {4,4,4,4,4,4,4,4}; + + vsint16_t rp0, rp1; + vsint16_t rq0, rq1; + vsint16_t tc0_B2P, tc0_B2Q, rtc0; + vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0; + vector unsigned short mask_B1, mask_B2P, mask_B2Q, mask_tmp; + vsint16_t i_delta, i_delta2; + + abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); + abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); + abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); + abs_p2mp0 = (vector unsigned short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0); + 
abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0); + + mask_B1 = spu_cmpgt(v_alpha, abs_p0mq0); + mask_tmp = spu_cmpgt(v_beta, abs_p1mp0); + mask_B1 = spu_and(mask_B1, mask_tmp); + mask_tmp = spu_cmpgt( v_beta, abs_q1mq0); + mask_B1 = spu_and(mask_B1, mask_tmp); + + mask_B2P = spu_cmpgt(v_beta, abs_p2mp0); + mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0); + + rp1 = spu_add(p1, clip_altivec(spu_sub(spu_rlmaska(spu_add(p2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), p1), -vec_tc0, vec_tc0 )); + rq1 = spu_add(q1, clip_altivec(spu_sub(spu_rlmaska(spu_add(q2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), q1), -vec_tc0, vec_tc0 )); + + tc0_B2P = spu_add(vec_tc0, (vsint16_t) v_1); + tc0_B2P = spu_sel(vec_tc0, tc0_B2P, mask_B2P); + + tc0_B2Q = spu_add(tc0_B2P, (vsint16_t) v_1); + rtc0 = spu_sel(tc0_B2P, tc0_B2Q, mask_B2Q); + i_delta2 = spu_add(spu_sub(p1,q1),v_4); + i_delta = spu_sl(spu_sub(q0,p0 ), v_2); + i_delta = spu_add(i_delta,i_delta2 ); + i_delta = spu_rlmaska(i_delta, (vsint16_t)-v_3); + i_delta = clip_altivec(i_delta, -rtc0, rtc0); + + rp0 = clip_uint8_altivec( spu_add(p0,i_delta)); /* p0' */ + rq0 = clip_uint8_altivec( spu_sub(q0,i_delta)); /* q0' */ + + pix[-2] = spu_sel(spu_sel(p1,spu_sel(p1,rp1,mask_B2P) ,mask_B1), p1,mask_B0); + pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0); + pix[0] = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0); + pix[1] = spu_sel(spu_sel(q1,spu_sel(q1,rq1,mask_B2Q) ,mask_B1), q1,mask_B0); +} + + + +static inline void h264_loop_filter_chroma_intra(vsint16_t *pix, int alpha, int beta){ + + const vuint16_t p0 = (vuint16_t) pix[-1]; + const vuint16_t p1 = (vuint16_t) pix[-2]; + const vuint16_t q0 = (vuint16_t) pix[0]; + const vuint16_t q1 = (vuint16_t) pix[1]; + + const vsint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) 
alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha}; + const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta}; + const vuint16_t v_2 = {2,2,2,2,2,2,2,2}; + + vuint16_t rp0; + vuint16_t rq0; + vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0; + vector unsigned short mask_B0, mask_tmp; + + abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); + abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); + abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); + + mask_B0 = spu_cmpgt(v_alpha, (vsint16_t)abs_p0mq0); + mask_tmp = spu_cmpgt(v_beta, (vsint16_t)abs_p1mp0); + mask_B0 = spu_and(mask_B0, mask_tmp); + mask_tmp = spu_cmpgt( v_beta, (vsint16_t)abs_q1mq0); + mask_B0 = spu_and(mask_B0, mask_tmp); + + rp0 = spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2; + rp0 = spu_rlmaska(rp0, (vsint16_t)-v_2); + rq0 = spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2; + rq0 = spu_rlmaska(rq0, (vsint16_t)-v_2); + + pix[-1] = (vsint16_t) spu_sel(p0, rp0, mask_B0); + pix[0] = (vsint16_t) spu_sel(q0, rq0, mask_B0); +} +int slice_alpha_c0_offset; +int slice_beta_offset; +static void filter_mb_edgecv(vsint16_t *pix, int bS[4], int qp ) { + int i; + const int index_a = qp + slice_alpha_c0_offset; + const int alpha = (alpha_table+52)[index_a]; + const int beta = (beta_table+52)[qp + slice_beta_offset]; + + if( bS[0] < 4 ) { + int8_t tc[4]; + for(i=0; i<4; i++) + tc[i] = bS[i] ? 
tc0_table[index_a][bS[i] - 1] + 1 : 0; + h264_loop_filter_chroma(pix, alpha, beta, tc); + } else { + h264_loop_filter_chroma_intra(pix, alpha, beta); + } +} + +static void filter_mb_edgeh(vsint16_t *pix, int bS[4], int qp, int inc_low2high ) { + int i; + const int index_a = qp + slice_alpha_c0_offset; + const int alpha = (alpha_table+52)[index_a]; + const int beta = (beta_table+52)[qp + slice_beta_offset]; + + if( bS[0] < 4 ) { + int8_t tc[4]; + for(i=0; i<4; i++) + tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1; + h264_v_loop_filter_luma_c(pix, alpha, beta, tc, inc_low2high); + } else { + + const vuint16_t p0 = (vuint16_t) pix[-1]; + const vuint16_t p1 = (vuint16_t) pix[-2]; + const vuint16_t p2 = (vuint16_t) pix[-3]; + const vuint16_t p3 = (vuint16_t) pix[-4]; + const vuint16_t q0 = (vuint16_t) pix[0]; + const vuint16_t q1 = (vuint16_t) pix[1]; + const vuint16_t q2 = (vuint16_t) pix[2]; + const vuint16_t q3 = (vuint16_t) pix[3]; + + const vuint16_t v_alpha = {(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha}; + const vuint16_t v_beta = {(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta}; + const vuint16_t v_2 = {2,2,2,2,2,2,2,2}; + const vuint16_t v_3 = {3,3,3,3,3,3,3,3}; + const vsint16_t v_4 = {4,4,4,4,4,4,4,4}; + + vuint16_t rp0_B1f, rp0_B2t, rp0_B2f, rp1_B2t, rp2_B2t; + vuint16_t rq0_B1f, rq0_B2t, rq0_B2f, rq1_B2t, rq2_B2t; + vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0; + vuint16_t v_alpha_2 = spu_rlmaska(v_alpha, (vsint16_t)-v_2); + vector unsigned short mask_B0, mask_B1, mask_B2P, mask_B2Q, mask_tmp; + + v_alpha_2 = spu_add(v_alpha_2, v_2); + + abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); + abs_p1mp0 = (vector unsigned 
short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); + abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); + abs_p2mp0 = (vector unsigned short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0); + abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0); + + mask_B0 = spu_cmpgt(v_alpha, abs_p0mq0); + mask_tmp = spu_cmpgt(v_beta, abs_p1mp0); + mask_B0 = spu_and(mask_B0, mask_tmp); + mask_tmp = spu_cmpgt( v_beta, abs_q1mq0); + mask_B0 = spu_and(mask_B0, mask_tmp); + + mask_B1 = spu_cmpgt(v_alpha_2, abs_p0mq0); + mask_B2P = spu_cmpgt(v_beta,abs_p2mp0); + mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0); + + rp0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p2,p1),spu_add(p1,p0)),spu_add(spu_add(p0,q0),spu_add(q0,q1))),(vuint16_t)v_4),(vsint16_t) -v_3); + //( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; + rp1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p2,p1),spu_add(q0,p0)),v_2),(vsint16_t)-v_2);//( p2 + p1 + p0 + q0 + 2 ) >> 2; + rp2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p3,p3),spu_add(p2,p2)),spu_add(spu_add(p2,p1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3); + //( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; + rq0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p1,p0),spu_add(p0,q0)),spu_add(spu_add(q0,q1),spu_add(q1,q2))),(vuint16_t)v_4),(vsint16_t)-v_3); + + //( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; + rq1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p0,q0),spu_add(q1,q2)),v_2),(vsint16_t)-v_2);//( p0 + q0 + q1 + q2 + 2 ) >> 2; + rq2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(q3,q3),spu_add(q2,q2)),spu_add(spu_add(q2,q1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3); + //( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; + rp0_B1f = + rp0_B2f = spu_rlmaska(spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2),(vsint16_t)-v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2; + rq0_B1f = + rq0_B2f = 
spu_rlmaska(spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2),(vsint16_t)-v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2; + + pix[-1] = (vsint16_t) spu_sel(p0, spu_sel(rp0_B1f, spu_sel(rp0_B2f, rp0_B2t, mask_B2P), mask_B1), mask_B0); + pix[-2] = (vsint16_t) spu_sel(p1, spu_sel(p1, spu_sel(p1, rp1_B2t, mask_B2P), mask_B1), mask_B0); + pix[-3] = (vsint16_t) spu_sel(p2, spu_sel(p2, spu_sel(p2, rp2_B2t, mask_B2P), mask_B1), mask_B0); + pix[0] = (vsint16_t) spu_sel(q0, spu_sel(rq0_B1f, spu_sel(rq0_B2f, rq0_B2t, mask_B2Q), mask_B1), mask_B0); + pix[1] = (vsint16_t) spu_sel(q1, spu_sel(q1, spu_sel(q1, rq1_B2t,mask_B2Q), mask_B1), mask_B0); + pix[2] = (vsint16_t) spu_sel(q2, spu_sel(q2, spu_sel(q2, rq2_B2t,mask_B2Q), mask_B1), mask_B0); + } +} + +// This function gets bS and qp for luma and chroma before the filter +void calculate_bS_qp(H264Context_spu *h){ + H264mb* mb = &h->mb; + H264slice* slice = h->slice; + int dir; + const int mvy_limit = 4; + /* FIXME: A given frame may occupy more than one position in + * the reference list. So ref2frm should be populated with + * frame numbers, not indices. */ + + int (*ref2frm)[64] = slice->ref2frm; + int mb_x = mb->mb_x; + int mb_y = mb->mb_y; + int mb_type =mb->mb_type; + /* dir : 0 -> vertical edge, 1 -> horizontal edge */ + for( dir = 0; dir < 2; dir++ ){ + int edge; + const int mbm_type = dir == 0 ? mb->mb_type_xy_n1 : mb->mb_type_top; + const int8_t qscale_mbm = dir == 0 ? mb->qscale_mbxy_n1 : mb->qscale_mbxy_top; + + // how often to recheck mv-based bS when iterating between edges + const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :(mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0; + // how often to recheck mv-based bS when iterating along each edge + const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); + + h->edges[dir] = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP)) == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 
1 : 4; + + if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0)) + h->start[dir] =1; + else + h->start[dir] =0; + + /* Calculate bS */ + for( edge = h->start[dir]; edge < h->edges[dir]; edge++ ) { + /* mbn_xy: neighbor macroblock */ + const int mbn_type = edge > 0 ? mb_type : mbm_type; + const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm; + int* bS = h->bS[dir][edge]; + + if( (edge&1) && IS_8x8DCT(mb_type) ){ + bS[0] = bS[1] = bS[2] = bS[3] = 0; //extra code due to decoupling + continue; + } + if( IS_INTRA(mb_type) || + IS_INTRA(mbn_type) ) { + int value; + if (edge == 0) { + value = 4; + } else { + value = 3; + } + bS[0] = bS[1] = bS[2] = bS[3] = value; + } else { + int i, l; + int mv_done; + + if( edge & mask_edge ) { + bS[0] = bS[1] = bS[2] = bS[3] = 0; + mv_done = 1; + } + else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { + int b_idx= 8 + 4 + edge * (dir ? 8:1); + int bn_idx= b_idx - (dir ? 8:1); + int v = 0; + + for( l = 0; !v && l < 1 + (slice->slice_type_nos == FF_B_TYPE); l++ ) { + v |= ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] || + FFABS(mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || + FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit; + } + bS[0] = bS[1] = bS[2] = bS[3] = v; + + mv_done = 1; + } + else + mv_done = 0; + + for( i = 0; i < 4; i++ ) { + int x = dir == 0 ? edge : i; + int y = dir == 0 ? i : edge; + int b_idx= 8 + 4 + x + 8*y; + int bn_idx= b_idx - (dir ? 
8:1); + + if( mb->non_zero_count_cache[b_idx] != 0 || + mb->non_zero_count_cache[bn_idx] != 0 ) { + bS[i] = 2; + } + else if(!mv_done) + { + bS[i] = 0; + for( l = 0; l < 1 + (slice->slice_type == B_TYPE); l++ ) { + if( ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] || + FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || + FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) { + bS[i] = 1; + break; + } + } + } + } + + if(bS[0]+bS[1]+bS[2]+bS[3] == 0) + continue; + } + + /* Filter edge */ + // Do not use s->qscale as luma quantizer because it has not the same + // value in IPCM macroblocks. + h->qp[dir][edge] = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1; + h->chroma_qp[0][dir][edge] = ( mb->chroma_qp[0] + get_chroma_qp(h, 0, qscale_mbn_xy ) + 1 ) >> 1; + + h->chroma_qp[1][dir][edge] = ( mb->chroma_qp[1] + get_chroma_qp(h, 1, qscale_mbn_xy ) + 1 ) >> 1; + } + slice_alpha_c0_offset=slice->slice_alpha_c0_offset; + slice_beta_offset= slice->slice_beta_offset; + } +} + + +#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7,merge_h,merge_l) \ + b0 = spu_shuffle( a0, a4, merge_h); \ + b1 = spu_shuffle( a0, a4, merge_l ); \ + b2 = spu_shuffle( a1, a5, merge_h ); \ + b3 = spu_shuffle( a1, a5, merge_l ); \ + b4 = spu_shuffle( a2, a6, merge_h ); \ + b5 = spu_shuffle( a2, a6, merge_l ); \ + b6 = spu_shuffle( a3, a7, merge_h ); \ + b7 = spu_shuffle( a3, a7, merge_l ); \ + a0 = spu_shuffle( b0, b4, merge_h ); \ + a1 = spu_shuffle( b0, b4, merge_l ); \ + a2 = spu_shuffle( b1, b5, merge_h ); \ + a3 = spu_shuffle( b1, b5, merge_l ); \ + a4 = spu_shuffle( b2, b6, merge_h ); \ + a5 = spu_shuffle( b2, b6, merge_l); \ + a6 = spu_shuffle( b3, b7, merge_h ); \ + a7 = spu_shuffle( b3, b7, merge_l ); \ + b0 = spu_shuffle( a0, a4, merge_h ); \ + b1 = spu_shuffle( a0, a4, merge_l ); \ + b2 = spu_shuffle( a1, a5, merge_h ); \ + b3 = spu_shuffle( a1, a5, merge_l); \ + b4 = spu_shuffle( a2, a6, merge_h 
); \ + b5 = spu_shuffle( a2, a6, merge_l ); \ + b6 = spu_shuffle( a3, a7, merge_h ); \ + b7 = spu_shuffle( a3, a7, merge_l ) + +void filter_mb_spu(vsint16_t *img_y, vsint16_t *img_cb, vsint16_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int edges[2], int bS[2][4][4], int qp[2][4], int chroma_qp[2][2][4], int start[2]){ + + int dir,x; + vsint16_t o_vec_img_y[(16+8)*2]; + vsint16_t t_vec_img_y[(16+8)*2]; + vsint16_t *vec_img_y_o = o_vec_img_y; + vsint16_t *vec_img_y_t = t_vec_img_y; + + vsint16_t o_vec_img_cb[8+8+4]; + vsint16_t t_vec_img_cb[8+8]; + vsint16_t *vec_img_cb_o = &o_vec_img_cb[2]; + vsint16_t *vec_img_cb_t = t_vec_img_cb; + + vsint16_t o_vec_img_cr[8+8+4]; + vsint16_t t_vec_img_cr[8+8]; + vsint16_t *vec_img_cr_o = &o_vec_img_cr[2]; + vsint16_t *vec_img_cr_t = t_vec_img_cr; + + vuint8_t *pvec_tmp; + + const vuint8_t patt_high = {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}; + const vuint8_t patt_low = {16, 8, 17, 9, 18, 10, 19, 11, 20, 12, 21, 13, 22, 14, 23, 15}; + const vuint8_t patt_unpack={ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; + const vuint8_t patt_pack_hw={0, 1, 2, 3, 4, 5, 6, 7, 17, 19, 21, 23, 25, 27, 29, 31}; + const vuint8_t patt_pack_chroma_aligned={0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}; + const vuint8_t patt_pack_chroma_unaligned={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F}; + const vuint8_t v_0 = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + const vuint8_t mergehu16 = {0x00,0x01,0x10,0x11,0x02,0x03,0x12,0x13,0x04,0x05,0x14,0x15,0x06,0x07,0x16,0x17}; + const vuint8_t mergelu16 = {0x08,0x09,0x18,0x19,0x0A,0x0B,0x1A,0x1B,0x0C,0x0D,0x1C,0x1D,0x0E,0x0F,0x1E,0x1F}; + vuint8_t store_chroma, store_chroma_n1, load_chroma, load_chroma_n1; + int mb_xy_n1; + const int unalign_chroma = (unsigned int) img_cb & 15; + + if(unalign_chroma==0){ + load_chroma = patt_high; + load_chroma_n1 = patt_low; // for load 
chroma mb_x-1 + store_chroma = patt_pack_chroma_aligned; + store_chroma_n1 = patt_pack_chroma_unaligned; // for store chroma mb_x-1 + mb_xy_n1 = 1; // no misalignment: the previous block is needed to filter horizontally + } + else{ + load_chroma = patt_low; + load_chroma_n1 = patt_high; // for load mb_x-1 + store_chroma = patt_pack_chroma_unaligned; + store_chroma_n1 = patt_pack_chroma_aligned; // for store chroma mb_x-1 + mb_xy_n1 = 0; // with a misalignment of 8 the previous block is not needed + } + + /* dir : 0 -> vertical edge, 1 -> horizontal edge */ + + // LOAD MB_X -1 + + for (x = 0; x < 16; x++){ //Unpack Memory to 8 positions vector + vec_img_y_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize - 1], v_0 , patt_low); + } + + for (x = 0; x < 8; x++){ //Unpack Memory to 8 positions vector + vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cb[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1); + vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cr[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1); + } + + VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16); + + 
VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16); + + vec_img_y_t = &vec_img_y_t[8]; + vec_img_y_o = &vec_img_y_o[8]; + vec_img_cb_t = &vec_img_cb_t[8]; + vec_img_cb_o = &vec_img_cb_o[10]; + vec_img_cr_t = &vec_img_cr_t[8]; + vec_img_cr_o = &vec_img_cr_o[10]; + + //LOAD CURRENT MB + for (x = 0; x < 16; x++){ //Unpack Memory to 8 positions vector + pvec_tmp = (vuint8_t *) &img_y[x*linesize]; + vec_img_y_o[x] = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_high); + vec_img_y_o[x+24] = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_low); + } + + for (x = 0; x < 8; x++){ //Unpack Memory to 8 positions vector + vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , load_chroma); + vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma); + } + + //TRANSPOSE MATRIX + + VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31], vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], 
vec_img_y_t[15],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39], vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16); + + //PROCESS + dir = 0; + { + int edge; + for( edge = start[dir]; edge < edges[dir]; edge++ ) { + if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0) + { + filter_mb_edgeh( &vec_img_y_t[4*edge ], bS[dir][edge], qp[dir][edge],0);//low + filter_mb_edgeh( &vec_img_y_t[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high + + if( (edge&1) == 0 ) { + filter_mb_edgecv( &vec_img_cb_t[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] ); + filter_mb_edgecv( &vec_img_cr_t[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] ); + } + } + } + } + + //SAVE MB_X -1 RESULTS + + VEC_TRANSPOSE_8(vec_img_y_t[-8], vec_img_y_t[-7], vec_img_y_t[-6], vec_img_y_t[-5], vec_img_y_t[-4], vec_img_y_t[-3], vec_img_y_t[-2], vec_img_y_t[-1], vec_img_y_o[-8], vec_img_y_o[-7], vec_img_y_o[-6], vec_img_y_o[-5], vec_img_y_o[-4], vec_img_y_o[-3], vec_img_y_o[-2], vec_img_y_o[-1],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_y_t[16], vec_img_y_t[17], vec_img_y_t[18], vec_img_y_t[19], vec_img_y_t[20], vec_img_y_t[21], 
vec_img_y_t[22], vec_img_y_t[23], vec_img_y_o[16], vec_img_y_o[17], vec_img_y_o[18], vec_img_y_o[19], vec_img_y_o[20], vec_img_y_o[21], vec_img_y_o[22], vec_img_y_o[23],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_cb_t[ -8], vec_img_cb_t[-7], vec_img_cb_t[-6], vec_img_cb_t[-5], vec_img_cb_t[-4], vec_img_cb_t[-3], vec_img_cb_t[-2], vec_img_cb_t[-1], vec_img_cb_o[-10], vec_img_cb_o[-9], vec_img_cb_o[-8], vec_img_cb_o[-7], vec_img_cb_o[-6], vec_img_cb_o[-5], vec_img_cb_o[-4], vec_img_cb_o[-3],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_cr_t[ -8], vec_img_cr_t[-7], vec_img_cr_t[-6], vec_img_cr_t[-5], vec_img_cr_t[-4], vec_img_cr_t[-3], vec_img_cr_t[-2], vec_img_cr_t[-1], vec_img_cr_o[-10], vec_img_cr_o[-9], vec_img_cr_o[-8], vec_img_cr_o[-7], vec_img_cr_o[-6], vec_img_cr_o[-5], vec_img_cr_o[-4], vec_img_cr_o[-3],mergehu16, mergelu16); + + for (x = 0; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory + img_y[x*linesize - 1] = spu_shuffle(img_y[x*linesize - 1], vec_img_y_o[-8+x], patt_pack_hw); + } + + for (x = 0; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory + img_y[(x+8)*linesize - 1] = spu_shuffle(img_y[(x+8)*linesize - 1], vec_img_y_o[16+x], patt_pack_hw); + } + + for (x = 0; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory + img_cb[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cb[x*uvlinesize - mb_xy_n1], vec_img_cb_o[-10+x], store_chroma_n1); + img_cr[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cr[x*uvlinesize - mb_xy_n1], vec_img_cr_o[-10+x], store_chroma_n1); + } + + //TRANSPOSE MATRIX + + VEC_TRANSPOSE_8(vec_img_y_t[ 0], vec_img_y_t[ 1], vec_img_y_t[ 2], vec_img_y_t[ 3], vec_img_y_t[ 4], vec_img_y_t[ 5], vec_img_y_t[ 6], vec_img_y_t[ 7], vec_img_y_o[ 0], vec_img_y_o[ 1], vec_img_y_o[ 2], vec_img_y_o[ 3], vec_img_y_o[ 4], vec_img_y_o[ 5], vec_img_y_o[ 6], vec_img_y_o[ 7],mergehu16, mergelu16); + + 
VEC_TRANSPOSE_8(vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], vec_img_y_t[15], vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31], vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39], vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7], vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7],mergehu16, mergelu16); + + VEC_TRANSPOSE_8(vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7], vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7],mergehu16, mergelu16); + + + //LOAD MB_Y - 1 + for (x = -4; x < 0; x++){ //Unpack Memory to 8 positions vector + vec_img_y_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_high); + vec_img_y_o[x+24] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_low); + } + + for (x = -2; x < 0; x++){ //Unpack Memory to 8 positions vector + vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , 
load_chroma); + vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma); + } + + //PROCESS + dir = 1; + { + int edge; + for( edge = start[dir]; edge < edges[dir]; edge++ ) { + if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0) + { + filter_mb_edgeh( &vec_img_y_o[4*edge ], bS[dir][edge], qp[dir][edge],0);//low + filter_mb_edgeh( &vec_img_y_o[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high + if( (edge&1) == 0 ) { + filter_mb_edgecv( &vec_img_cb_o[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] ); + filter_mb_edgecv( &vec_img_cr_o[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] ); + } + } + } + + for (x = -3; x < 16; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory + img_y[x*linesize] = spu_shuffle(vec_img_y_o[x], vec_img_y_o[x+24], patt_unpack); + } + + for (x = -1; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory + img_cb[x*uvlinesize] = spu_shuffle(img_cb[x*uvlinesize], vec_img_cb_o[x], store_chroma); + img_cr[x*uvlinesize] = spu_shuffle(img_cr[x*uvlinesize], vec_img_cr_o[x], store_chroma); + } + } +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_idct_spu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_idct_spu.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2009 TUDelft + * + * Cell Parallel SPU - Macroblock Decoding. 
+ */ + +/** + * @file libavcodec/cell/h264_idct_spu.c + * Cell Parallel SPU - Macroblock Decoding + * @author C C Chi + * + * SIMD kernels + * H.264/AVC motion compensation + * @author Mauricio Alvarez + * @author Albert Paradis + */ + +#include +#include "types_spu.h" +#include "h264_tables.h" +#include "h264_idct_spu.h" +#include "h264_intra_spu.h" + +/*********************************************************************** + * ff_h264_idct_add_spu + *********************************************************************** + * h264 idct 4x4 transform with SPU SIMD intrinsics + * using the factorized algorithm + * Mauricio Alvarez: alvarez@ac.upc.edu + * - DCTELEM* block: transformed coefficients are stored consecutively in memory, + * - for the 4x4 transform the layout is as follows: + * || coef_00 | coef_01 || coef_02 | coef_03 ||..||coef_0F|| + * - Usually the DCTELEM block is declared with an alignment modifier in such a way + * that the array is 128 bit (16 byte, 8 short) aligned. + * - The dst pointer can be unaligned, but its misalignment must be a multiple of 4. 
+ ***********************************************************************/ + +// idct_dc +void ff_idct_dc_add(uint8_t *dst, short *block, int stride){ + int i, j; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + int dc = (block[0] + 32) >> 6; + for( j = 0; j < 4; j++ ){ + for( i = 0; i < 4; i++ ) + dst[i] = cm[ dst[i] + dc ]; + dst += stride; + } +} + +void ff_idct8_dc_add(uint8_t *dst, short *block, int stride){ + int i, j; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + int dc = (block[0] + 32) >> 6; + for( j = 0; j < 8; j++ ){ + for( i = 0; i < 8; i++ ) + dst[i] = cm[ dst[i] + dc ]; + dst += stride; + } +} + +// add without idct + +void add_pixels8_c(uint8_t *pixels, short *block, int line_size) +{ + int i; + for(i=0;i<8;i++) { + pixels[0] += block[0]; + pixels[1] += block[1]; + pixels[2] += block[2]; + pixels[3] += block[3]; + pixels[4] += block[4]; + pixels[5] += block[5]; + pixels[6] += block[6]; + pixels[7] += block[7]; + pixels += line_size; + block += 8; + } +} + +void add_pixels4_c(uint8_t *pixels, short *block, int line_size) +{ + int i; + for(i=0;i<4;i++) { + pixels[0] += block[0]; + pixels[1] += block[1]; + pixels[2] += block[2]; + pixels[3] += block[3]; + pixels += line_size; + block += 4; + } +} + +void h264_luma_dc_dequant_idct_c(short *block, int qmul){ + #define stride 16 + int i; + int temp[16]; //FIXME check if this is a good idea + static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; + static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; + + for(i=0; i<4; i++){ + const int offset= y_offset[i]; + const int z0= block[offset+stride*0] + block[offset+stride*4]; + const int z1= block[offset+stride*0] - block[offset+stride*4]; + const int z2= block[offset+stride*1] - block[offset+stride*5]; + const int z3= block[offset+stride*1] + block[offset+stride*5]; + + temp[4*i+0]= z0+z3; + temp[4*i+1]= z1+z2; + temp[4*i+2]= z1-z2; + temp[4*i+3]= z0-z3; + } + + for(i=0; i<4; i++){ + const int offset= x_offset[i]; + const int z0= temp[4*0+i] + 
temp[4*2+i]; + const int z1= temp[4*0+i] - temp[4*2+i]; + const int z2= temp[4*1+i] - temp[4*3+i]; + const int z3= temp[4*1+i] + temp[4*3+i]; + + block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual + block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); + block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); + block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); + } +} +#undef stride + +void chroma_dc_dequant_idct_c(short *block, int qmul){ + const int stride= 16*2; + const int xStride= 16; + int a,b,c,d,e; + + a= block[stride*0 + xStride*0]; + b= block[stride*0 + xStride*1]; + c= block[stride*1 + xStride*0]; + d= block[stride*1 + xStride*1]; + + e= a-b; + a= a+b; + b= c-d; + c= c+d; + + block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7; + block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7; + block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7; + block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7; +} + +void h264_idct4_add_spu(uint8_t *dst, short *block, int stride) +{ + vsint16_t __vz0, __vz1, __vz2, __vz3; // used as temporal storage in for VEC_1D_DCT + vsint16_t va0, va1, va2, va3; + vsint16_t vtmp0, vtmp1, vtmp2, vtmp3; + vuint16_t sat; + vuint8_t va_u8; + vsint16_t vdst_ss; + vuint8_t dstperm; + vuint8_t vdst, vdst_orig, vfdst; + const int16_t imax = 255; + const vsint32_t vzero = spu_splats(0); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + const int shift_dst = (unsigned int) dst & 15; + const vuint8_t packu16 = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F); + const vuint8_t mergehu8 = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17); + //for optimized matrix transpose: + const vuint8_t tr0 =AVV(0x00,0x01,0x08,0x09,0x10,0x11,0x18,0x19,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); + const vuint8_t tr1 =AVV(0x02,0x03,0x0A,0x0B,0x12,0x13,0x1A,0x1B,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); + const vuint8_t tr2 
=AVV(0x04,0x05,0x0C,0x0D,0x14,0x15,0x1C,0x1D,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); + const vuint8_t tr3 =AVV(0x06,0x07,0x0E,0x0F,0x16,0x17,0x1E,0x1F,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); + const vuint8_t conc =AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17); + + block[0] += 32; // add 32 as a DC-level for rounding + + //load matrix + vtmp0 = *(vsint16_t *)(block); + vtmp1 = spu_rlqwbyte(vtmp0,8); + vtmp2 = *(vsint16_t *)(block+8); + vtmp3 = spu_rlqwbyte(vtmp2,8); + + VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); + + //concatenate first two rows of matrix + va0=spu_shuffle(va0,va1,conc); + //concatenate last two rows of matrix + va2=spu_shuffle(va2,va3,conc); + + //do transpose starting from two vectors, storing as four vectors of which the second part is unused + vtmp0 = spu_shuffle( va0, va2, tr0); + vtmp1 = spu_shuffle( va0, va2, tr1); + vtmp2 = spu_shuffle( va0, va2, tr2); + vtmp3 = spu_shuffle( va0, va2, tr3); + + VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); + + // division by 64 + va0 = spu_rlmaska(va0,-6); + va1 = spu_rlmaska(va1,-6); + va2 = spu_rlmaska(va2,-6); + va3 = spu_rlmaska(va3,-6); + + switch (shift_dst){ + case 0: { + dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + } break; + case 4: { + dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + } break; + case 8: { + dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x10, 0x11, 0x12, 0x13, 0x0C, 0x0D, 0x0E, 0x0F); + } break; + case 12: { + dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13); + } break; + default: { + dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + } break; + } + + 
VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm); + dst += stride; + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm); + dst += stride; + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm); + dst += stride; + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm); +} + +void h264_idct8_add_spu(uint8_t *dst, short *block, int stride) +{ + vsint16_t va0, va1, va2, va3, va4, va5, va6, va7; + vsint16_t vza0, vza1, vza2, vza3, vza4, vza5, vza6, vza7, vzal,vzah; + vsint16_t vzb0, vzb1, vzb2, vzb3, vzb4, vzb5, vzb6, vzb7; + vsint16_t vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, vtmp6, vtmp7; + vuint16_t sat; + vuint8_t va_u8; + const int block_stride=8; + vsint16_t vdst_ss; + const int16_t imax = 255; + const vsint32_t vzero = spu_splats(0); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint8_t vdst, vdst_orig, vfdst; + vuint8_t dstperm; + const int shift_dst = (unsigned int) dst & 15; + const vuint8_t packu16 = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F); + const vuint8_t mergehu8 = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17); + const vuint8_t m1 = AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17); + const vuint8_t m2 = AVV(0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F); + const vuint8_t m3 = AVV(0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x18,0x19,0x1A,0x1B); + const vuint8_t m4 = AVV(0x14,0x15,0x16,0x17,0x04,0x05,0x06,0x07,0x1C,0x1D,0x1E,0x1F,0x0C,0x0D,0x0E,0x0F); + const vuint8_t m5 = AVV(0x00,0x01,0x10,0x11,0x04,0x05,0x14,0x15,0x08,0x09,0x18,0x19,0x0C,0x0D,0x1C,0x1D); + const vuint8_t m6 = AVV(0x12,0x13,0x02,0x03,0x16,0x17,0x06,0x07,0x1A,0x1B,0x0A,0x0B,0x1E,0x1F,0x0E,0x0F); + + block[0] += 32; // add 32 as a DC-level for rounding + + vtmp0 = *(vsint16_t *)(block); + vtmp1 = *(vsint16_t *)(block + block_stride); + vtmp2 = *(vsint16_t 
*)(block + 2*block_stride); + vtmp3 = *(vsint16_t *)(block + 3*block_stride); + vtmp4 = *(vsint16_t *)(block + 4*block_stride); + vtmp5 = *(vsint16_t *)(block + 5*block_stride); + vtmp6 = *(vsint16_t *)(block + 6*block_stride); + vtmp7 = *(vsint16_t *)(block + 7*block_stride); + + VEC_1D_DCT8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7); + VEC_TRANSPOSE_8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7,va0,va1,va2,va3,va4,va5,va6,va7); + VEC_1D_DCT8(va0, va1, va2, va3, va4, va5, va6, va7); + + va0 = spu_rlmaska(va0,-6); + va1 = spu_rlmaska(va1,-6); + va2 = spu_rlmaska(va2,-6); + va3 = spu_rlmaska(va3,-6); + va4 = spu_rlmaska(va4,-6); + va5 = spu_rlmaska(va5,-6); + va6 = spu_rlmaska(va6,-6); + va7 = spu_rlmaska(va7,-6); + + if (shift_dst==8) + dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17); + else dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm); + dst += stride; + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm); + dst += stride; + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm); + dst += stride; + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm); + dst += stride; + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va4,dstperm); + dst += stride; + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va5,dstperm); + dst += stride; + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va6,dstperm); + dst += stride; + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va7,dstperm); + +} + +/* + +void h264_idct4_add_spu(uint8_t *dst, short *block, int stride){ + int i; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + + block[0] += 32; + + for(i=0; i<4; i++){ + const int z0= block[0 + 4*i] + block[2 + 4*i]; + const int z1= block[0 + 4*i] - block[2 + 4*i]; + const int z2= (block[1 + 4*i]>>1) - 
block[3 + 4*i]; + const int z3= block[1 + 4*i] + (block[3 + 4*i]>>1); + + block[0 + 4*i]= z0 + z3; + block[1 + 4*i]= z1 + z2; + block[2 + 4*i]= z1 - z2; + block[3 + 4*i]= z0 - z3; + } + + for(i=0; i<4; i++){ + const int z0= block[i + 4*0] + block[i + 4*2]; + const int z1= block[i + 4*0] - block[i + 4*2]; + const int z2= (block[i + 4*1]>>1) - block[i + 4*3]; + const int z3= block[i + 4*1] + (block[i + 4*3]>>1); + + dst[i + 0*stride]= cm[ dst[i + 0*stride] + ((z0 + z3) >> 6) ]; + dst[i + 1*stride]= cm[ dst[i + 1*stride] + ((z1 + z2) >> 6) ]; + dst[i + 2*stride]= cm[ dst[i + 2*stride] + ((z1 - z2) >> 6) ]; + dst[i + 3*stride]= cm[ dst[i + 3*stride] + ((z0 - z3) >> 6) ]; + } +} + +void h264_idct8_add_spu(uint8_t *dst, short *block, int stride){ + int i; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + + block[0] += 32; + + for( i = 0; i < 8; i++ ) + { + const int a0 = block[0+i*8] + block[4+i*8]; + const int a2 = block[0+i*8] - block[4+i*8]; + const int a4 = (block[2+i*8]>>1) - block[6+i*8]; + const int a6 = (block[6+i*8]>>1) + block[2+i*8]; + + const int b0 = a0 + a6; + const int b2 = a2 + a4; + const int b4 = a2 - a4; + const int b6 = a0 - a6; + + const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1); + const int a3 = block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1); + const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1); + const int a7 = block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1); + + const int b1 = (a7>>2) + a1; + const int b3 = a3 + (a5>>2); + const int b5 = (a3>>2) - a5; + const int b7 = a7 - (a1>>2); + + block[0+i*8] = b0 + b7; + block[7+i*8] = b0 - b7; + block[1+i*8] = b2 + b5; + block[6+i*8] = b2 - b5; + block[2+i*8] = b4 + b3; + block[5+i*8] = b4 - b3; + block[3+i*8] = b6 + b1; + block[4+i*8] = b6 - b1; + } + for( i = 0; i < 8; i++ ) + { + const int a0 = block[i+0*8] + block[i+4*8]; + const int a2 = block[i+0*8] - block[i+4*8]; + const int a4 = (block[i+2*8]>>1) - block[i+6*8]; + 
const int a6 = (block[i+6*8]>>1) + block[i+2*8]; + + const int b0 = a0 + a6; + const int b2 = a2 + a4; + const int b4 = a2 - a4; + const int b6 = a0 - a6; + + const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1); + const int a3 = block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1); + const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1); + const int a7 = block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1); + + const int b1 = (a7>>2) + a1; + const int b3 = a3 + (a5>>2); + const int b5 = (a3>>2) - a5; + const int b7 = a7 - (a1>>2); + + dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ]; + dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ]; + dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ]; + dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ]; + dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b6 - b1) >> 6) ]; + dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ]; + dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ]; + dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ]; + } +}*/ + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_idct_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_idct_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,141 @@ +#ifndef H264_IDCT_SPU_H +#define H264_IDCT_SPU_H + +void h264_idct4_add_spu(uint8_t *dst, short *block, int stride); +void h264_idct8_add_spu(uint8_t *dst, short *block, int stride); + +/*********************************************************************** + * VEC_1D_IDCT + *********************************************************************** + * 1-dimensional 4x4 H264 integer DCT inverse transform. + * Actually source and destination are 8x4. The low elements of the + * source are discarded and the low elements of the destination mustn't + * be used. 
+ * __vz0-__vz3 registers need to be declared in the caller function
+ ***********************************************************************/
+#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \
+    /* 1st stage */ \
+    __vz0 = spu_add(vb0,vb2);    /* temp[0] = Y[0] + Y[2] */ \
+    __vz1 = spu_sub(vb0,vb2);    /* temp[1] = Y[0] - Y[2] */ \
+    __vz2 = spu_rlmaska(vb1,-1); \
+    __vz2 = spu_sub(__vz2,vb3);  /* temp[2] = (Y[1]>>1) - Y[3] */ \
+    __vz3 = spu_rlmaska(vb3,-1); \
+    __vz3 = spu_add(vb1,__vz3);  /* temp[3] = Y[1] + (Y[3]>>1) */ \
+    \
+    /* 2nd stage: output */ \
+    va0 = spu_add(__vz0,__vz3);  /* x[0] = temp[0] + temp[3] */ \
+    va1 = spu_add(__vz1,__vz2);  /* x[1] = temp[1] + temp[2] */ \
+    va2 = spu_sub(__vz1,__vz2);  /* x[2] = temp[1] - temp[2] */ \
+    va3 = spu_sub(__vz0,__vz3)   /* x[3] = temp[0] - temp[3] */
+
+/***********************************************************************
+ * VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8
+ ***********************************************************************
+ * Loads a vuint8_t vector from the unaligned memory position p, shifts it by `shift` bytes and converts it to vsint16_t.
+ * Adds the converted vector to va, then limits va to the range [0, vmax] (non-positive lanes -> 0, lanes > vmax -> vmax).
+ * Converts the result back to vuint8_t, merges it with the originally loaded bytes via align_dst and stores it back to p.
+ * The caller must declare vdst_orig, vdst, vdst_ss, sat, va_u8, vfdst and provide vzero, vmax, mergehu8, packu16.
+ **********************************************************************/
+
+#define VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(p,shift,va,align_dst) \
+    vdst_orig = *(vuint8_t *) (p); \
+    vdst = spu_or(spu_slqwbyte(vdst_orig, shift),(vuint8_t) vzero); \
+    vdst_ss = (vsint16_t) spu_shuffle((vuint8_t)vzero,vdst,mergehu8); \
+    va = spu_add(va,vdst_ss); \
+    sat = spu_cmpgt(va,(vsint16_t)vzero); \
+    va = spu_and(va,(vsint16_t)sat); \
+    sat = spu_cmpgt(va,vmax); \
+    va = spu_sel(va,vmax,sat); \
+    va_u8 = (vuint8_t) spu_shuffle(va,(vsint16_t) vzero,packu16); \
+    vfdst = spu_shuffle(vdst_orig, va_u8, align_dst); \
+    *(vuint8_t *) (p) = vfdst /* store to the macro argument, not to a caller variable that happens to be named dst */
+
+/***********************************************************************
+ * VEC_TRANSPOSE_8
+
*********************************************************************** + * Transposes a 8x8 matrix of s16 vectors + **********************************************************************/ +#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \ + b0 = spu_shuffle( a0, a4, m1 ); \ + b1 = spu_shuffle( a1, a5, m1 ); \ + b2 = spu_shuffle( a2, a6, m1 ); \ + b3 = spu_shuffle( a3, a7, m1 ); \ + b4 = spu_shuffle( a4, a0, m2 ); \ + b5 = spu_shuffle( a5, a1, m2 ); \ + b6 = spu_shuffle( a6, a2, m2 ); \ + b7 = spu_shuffle( a7, a3, m2 ); \ + a0 = spu_shuffle( b0, b2, m3 ); \ + a1 = spu_shuffle( b1, b3, m3 ); \ + a2 = spu_shuffle( b2, b0, m4 ); \ + a3 = spu_shuffle( b3, b1, m4 ); \ + a4 = spu_shuffle( b4, b6, m3 ); \ + a5 = spu_shuffle( b5, b7, m3 ); \ + a6 = spu_shuffle( b6, b4, m4 ); \ + a7 = spu_shuffle( b7, b5, m4 ); \ + b0 = spu_shuffle( a0, a1, m5 ); \ + b1 = spu_shuffle( a1, a0, m6 ); \ + b2 = spu_shuffle( a2, a3, m5 ); \ + b3 = spu_shuffle( a3, a2, m6 ); \ + b4 = spu_shuffle( a4, a5, m5 ); \ + b5 = spu_shuffle( a5, a4, m6 ); \ + b6 = spu_shuffle( a6, a7, m5 ); \ + b7 = spu_shuffle( a7, a6, m6 ) + +/*********************************************************************** + * VEC_1D_IDCT8 + *********************************************************************** + * 1-dimensional 8x8 H264 integer DCT inverse transform. 
+ ***********************************************************************/ +#define VEC_1D_DCT8(vb0,vb1,vb2,vb3,vb4,vb5,vb6,vb7) \ + vza0 = spu_add(vb0,vb4); /* a[0] = Y[0] + Y[4] */ \ + vza2 = spu_sub(vb0,vb4); /* a[2] = Y[0] - Y[4] */ \ + vza4 = spu_rlmaska(vb2,-1); \ + vza4 = spu_sub(vza4,vb6); /* a[4] = Y[2]>>1 - Y[6] */ \ + vza6 = spu_rlmaska(vb6,-1 ); \ + vza6 = spu_add(vb2,vza6); /* a[6] = Y[2] + Y[6]>>1 */ \ + \ + vzb0 = spu_add(vza0,vza6); /* b[0] = a[0] + a[6] */ \ + vzb2 = spu_add(vza2,vza4); /* b[2] = a[2] + a[4] */ \ + vzb4 = spu_sub(vza2,vza4); /* b[4] = a[2] - a[4] */ \ + vzb6 = spu_sub(vza0,vza6); /* b[6] = a[0] - a[6] */ \ + \ + vza1 = spu_rlmaska(vb7,-1); \ + vzal = spu_add(vza1,vb7); \ + vzah = spu_sub(vb5,vb3); \ + vza1 = spu_sub(vzah,vzal); /* a1 = (-Y[3] + Y[5]) - (Y[7] + (Y[7]>>1)) */ \ + \ + vza3 = spu_rlmaska(vb3,-1); \ + vzal = spu_add(vza3,vb3); \ + vzah = spu_add(vb1,vb7); \ + vza3 = spu_sub(vzah,vzal); /* a3 = (Y[1] + Y[7]) - (Y[3] + (Y[3]>>1)) */ \ + \ + vza5 = spu_rlmaska(vb5,-1); \ + vzal = spu_add(vza5,vb5); \ + vzah = spu_sub(vb7,vb1); \ + vza5 = spu_add(vzah,vzal); /* a5 = (-Y[1] + Y[7]) + (Y[5] + Y[5]>>1)) */ \ + \ + vza7 = spu_rlmaska(vb1,-1); \ + vzal = spu_add(vza7,vb1); \ + vzah = spu_add(vb3,vb5); \ + vza7 = spu_add(vzah,vzal); /* a7 = (Y[3] + Y[5]) + (Y[1] + (Y[1]>>1)) */ \ + \ + vzb1 = spu_rlmaska(vza7,-2); \ + vzb1 = spu_add(vzb1,vza1); /* b1 = (a7>>2) + a1 */ \ + vzb3 = spu_rlmaska(vza5,-2); \ + vzb3 = spu_add(vzb3,vza3); /* b3 = a3 + (a5>>2) */ \ + vzb5 = spu_rlmaska(vza3,-2); \ + vzb5 = spu_sub(vzb5,vza5); /* b5 = (a3>>2) - a5 */ \ + vzb7 = spu_rlmaska(vza1,-2); \ + vzb7 = spu_sub(vza7,vzb7); /* b7 = a7 - (a1>>2) */ \ + \ + vb0 = spu_add(vzb0,vzb7); /* src[i][0] = b0 + b7 */ \ + vb7 = spu_sub(vzb0,vzb7); /* src[i][7] = b0 - b7 */ \ + vb1 = spu_add(vzb2,vzb5); /* src[i][1] = b2 + b5 */ \ + vb6 = spu_sub(vzb2,vzb5); /* src[i][6] = b2 - b5 */ \ + vb2 = spu_add(vzb4,vzb3); /* src[i][2] = b4 + b3 */ \ + vb5 = 
spu_sub(vzb4,vzb3); /* src[i][5] = b4 - b3 */ \
+    vb3 = spu_add(vzb6,vzb1); /* src[i][3] = b6 + b1 */ \
+    vb4 = spu_sub(vzb6,vzb1); /* src[i][4] = b6 - b1 */
+
+
+#endif /*H264_IDCT_SPU_H*/
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_intra_spu.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/cell/h264_intra_spu.c	Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,802 @@
+#include "types_spu.h"
+#include "h264_tables.h"
+#include "h264_intra_spu.h"
+#include /* NOTE(review): the header name is missing on this line (presumably a <...> name lost in extraction) - restore before building */
+/* 4x4 vertical prediction: copies the 4 bytes directly above src into each of the 4 rows. topright is unused. */
+void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
+    (void) topright;
+    const uint32_t a= ((uint32_t*)(src-stride))[0];
+    ((uint32_t*)(src+0*stride))[0]= a;
+    ((uint32_t*)(src+1*stride))[0]= a;
+    ((uint32_t*)(src+2*stride))[0]= a;
+    ((uint32_t*)(src+3*stride))[0]= a;
+}
+/* 4x4 horizontal prediction: fills each row with the byte immediately to its left. The splat constant is unsigned so a pixel >= 128 does not overflow signed int. */
+void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
+    (void) topright;
+    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101U;
+    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101U;
+    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101U;
+    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101U;
+}
+/* 4x4 DC prediction: fills the block with the rounded mean of the 4 pixels above and the 4 pixels to the left. */
+void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
+    (void) topright;
+    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
+                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
+    ((uint32_t*)(src+0*stride))[0]=
+    ((uint32_t*)(src+1*stride))[0]=
+    ((uint32_t*)(src+2*stride))[0]=
+    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U; /* unsigned multiply: dc >= 128 would overflow a signed int */
+}
+/* 4x4 DC prediction from the left column only: fills the block with the rounded mean of the 4 left pixels. */
+void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
+    (void) topright;
+    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
+
+    ((uint32_t*)(src+0*stride))[0]=
+    ((uint32_t*)(src+1*stride))[0]=
+    ((uint32_t*)(src+2*stride))[0]=
+    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U; /* unsigned multiply: dc >= 128 would overflow a signed int */
+}
+/* 4x4 DC prediction from the top row only: fills the block with the rounded mean of the 4 pixels above. */
+void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
+    (void) topright;
+    const int dc= (
src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
+
+    ((uint32_t*)(src+0*stride))[0]=
+    ((uint32_t*)(src+1*stride))[0]=
+    ((uint32_t*)(src+2*stride))[0]=
+    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U; /* unsigned multiply: dc >= 128 would overflow a signed int */
+}
+/* 4x4 prediction with no usable neighbours: fills the block with the constant 128. topright is unused. */
+void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
+    (void) topright;
+    ((uint32_t*)(src+0*stride))[0]=
+    ((uint32_t*)(src+1*stride))[0]=
+    ((uint32_t*)(src+2*stride))[0]=
+    ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U;
+}
+
+/* LOAD_TOP_RIGHT_EDGE declares t4..t7 from topright[0..3]; the expanding function must have `topright` in scope. */
+#define LOAD_TOP_RIGHT_EDGE\
+    const int t4= topright[0];\
+    const int t5= topright[1];\
+    const int t6= topright[2];\
+    const int t7= topright[3];
+/* LOAD_LEFT_EDGE declares l0..l3 as the pixels immediately left of rows 0..3; needs `src` and `stride` in scope. */
+#define LOAD_LEFT_EDGE\
+    const int l0= src[-1+0*stride];\
+    const int l1= src[-1+1*stride];\
+    const int l2= src[-1+2*stride];\
+    const int l3= src[-1+3*stride];
+/* LOAD_TOP_EDGE declares t0..t3 as the pixels immediately above columns 0..3; needs `src` and `stride` in scope. */
+#define LOAD_TOP_EDGE\
+    const int t0= src[ 0-1*stride];\
+    const int t1= src[ 1-1*stride];\
+    const int t2= src[ 2-1*stride];\
+    const int t3= src[ 3-1*stride];
+/* 4x4 diagonal down-right prediction: every down-right diagonal gets one (a + 2*b + c + 2) >> 2 value built from the left, top-left and top neighbours. */
+void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
+    (void) topright;
+    const int lt= src[-1-1*stride];
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+
+    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
+    src[0+2*stride]=
+    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
+    src[0+1*stride]=
+    src[1+2*stride]=
+    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
+    src[0+0*stride]=
+    src[1+1*stride]=
+    src[2+2*stride]=
+    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
+    src[1+0*stride]=
+    src[2+1*stride]=
+    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[2+0*stride]=
+    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+}
+/* 4x4 diagonal down-left prediction: built from the 4 pixels above (t0..t3) and the 4 pixels read from topright (t4..t7). */
+void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+//    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
+    src[1+0*stride]=
+    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
+    src[2+0*stride]=
+    src[1+1*stride]=
+    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
+    src[3+0*stride]=
+    src[2+1*stride]=
+    src[1+2*stride]=
+    src[0+3*stride]=(t3 +
t5 + 2*t4 + 2)>>2;
+    src[3+1*stride]=
+    src[2+2*stride]=
+    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
+    src[3+2*stride]=
+    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
+    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
+}
+/* 4x4 vertical-right prediction from the top-left (lt), top (t0..t3) and left (l0..l2) neighbours; topright is unused. */
+void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
+    (void) topright;
+    const int lt= src[-1-1*stride];
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+    (void) l3; /* l3 is declared by LOAD_LEFT_EDGE but not used by this mode */
+
+    src[0+0*stride]=
+    src[1+2*stride]=(lt + t0 + 1)>>1;
+    src[1+0*stride]=
+    src[2+2*stride]=(t0 + t1 + 1)>>1;
+    src[2+0*stride]=
+    src[3+2*stride]=(t1 + t2 + 1)>>1;
+    src[3+0*stride]=(t2 + t3 + 1)>>1;
+    src[0+1*stride]=
+    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
+    src[1+1*stride]=
+    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[2+1*stride]=
+    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
+    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+}
+/* 4x4 vertical-left prediction from the pixels above (t0..t3) and those read from topright (t4..t6). */
+void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
+    LOAD_TOP_EDGE
+    LOAD_TOP_RIGHT_EDGE
+    (void) t7; /* t7 is declared by LOAD_TOP_RIGHT_EDGE but not used by this mode */
+
+    src[0+0*stride]=(t0 + t1 + 1)>>1;
+    src[1+0*stride]=
+    src[0+2*stride]=(t1 + t2 + 1)>>1;
+    src[2+0*stride]=
+    src[1+2*stride]=(t2 + t3 + 1)>>1;
+    src[3+0*stride]=
+    src[2+2*stride]=(t3 + t4+ 1)>>1;
+    src[3+2*stride]=(t4 + t5+ 1)>>1;
+    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[1+1*stride]=
+    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
+    src[2+1*stride]=
+    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
+    src[3+1*stride]=
+    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
+    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
+}
+/* 4x4 horizontal-up prediction from the left column only (l0..l3); topright is unused. */
+void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
+    (void) topright;
+    LOAD_LEFT_EDGE
+
+    src[0+0*stride]=(l0 + l1 + 1)>>1;
+    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+    src[2+0*stride]=
+    src[0+1*stride]=(l1 + l2 + 1)>>1;
+    src[3+0*stride]=
+    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
+    src[2+1*stride]=
+    src[0+2*stride]=(l2 + l3 + 1)>>1;
+    src[3+1*stride]=
+    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
+    src[3+2*stride]=
+
src[1+3*stride]=
+    src[0+3*stride]=
+    src[2+2*stride]=
+    src[2+3*stride]=
+    src[3+3*stride]=l3;
+}
+/* 4x4 horizontal-down prediction from the top-left (lt), top (t0..t2) and left (l0..l3) neighbours; topright is unused. */
+void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
+    (void) topright;
+    const int lt= src[-1-1*stride];
+    LOAD_TOP_EDGE
+    LOAD_LEFT_EDGE
+    (void) t3; /* t3 is declared by LOAD_TOP_EDGE but not used by this mode */
+
+    src[0+0*stride]=
+    src[2+1*stride]=(lt + l0 + 1)>>1;
+    src[1+0*stride]=
+    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
+    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
+    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
+    src[0+1*stride]=
+    src[2+2*stride]=(l0 + l1 + 1)>>1;
+    src[1+1*stride]=
+    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
+    src[0+2*stride]=
+    src[2+3*stride]=(l1 + l2+ 1)>>1;
+    src[1+2*stride]=
+    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
+    src[0+3*stride]=(l2 + l3 + 1)>>1;
+    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
+}
+/* 16x16 vertical prediction: loads one vuint32_t from the row above src and stores it to 16 rows (4 iterations x 4 rows). */
+void ff_pred16x16_vertical_c(uint8_t *src, int stride){
+    int i;
+    const vuint32_t v= *((vuint32_t*)(src-stride)); /* assumes src-stride is suitably aligned for a vector load - TODO confirm */
+    for(i=0; i<4; i++){
+        *((vuint32_t*) src            ) =v;
+        *((vuint32_t*)(src +   stride)) =v;
+        *((vuint32_t*)(src + 2*stride)) =v;
+        *((vuint32_t*)(src + 3*stride)) =v;
+        src+= 4*stride;
+    }
+    /* disabled scalar variant of the same copy, kept from the original source: */
+    /*const uint32_t a= ((uint32_t*)(src-stride))[0];
+    const uint32_t b= ((uint32_t*)(src-stride))[1];
+    const uint32_t c= ((uint32_t*)(src-stride))[2];
+    const uint32_t d= ((uint32_t*)(src-stride))[3];
+
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]= a;
+        ((uint32_t*)(src+i*stride))[1]= b;
+        ((uint32_t*)(src+i*stride))[2]= c;
+        ((uint32_t*)(src+i*stride))[3]= d;
+    }*/
+}
+/* 16x16 horizontal prediction: fills each of the 16 rows with the byte immediately to its left. */
+void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
+    int i;
+
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101U; /* unsigned: a pixel >= 128 would overflow a signed int */
+    }
+}
+/* 16x16 DC prediction: fills the block with the rounded mean of the 16 left and the 16 top neighbours. */
+void ff_pred16x16_dc_c(uint8_t *src, int stride){
+    int i;
+    int dc=0;
+    for(i=0;i<16; i++){
+        dc+= src[-1+i*stride];
+    }
+
+    for(i=0;i<16; i++){
+        dc+= src[i-stride];
+    }
+    dc= 0x01010101U*((dc + 16)>>5); /* unsigned multiply: a mean >= 128 would overflow a signed int */
+
+    for(i=0; i<16; i++){
+
((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= dc;
+    }
+}
+/* 16x16 DC prediction from the left column only: fills the block with the rounded mean of the 16 left neighbours. */
+void ff_pred16x16_left_dc_c(uint8_t *src, int stride){
+    int i;
+
+    int dc=0;
+    for(i=0;i<16; i++){
+        dc+= src[-1+i*stride];
+    }
+    dc= 0x01010101U*((dc + 8)>>4); /* unsigned multiply: a mean >= 128 would overflow a signed int */
+
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= dc;
+    }
+}
+/* 16x16 DC prediction from the top row only: fills the block with the rounded mean of the 16 top neighbours. */
+void ff_pred16x16_top_dc_c(uint8_t *src, int stride){
+    int i;
+    int dc0=0;
+    for(i=0;i<16; i++){
+        dc0+= src[i-stride];
+    }
+
+    dc0= 0x01010101U*((dc0 + 8)>>4); /* unsigned multiply: a mean >= 128 would overflow a signed int */
+
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= dc0;
+    }
+}
+/* 16x16 prediction with no usable neighbours: fills the block with the constant 128. */
+void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
+    int i;
+    /* disabled vector variant, kept from the original source: */
+    /*const vuint32_t v= AVV(0x01010101U*128U, 0x01010101U*128U,0x01010101U*128U,0x01010101U*128U);
+    for(i=0; i<4; i++){
+        *((vuint32_t*) src            ) =v;
+        *((vuint32_t*)(src +   stride)) =v;
+        *((vuint32_t*)(src + 2*stride)) =v;
+        *((vuint32_t*)(src + 3*stride)) =v;
+        src+= 4*stride;
+    }*/
+
+    for(i=0; i<16; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]=
+        ((uint32_t*)(src+i*stride))[2]=
+        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
+    }
+}
+/* 16x16 plane prediction: derives gradients H (from the row above) and V (from the left column); a non-zero svq3 selects a different scaling and swaps H and V. */
+void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
+    int i, j, k;
+    int a;
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* lookup table applied to every output value; presumably clamps to 0..255 - confirm against ff_cropTbl */
+    const uint8_t * const src0 = src+7-stride;
+    const uint8_t *src1 = src+8*stride-1;
+    const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
+    int H = src0[1] - src0[-1];
+    int V = src1[0] - src2[ 0];
+    for(k=2; k<=8; ++k) {
+        src1 += stride; src2 -= stride;
+        H += k*(src0[k] - src0[-k]);
+        V += k*(src1[0] - src2[ 0]);
+    }
+    if(svq3){
+        H = ( 5*(H/4) ) / 16;
+        V = ( 5*(V/4) ) / 16;
+
+        /* required for 100% accuracy */
+        i = H; H = V; V = i;
+    }else{
+        H = ( 5*H+32 ) >> 6;
+        V = ( 5*V+32 ) >>
6;
+    }
+
+    a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
+    for(j=16; j>0; --j) {
+        int b = a;
+        a += V;
+        for(i=-16; i<0; i+=4) {
+            src[16+i] = cm[ (b    ) >> 5 ];
+            src[17+i] = cm[ (b+  H) >> 5 ];
+            src[18+i] = cm[ (b+2*H) >> 5 ];
+            src[19+i] = cm[ (b+3*H) >> 5 ];
+            b += 4*H;
+        }
+        src += stride;
+    }
+}
+/* 16x16 plane prediction entry point: calls pred16x16_plane_compat_c with svq3 = 0. */
+void ff_pred16x16_plane_c(uint8_t *src, int stride){
+    pred16x16_plane_compat_c(src, stride, 0);
+}
+/* 8x8 vertical prediction: copies the 8 bytes directly above src into each of the 8 rows. */
+void ff_pred8x8_vertical_c(uint8_t *src, int stride){
+    int i;
+    const uint32_t a= ((uint32_t*)(src-stride))[0];
+    const uint32_t b= ((uint32_t*)(src-stride))[1];
+
+    for(i=0; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]= a;
+        ((uint32_t*)(src+i*stride))[1]= b;
+    }
+}
+/* 8x8 horizontal prediction: fills each of the 8 rows with the byte immediately to its left. */
+void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
+    int i;
+
+    for(i=0; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101U; /* unsigned: a pixel >= 128 would overflow a signed int */
+    }
+}
+/* 8x8 prediction with no usable neighbours: fills the block with the constant 128. */
+void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
+    int i;
+
+    for(i=0; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
+    }
+}
+/* 8x8 DC prediction from the left column: rows 0-3 get the rounded mean of left pixels 0-3, rows 4-7 that of left pixels 4-7. */
+void ff_pred8x8_left_dc_c(uint8_t *src, int stride){
+    int i;
+    int dc0, dc2;
+
+    dc0=dc2=0;
+    for(i=0;i<4; i++){
+        dc0+= src[-1+i*stride];
+        dc2+= src[-1+(i+4)*stride];
+    }
+    dc0= 0x01010101U*((dc0 + 2)>>2); /* unsigned multiply: a mean >= 128 would overflow a signed int */
+    dc2= 0x01010101U*((dc2 + 2)>>2);
+
+    for(i=0; i<4; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= dc0;
+    }
+    for(i=4; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]=
+        ((uint32_t*)(src+i*stride))[1]= dc2;
+    }
+}
+/* 8x8 DC prediction from the top row: columns 0-3 get the rounded mean of top pixels 0-3, columns 4-7 that of top pixels 4-7. */
+void ff_pred8x8_top_dc_c(uint8_t *src, int stride){
+    int i;
+    int dc0, dc1;
+
+    dc0=dc1=0;
+    for(i=0;i<4; i++){
+        dc0+= src[i-stride];
+        dc1+= src[4+i-stride];
+    }
+    dc0= 0x01010101U*((dc0 + 2)>>2); /* unsigned multiply: a mean >= 128 would overflow a signed int */
+    dc1= 0x01010101U*((dc1 + 2)>>2);
+
+    for(i=0; i<4; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc0;
+        ((uint32_t*)(src+i*stride))[1]= dc1;
+    }
+    for(i=4; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc0;
+        ((uint32_t*)(src+i*stride))[1]= dc1;
+    }
+}
+
+/* 8x8 DC prediction: each 4x4 quadrant is filled with a rounded mean of the top and/or left neighbours adjacent to it. */
+void ff_pred8x8_dc_c(uint8_t *src, int stride){
+    int i;
+    int dc0, dc1, dc2, dc3;
+
+    dc0=dc1=dc2=0;
+    for(i=0;i<4; i++){
+        dc0+= src[-1+i*stride] + src[i-stride];
+        dc1+= src[4+i-stride];
+        dc2+= src[-1+(i+4)*stride];
+    }
+    dc3= 0x01010101U*((dc1 + dc2 + 4)>>3); /* must come first: uses the raw dc1/dc2 sums before they are overwritten below */
+    dc0= 0x01010101U*((dc0 + 4)>>3);       /* unsigned multiplies: a mean >= 128 would overflow a signed int */
+    dc1= 0x01010101U*((dc1 + 2)>>2);
+    dc2= 0x01010101U*((dc2 + 2)>>2);
+
+    for(i=0; i<4; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc0;
+        ((uint32_t*)(src+i*stride))[1]= dc1;
+    }
+    for(i=4; i<8; i++){
+        ((uint32_t*)(src+i*stride))[0]= dc2;
+        ((uint32_t*)(src+i*stride))[1]= dc3;
+    }
+}
+/* 8x8 plane prediction: derives gradients H (from the row above) and V (from the left column) and writes cm[(a + x*H + y*V) >> 5] for each pixel. */
+void ff_pred8x8_plane_c(uint8_t *src, int stride){
+    int j, k;
+    int a;
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* lookup table applied to every output value; presumably clamps to 0..255 - confirm against ff_cropTbl */
+    const uint8_t * const src0 = src+3-stride;
+    const uint8_t *src1 = src+4*stride-1;
+    const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
+    int H = src0[1] - src0[-1];
+    int V = src1[0] - src2[ 0];
+    for(k=2; k<=4; ++k) {
+        src1 += stride; src2 -= stride;
+        H += k*(src0[k] - src0[-k]);
+        V += k*(src1[0] - src2[ 0]);
+    }
+    H = ( 17*H+16 ) >> 5;
+    V = ( 17*V+16 ) >> 5;
+
+    a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
+    for(j=8; j>0; --j) {
+        int b = a;
+        a += V;
+        src[0] = cm[ (b    ) >> 5 ];
+        src[1] = cm[ (b+  H) >> 5 ];
+        src[2] = cm[ (b+2*H) >> 5 ];
+        src[3] = cm[ (b+3*H) >> 5 ];
+        src[4] = cm[ (b+4*H) >> 5 ];
+        src[5] = cm[ (b+5*H) >> 5 ];
+        src[6] = cm[ (b+6*H) >> 5 ];
+        src[7] = cm[ (b+7*H) >> 5 ];
+        src += stride;
+    }
+}
+
+/* Helpers for the pred8x8l_* functions: SRC(x,y) addresses a pixel relative to src; PL/PT declare (a + 2*b + c + 2) >> 2 filtered left/top neighbours l<y> / t<x>. */
+#define SRC(x,y) src[(x)+(y)*stride]
+#define PL(y) \
+    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_LEFT \
+    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
+                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
+    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
+    const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
+
+#define PT(x) \
+    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_TOP \
+    const int t0 = ((has_topleft ?
SRC(-1,-1) : SRC(0,-1)) \
+                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
+    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
+    const int t7 = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
+                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
+/* PTR assigns (does not declare) a filtered top-right neighbour; PREDICT_8x8_LOAD_TOPRIGHT declares t8..t15 and falls back to SRC(7,-1) when has_topright is 0. */
+#define PTR(x) \
+    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+#define PREDICT_8x8_LOAD_TOPRIGHT \
+    int t8, t9, t10, t11, t12, t13, t14, t15; \
+    if(has_topright) { \
+        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
+        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
+    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
+/* PREDICT_8x8_LOAD_TOPLEFT declares lt, the filtered top-left corner neighbour. */
+#define PREDICT_8x8_LOAD_TOPLEFT \
+    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
+/* PREDICT_8x8_DC(v) writes the 32-bit value v to both words of each of 8 rows; it declares y and advances src. */
+#define PREDICT_8x8_DC(v) \
+    int y; \
+    for( y = 0; y < 8; y++ ) { \
+        ((uint32_t*)src)[0] = \
+        ((uint32_t*)src)[1] = v; \
+        src += stride; \
+    }
+/* 8x8 luma prediction with no usable neighbours: fills the block with the byte 0x80. */
+static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{
+    (void) has_topright;
+    (void) has_topleft;
+    PREDICT_8x8_DC(0x80808080);
+}
+static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{   /* fills the block with the rounded mean of the 8 filtered left neighbours */
+    (void) has_topright;
+    PREDICT_8x8_LOAD_LEFT;
+    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101U; /* unsigned multiply: a mean >= 128 would overflow a signed int */
+    PREDICT_8x8_DC(dc);
+}
+static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{   /* fills the block with the rounded mean of the 8 filtered top neighbours */
+    PREDICT_8x8_LOAD_TOP;
+    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101U; /* unsigned multiply: a mean >= 128 would overflow a signed int */
+    PREDICT_8x8_DC(dc);
+}
+static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{   /* fills the block with the rounded mean of the 8 filtered left and 8 filtered top neighbours */
+    PREDICT_8x8_LOAD_LEFT;
+    PREDICT_8x8_LOAD_TOP;
+    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
+                        +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101U; /* unsigned multiply: a mean >= 128 would overflow a signed int */
+    PREDICT_8x8_DC(dc);
+}
+static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
+{   /* fills row y with the filtered left neighbour l<y>; the splat constant is unsigned to avoid signed overflow for values >= 128 */
+    (void) has_topright;
+    PREDICT_8x8_LOAD_LEFT;
+#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
+               ((uint32_t*)(src+y*stride))[1] = 0x01010101U * l##y
+    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5);
ROW(6); ROW(7); +#undef ROW +} +static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride) +{ + int y; + PREDICT_8x8_LOAD_TOP; + src[0] = t0; + src[1] = t1; + src[2] = t2; + src[3] = t3; + src[4] = t4; + src[5] = t5; + src[6] = t6; + src[7] = t7; + for( y = 1; y < 8; y++ ) + *(uint64_t*)(src+y*stride) = *(uint64_t*)src; +} +static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) +{ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_TOPRIGHT; + SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2; + SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2; + SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2; + SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2; + SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2; + SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2; + SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2; + SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; + SRC(7,7)= (t14 + 3*t15 + 2) >> 2; +} +static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) +{ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; + SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2; + SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2; + SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2; + SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + 
l1 + 2) >> 2; + SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2; + SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2; + SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2; + SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; + +} +static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) +{ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + (void) l7; + SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; + SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; + SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; + SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2; + SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2; + SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2; + SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; + SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2; + SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1; + SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1; + SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1; + SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1; + SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1; + SRC(6,1)=SRC(7,3)= (t4 + 
2*t5 + t6 + 2) >> 2; + SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1; + SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(7,0)= (t6 + t7 + 1) >> 1; +} +static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride) +{ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + (void) t7; + SRC(0,7)= (l6 + l7 + 1) >> 1; + SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; + SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; + SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2; + SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1; + SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2; + SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1; + SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2; + SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1; + SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2; + SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1; + SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2; + SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1; + SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2; + SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1; + SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; + SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2; + SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2; + SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2; + SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; + SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; +} +static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) +{ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_TOPRIGHT; + SRC(0,0)= (t0 + t1 + 1) >> 1; + SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; + SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1; + SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1; + 
SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1; + SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1; + SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1; + SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2; + SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1; + SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2; + SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1; + SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2; + SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1; + SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2; + SRC(7,6)= (t10 + t11 + 1) >> 1; + SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; +} +static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride) +{ + (void) has_topright; + PREDICT_8x8_LOAD_LEFT; + SRC(0,0)= (l0 + l1 + 1) >> 1; + SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; + SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; + SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2; + SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1; + SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2; + SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1; + SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2; + SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1; + SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2; + SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1; + SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2; + SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1; + SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; + SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= + SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)= + SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)= + SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; +} +#undef PREDICT_8x8_LOAD_LEFT +#undef 
PREDICT_8x8_LOAD_TOP +#undef PREDICT_8x8_LOAD_TOPLEFT +#undef PREDICT_8x8_LOAD_TOPRIGHT +#undef PREDICT_8x8_DC +#undef PTR +#undef PT +#undef PL +#undef SRC + +void init_pred_ptrs(H264PredContext_spu *i){ /* Fills every slot of the four predictor tables in *i with the C implementations: 12 pred4x4 and 12 pred8x8l entries indexed by the *_PRED constants, 7 pred8x8 and 7 pred16x16 entries indexed by the *_PRED8x8 constants. i is dereferenced without a NULL check. */ + + i->pred4x4[VERT_PRED ]= pred4x4_vertical_c; + i->pred4x4[HOR_PRED ]= pred4x4_horizontal_c; + i->pred4x4[DC_PRED ]= pred4x4_dc_c; + i->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c; + i->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c; + i->pred4x4[VERT_RIGHT_PRED ]= pred4x4_vertical_right_c; + i->pred4x4[HOR_DOWN_PRED ]= pred4x4_horizontal_down_c; + i->pred4x4[VERT_LEFT_PRED ]= pred4x4_vertical_left_c; + i->pred4x4[HOR_UP_PRED ]= pred4x4_horizontal_up_c; + i->pred4x4[LEFT_DC_PRED ]= pred4x4_left_dc_c; + i->pred4x4[TOP_DC_PRED ]= pred4x4_top_dc_c; + i->pred4x4[DC_128_PRED ]= pred4x4_128_dc_c; + /* pred8x8l: same twelve mode indices as pred4x4 */ + i->pred8x8l[VERT_PRED ]= pred8x8l_vertical_c; + i->pred8x8l[HOR_PRED ]= pred8x8l_horizontal_c; + i->pred8x8l[DC_PRED ]= pred8x8l_dc_c; + i->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c; + i->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c; + i->pred8x8l[VERT_RIGHT_PRED ]= pred8x8l_vertical_right_c; + i->pred8x8l[HOR_DOWN_PRED ]= pred8x8l_horizontal_down_c; + i->pred8x8l[VERT_LEFT_PRED ]= pred8x8l_vertical_left_c; + i->pred8x8l[HOR_UP_PRED ]= pred8x8l_horizontal_up_c; + i->pred8x8l[LEFT_DC_PRED ]= pred8x8l_left_dc_c; + i->pred8x8l[TOP_DC_PRED ]= pred8x8l_top_dc_c; + i->pred8x8l[DC_128_PRED ]= pred8x8l_128_dc_c; + + /* pred8x8 and pred16x16 share the *_PRED8x8 index constants */ + i->pred8x8[VERT_PRED8x8 ]= ff_pred8x8_vertical_c; + i->pred8x8[HOR_PRED8x8 ]= ff_pred8x8_horizontal_c; + i->pred8x8[PLANE_PRED8x8 ]= ff_pred8x8_plane_c; + i->pred8x8[DC_PRED8x8 ]= ff_pred8x8_dc_c; + i->pred8x8[LEFT_DC_PRED8x8]= ff_pred8x8_left_dc_c; + i->pred8x8[TOP_DC_PRED8x8 ]= ff_pred8x8_top_dc_c; + i->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c; + + i->pred16x16[DC_PRED8x8 ]= ff_pred16x16_dc_c; + i->pred16x16[VERT_PRED8x8 ]= ff_pred16x16_vertical_c; + i->pred16x16[HOR_PRED8x8 ]= ff_pred16x16_horizontal_c; +
i->pred16x16[PLANE_PRED8x8 ]= ff_pred16x16_plane_c; + i->pred16x16[LEFT_DC_PRED8x8]= ff_pred16x16_left_dc_c; + i->pred16x16[TOP_DC_PRED8x8 ]= ff_pred16x16_top_dc_c; + i->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c; + +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_intra_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_intra_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,48 @@ +#ifndef H264_INTRA_SPU_H +#define H264_INTRA_SPU_H + +#define MAX_NEG_CROP 1024 + +// For Intra mode +#define MB_TYPE_INTRA4x4 0x0001 +#define IS_INTRA(a) ((a)&7) +#define IS_INTRA4x4(a) ((a)&MB_TYPE_INTRA4x4) + +#define CODEC_FLAG_GRAY 0x2000 + +#define VERT_PRED 0 +#define HOR_PRED 1 +#define DC_PRED 2 +#define DIAG_DOWN_LEFT_PRED 3 +#define DIAG_DOWN_RIGHT_PRED 4 +#define VERT_RIGHT_PRED 5 +#define HOR_DOWN_PRED 6 +#define VERT_LEFT_PRED 7 +#define HOR_UP_PRED 8 + +#define LEFT_DC_PRED 9 +#define TOP_DC_PRED 10 +#define DC_128_PRED 11 + + +#define DC_PRED8x8 0 +#define HOR_PRED8x8 1 +#define VERT_PRED8x8 2 +#define PLANE_PRED8x8 3 + +#define LEFT_DC_PRED8x8 4 +#define TOP_DC_PRED8x8 5 +#define DC_128_PRED8x8 6 + +typedef struct H264PredContext_spu{ /* Predictor dispatch tables, one slot per mode constant above: 9+3 slots for the *_PRED indices, 4+3 slots for the *_PRED8x8 indices. The intra_pred4x4 / intra_pred16x16 / intra_pred8x8 / intra_pred8x8l element types are not declared in this header, so it must be included after whatever declares them. */ + + intra_pred4x4 pred4x4[9+3]; + intra_pred16x16 pred16x16[4+3]; + intra_pred8x8 pred8x8[4+3]; + intra_pred8x8l pred8x8l[9+3]; + +}H264PredContext_spu; + +void init_pred_ptrs(H264PredContext_spu *i); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_luma_template_spu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_luma_template_spu.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,1560 @@ +static void PREFIX_h264_qpel16_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { + + register int i; + + const int16_t i20ss= 20; + const int16_t i5ss= 5; + const int16_t i16ss= 16; + const int16_t imax = 255; + + const vsint32_t vzero = spu_splats(0); + const vsint16_t v20ss = spu_splats(i20ss); + const vsint16_t v5ss = spu_splats(i5ss); + const vsint16_t
v16ss = spu_splats(i16ss); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const int shift_src =(unsigned int) src & 15; + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; + const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F}; + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + + uint8_t *srcbis = src - (STRIDE_Y * 2); + + const vuint8_t srcM2a = *(vuint8_t *)(srcbis); + const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcM1a = *(vuint8_t *)(srcbis); + const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcP0a = *(vuint8_t *)(srcbis); + const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcP1a = *(vuint8_t *)(srcbis); + const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcP2a = *(vuint8_t *)(srcbis); + const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16)); + + srcbis += STRIDE_Y; + + vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); + vsint16_t srcM2ssB = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel); + vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); + vsint16_t 
srcM1ssB = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel); + vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); + vsint16_t srcP0ssB = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel); + vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); + vsint16_t srcP1ssB = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel); + vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); + vsint16_t srcP2ssB = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel); + + for (i = 0 ; i < h ; i++) { + const vuint8_t srcP3a = *(vuint8_t *)(srcbis); + const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16)); + + const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); + const vsint16_t srcP3ssB = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel); + srcbis += STRIDE_Y; + + const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA); + const vsint16_t sum1B = spu_add(srcP0ssB, srcP1ssB); + const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA); + const vsint16_t sum2B = spu_add(srcM1ssB, srcP2ssB); + const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA); + const vsint16_t sum3B = spu_add(srcM2ssB, srcP3ssB); + + srcM2ssA = srcM1ssA; + srcM2ssB = srcM1ssB; + srcM1ssA = srcP0ssA; + srcM1ssB = srcP0ssB; + srcP0ssA = srcP1ssA; + srcP0ssB = srcP1ssB; + srcP1ssA = srcP2ssA; + srcP1ssB = srcP2ssB; + srcP2ssA = srcP3ssA; + srcP2ssB = srcP3ssB; + + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); + const vsint16_t pp1A = spu_add(pp1A3, v16ss); + + const vsint32_t pp1B1 = spu_mule(sum1B, v20ss); + const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss); + const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez); + const vsint16_t pp1B = spu_add(pp1B3, v16ss); + + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); + const vsint32_t 
pp2A2 = spu_mulo(sum2A, v5ss); + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); + + const vsint32_t pp2B1 = spu_mule(sum2B, v5ss); + const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss); + const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez); + + const vsint16_t pp3A = spu_add(sum3A, pp1A); + const vsint16_t pp3B = spu_add(sum3B, pp1B); + + const vsint16_t psumA = spu_sub(pp3A, pp2A); + const vsint16_t psumB = spu_sub(pp3B, pp2B); + /* psum is a signed intermediate (as low as -2534 for 8-bit input). spu_rlmask is a logical right shift, which turns negative values into large positive ones that then saturate to 255 instead of 0; spu_rlmaska shifts algebraically so the zero clamp below sees the sign. */ + vsint16_t sumA = spu_rlmaska(psumA, -5); + vsint16_t sumB = spu_rlmaska(psumB, -5); + + //Saturation to 0 and 255 + sat = spu_cmpgt(sumA,(vsint16_t)vzero); + sumA = spu_and(sumA,(vsint16_t)sat); + sat = spu_cmpgt(sumA,vmax); + sumA = spu_sel(sumA,vmax,sat); + sat = spu_cmpgt(sumB,(vsint16_t)vzero); + sumB = spu_and(sumB,(vsint16_t)sat); + sat = spu_cmpgt(sumB,vmax); + sumB = spu_sel(sumB,vmax,sat); + + const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu); + + /* 16x16 dest luma blocks are always aligned */ + const vuint8_t vdst = *(vuint8_t *)dst; + + vuint8_t fsum; + OP_U8_SPU(fsum, sum, vdst); + + *(vuint8_t *)dst=fsum; + + dst += dstStride; /* stride is multiple of 16 ,so dstperm and dstmask can remain out of the loop */ + } +} + +static void PREFIX_h264_qpel16_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { + + register int i; + + const int16_t i20ss = 20; + const int16_t i5ss = 5; + const int16_t i16ss = 16; + const int16_t imax = 255; + + const vsint32_t vzero = spu_splats(0); + const vsint16_t v20ss = spu_splats(i20ss); + const vsint16_t v5ss = spu_splats(i5ss); + const vsint16_t v16ss = spu_splats(i16ss); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; + const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F}; + const vuint8_t packsu =
{0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + + const int permM2 = (unsigned int) (src-2) & 15; + const int permM1 = (unsigned int) (src-1) & 15; + const int permP0 = (unsigned int) (src) & 15; + const int permP1 = (unsigned int) (src+1) & 15; + const int permP2 = (unsigned int) (src+2) & 15; + const int permP3 = (unsigned int) (src+3) & 15; + + register int align = ((((unsigned long)src) - 2) % 16); + + for (i = 0 ; i < h ; i ++) { + vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + vuint8_t srcR1 = *(vuint8_t *)(src-2); + vuint8_t srcR2 = *(vuint8_t *)(src+14); + + switch (align) { + default: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); + } break; + case 11: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + srcP3 = srcR2; + } break; + case 12: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), 
spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = srcR2; + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 13: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = srcR2; + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 14: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = srcR2; + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 15: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = srcR2; + srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + } + + const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); + const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel); + const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); + const vsint16_t srcP1B = 
(vsint16_t)spu_shuffle(srcP1, srcP1, mergel); + + const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); + const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel); + const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); + const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel); + + const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); + const vsint16_t srcM2B = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel); + const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); + const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel); + + const vsint16_t sum1A = spu_add(srcP0A, srcP1A); + const vsint16_t sum1B = spu_add(srcP0B, srcP1B); + const vsint16_t sum2A = spu_add(srcM1A, srcP2A); + const vsint16_t sum2B = spu_add(srcM1B, srcP2B); + const vsint16_t sum3A = spu_add(srcM2A, srcP3A); + const vsint16_t sum3B = spu_add(srcM2B, srcP3B); + + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); + const vsint16_t pp1A = spu_add(pp1A3, v16ss); + + const vsint32_t pp1B1 = spu_mule(sum1B, v20ss); + const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss); + const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez); + const vsint16_t pp1B = spu_add(pp1B3, v16ss); + + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); + + const vsint32_t pp2B1 = spu_mule(sum2B, v5ss); + const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss); + const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez); + + const vsint16_t pp3A = spu_add(sum3A, pp1A); + const vsint16_t pp3B = spu_add(sum3B, pp1B); + + const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A); + const vsint16_t psumB = spu_sub(pp3B, 
(vsint16_t)pp2B); + /* psum is a signed intermediate (as low as -2534 for 8-bit input). spu_rlmask is a logical right shift, which turns negative values into large positive ones that then saturate to 255 instead of 0; spu_rlmaska shifts algebraically so the zero clamp below sees the sign. */ + vsint16_t sumA = spu_rlmaska(psumA, -5); + vsint16_t sumB = spu_rlmaska(psumB, -5); + + //Saturation to 0 and 255 + sat = spu_cmpgt(sumA,(vsint16_t)vzero); + sumA = spu_and(sumA,(vsint16_t)sat); + sat = spu_cmpgt(sumA,vmax); + sumA = spu_sel(sumA,vmax,sat); + sat = spu_cmpgt(sumB,(vsint16_t)vzero); + sumB = spu_and(sumB,(vsint16_t)sat); + sat = spu_cmpgt(sumB,vmax); + sumB = spu_sel(sumB,vmax,sat); + + const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu); + + /* 16x16 dest luma blocks are always aligned */ + const vuint8_t vdst = *(vuint8_t *)dst; + + vuint8_t fsum; + OP_U8_SPU(fsum, sum, vdst); + + *(vuint8_t *)dst=fsum; + + src += STRIDE_Y; + dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ + } +} + +/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ +static void PREFIX_h264_qpel16_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) { + register int i; + + const int16_t i20ss = 20; + const int16_t i5ss = 5; + const int16_t imax = 255; + + const vsint32_t vzero = spu_splats(0); + const vsint16_t v20ss = spu_splats(i20ss); + const vsint16_t v5ss = spu_splats(i5ss); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; + const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F}; + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + + const int permM2 = (unsigned int) (src-2) & 15; + const int permM1 = (unsigned int) (src-1) & 15; + const int permP0 = (unsigned int) (src) & 15; + const int permP1 = (unsigned int) (src+1) & 15; + const int permP2 = (unsigned int) (src+2) & 15; + const int permP3
= (unsigned int) (src+3) & 15; + + register int align = ((((unsigned long)src) - 2) % 16); + + src -= (2 * STRIDE_Y); + + for (i = 0 ; i < (h+5) ; i ++) { + vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + vuint8_t srcR1 = *(vuint8_t *)(src-2); + vuint8_t srcR2 = *(vuint8_t *)(src+14); + + switch (align) { + default: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); + } break; + case 11: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + srcP3 = srcR2; + } break; + case 12: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = srcR2; + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 13: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, 
permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = srcR2; + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 14: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = srcR2; + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 15: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = srcR2; + srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + } + + const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); + const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel); + const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); + const vsint16_t srcP1B = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel); + + const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); + const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel); + const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); + const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel); + + const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); + const vsint16_t srcM2B = 
(vsint16_t)spu_shuffle(srcM2, srcM2, mergel); + const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); + const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel); + + const vsint16_t sum1A = spu_add(srcP0A, srcP1A); + const vsint16_t sum1B = spu_add(srcP0B, srcP1B); + const vsint16_t sum2A = spu_add(srcM1A, srcP2A); + const vsint16_t sum2B = spu_add(srcM1B, srcP2B); + const vsint16_t sum3A = spu_add(srcM2A, srcP3A); + const vsint16_t sum3B = spu_add(srcM2B, srcP3B); + + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); + const vsint16_t pp1A = spu_add(pp1A3, sum3A); + + const vsint32_t pp1B1 = spu_mule(sum1B, v20ss); + const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss); + const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez); + const vsint16_t pp1B = spu_add(pp1B3, sum3B); + + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); + + const vsint32_t pp2B1 = spu_mule(sum2B, v5ss); + const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss); + const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez); + + const vsint16_t psumA = spu_sub(pp1A, pp2A); + const vsint16_t psumB = spu_sub(pp1B, pp2B); + + *(vsint16_t *)tmp = psumA; + *(vsint16_t *)(tmp+8) = psumB; + + src += STRIDE_Y; + tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ + } + + const int32_t ni10si = -10; + const int16_t i1ss = 1; + const int32_t i512si = 512; + const int32_t ni16si = -16; + + const vsint32_t nv10si = spu_splats(ni10si); + const vsint16_t v1ss = spu_splats(i1ss); + const vsint32_t v512si = spu_splats(i512si); + const vsint32_t nv16si = spu_splats(ni16si); + + const vuint8_t mperm = 
{0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F}; + const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F}; + + int16_t *tmpbis = tmp - (tmpStride * (h+5)); + + vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis); + vsint16_t tmpM2ssB = *(vsint16_t *)(tmpbis+8); + tmpbis += tmpStride; + vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis); + vsint16_t tmpM1ssB = *(vsint16_t *)(tmpbis+8); + tmpbis += tmpStride; + vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis); + vsint16_t tmpP0ssB = *(vsint16_t *)(tmpbis+8); + tmpbis += tmpStride; + vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis); + vsint16_t tmpP1ssB = *(vsint16_t *)(tmpbis+8); + tmpbis += tmpStride; + vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis); + vsint16_t tmpP2ssB = *(vsint16_t *)(tmpbis+8); + tmpbis += tmpStride; + + for (i = 0 ; i < h ; i++) { + const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis); + const vsint16_t tmpP3ssB = *(vsint16_t *)(tmpbis+8); + tmpbis += tmpStride; + + const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA); + const vsint16_t sum1B = spu_add(tmpP0ssB, tmpP1ssB); + const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA); + const vsint16_t sum2B = spu_add(tmpM1ssB, tmpP2ssB); + const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA); + const vsint16_t sum3B = spu_add(tmpM2ssB, tmpP3ssB); + + tmpM2ssA = tmpM1ssA; + tmpM2ssB = tmpM1ssB; + tmpM1ssA = tmpP0ssA; + tmpM1ssB = tmpP0ssB; + tmpP0ssA = tmpP1ssA; + tmpP0ssB = tmpP1ssB; + tmpP1ssA = tmpP2ssA; + tmpP1ssB = tmpP2ssB; + tmpP2ssA = tmpP3ssA; + tmpP2ssB = tmpP3ssB; + + const vsint32_t pp1Ae = spu_mule(sum1A, v20ss); + const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss); + const vsint32_t pp1Be = spu_mule(sum1B, v20ss); + const vsint32_t pp1Bo = spu_mulo(sum1B, v20ss); + + const vsint32_t pp2Ae = spu_mule(sum2A, v5ss); + const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss); + const vsint32_t pp2Be = spu_mule(sum2B, v5ss); + const vsint32_t pp2Bo = spu_mulo(sum2B, v5ss); + + const vsint32_t pp3Ae = 
spu_rlmaska((vsint32_t)sum3A, nv16si); /* sum3A/sum3B hold signed 16-bit first-pass sums; shifting each 32-bit word right by 16 extracts the even halfword, and it must be sign-extended to match the odd lanes produced by spu_mulo. The logical spu_rlmask zero-extended it, adding 65536 to every negative even lane, so the algebraic spu_rlmaska is used here and for sum3B. */ + const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss); + const vsint32_t pp3Be = spu_rlmaska((vsint32_t)sum3B, nv16si); + const vsint32_t pp3Bo = spu_mulo(sum3B, v1ss); + + const vsint32_t pp1cAe = spu_add(pp1Ae, v512si); + const vsint32_t pp1cAo = spu_add(pp1Ao, v512si); + const vsint32_t pp1cBe = spu_add(pp1Be, v512si); + const vsint32_t pp1cBo = spu_add(pp1Bo, v512si); + + const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae); + const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao); + const vsint32_t pp32Be = spu_sub(pp3Be, pp2Be); + const vsint32_t pp32Bo = spu_sub(pp3Bo, pp2Bo); + + const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae); + const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao); + const vsint32_t sumBe = spu_add(pp1cBe, pp32Be); + const vsint32_t sumBo = spu_add(pp1cBo, pp32Bo); + + const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si); + const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si); + const vsint32_t ssumBe = spu_rlmask(sumBe, nv10si); + const vsint32_t ssumBo = spu_rlmask(sumBo, nv10si); + + vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, ssumBe, packs); + vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, ssumBo, packs); + + //Saturation to 0 and 255 + sat = spu_cmpgt(ssume,(vsint16_t)vzero); + ssume = spu_and(ssume,(vsint16_t)sat); + sat = spu_cmpgt(ssume,vmax); + ssume = spu_sel(ssume,vmax,sat); + sat = spu_cmpgt(ssumo,(vsint16_t)vzero); + ssumo = spu_and(ssumo,(vsint16_t)sat); + sat = spu_cmpgt(ssumo,vmax); + ssumo = spu_sel(ssumo,vmax,sat); + + const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu); + + const vuint8_t sum = spu_shuffle(sumv, sumv, mperm); + + /* 16x16 dest luma blocks are always aligned */ + const vuint8_t vdst = *(vuint8_t *)dst; + + vuint8_t fsum; + OP_U8_SPU(fsum, sum, vdst); + + *(vuint8_t *)dst=fsum; + + dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ + + } +} + +static void PREFIX_h264_qpel8_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride,
int h) { + + register int i; + + const int16_t i20ss= 20; + const int16_t i5ss= 5; + const int16_t i16ss= 16; + const int16_t imax = 255; + + const vsint32_t vzero = spu_splats(0); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const vsint16_t v20ss = spu_splats(i20ss); + const vsint16_t v5ss = spu_splats(i5ss); + const vsint16_t v16ss = spu_splats(i16ss); + const int shift_src = (unsigned int) src & 15; + + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + + /* 8x8 dest luma blocks are aligned or desaligned by 8*/ + const int shift_dst = (unsigned int) dst & 15; + vuint8_t dstmask; + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; + + if(shift_dst==0){ + dstmask = dst8mask1; + } + else{ + dstmask = dst8mask2; + } + + uint8_t *srcbis = src - (STRIDE_Y * 2); + + const vuint8_t srcM2a = *(vuint8_t *)(srcbis); + const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcM1a = *(vuint8_t *)(srcbis); + const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcP0a = *(vuint8_t *)(srcbis); + const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcP1a = *(vuint8_t *)(srcbis); + const vuint8_t 
srcP1b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcP2a = *(vuint8_t *)(srcbis); + const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16)); + + srcbis += STRIDE_Y; + + vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); + vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); + vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); + vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); + vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); + + for (i = 0 ; i < h ; i++) { + const vuint8_t srcP3a = *(vuint8_t *)(srcbis); + const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16)); + + const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); + srcbis += STRIDE_Y; + + const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA); + const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA); + const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA); + + srcM2ssA = srcM1ssA; + srcM1ssA = srcP0ssA; + srcP0ssA = srcP1ssA; + srcP1ssA = srcP2ssA; + srcP2ssA = srcP3ssA; + + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); + const vsint16_t pp1A = spu_add(pp1A3, v16ss); + + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); + + const vsint16_t pp3A = spu_add(sum3A, pp1A); + const vsint16_t psumA = spu_sub(pp3A, pp2A); + vsint16_t sumA = spu_rlmaska(psumA, -5); /* psumA is signed (as low as -2534 for 8-bit input); the logical spu_rlmask made negative values large and positive so they saturated to 255 instead of 0, hence the algebraic shift */ + + //Saturation to 0 and 255 + sat =
spu_cmpgt(sumA,(vsint16_t)vzero); + sumA = spu_and(sumA,(vsint16_t)sat); + sat = spu_cmpgt(sumA,vmax); + sumA = spu_sel(sumA,vmax,sat); + + const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu); + + const vuint8_t dst1 = *(vuint8_t *)dst; + + const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); + vuint8_t fsum; + OP_U8_SPU(fsum, dsum, dst1); + + *(vuint8_t *)dst=fsum; + + dst += dstStride; + } +} + +static void PREFIX_h264_qpel8_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { + + register int i; + + const int16_t i20ss = 20; + const int16_t i5ss = 5; + const int16_t i16ss = 16; + const int16_t imax = 255; + + const vsint32_t vzero = spu_splats(0); + const vsint16_t v20ss = spu_splats(i20ss); + const vsint16_t v5ss = spu_splats(i5ss); + const vsint16_t v16ss = spu_splats(i16ss); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + + /* 8x8 dest luma blocks are aligned or desaligned by 8*/ + const int shift_dst = (unsigned int) dst & 15; + vuint8_t dstmask; + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; + + if(shift_dst==0){ + dstmask = dst8mask1; + } + else{ + dstmask = dst8mask2; + } + + const int permM2 = (unsigned int) (src-2) & 15; + const int permM1 = (unsigned int) (src-1) & 15; + const int permP0 = (unsigned int) (src) & 15; + const int permP1 = (unsigned int) (src+1) & 15; + const int permP2 = (unsigned int) (src+2) & 15; + const int permP3 = (unsigned int) (src+3) & 15; + + register int align = 
((((unsigned long)src) - 2) % 16); + + for (i = 0 ; i < h ; i ++) { + vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + vuint8_t srcR1 = *(vuint8_t *)(src-2); + vuint8_t srcR2 = *(vuint8_t *)(src+14); + + switch (align) { + default: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); + } break; + case 11: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + srcP3 = srcR2; + } break; + case 12: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = srcR2; + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 13: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, 
permP0-16)); + srcP1 = srcR2; + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 14: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = srcR2; + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 15: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = srcR2; + srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + } + + const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); + const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); + + const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); + const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); + + const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); + const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); + + const vsint16_t sum1A = spu_add(srcP0A, srcP1A); + const vsint16_t sum2A = spu_add(srcM1A, srcP2A); + const vsint16_t sum3A = spu_add(srcM2A, srcP3A); + + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); 
+ const vsint16_t pp1A = spu_add(pp1A3, v16ss); + + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); + + const vsint16_t pp3A = spu_add(sum3A, pp1A); + + const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A); + + vsint16_t sumA = spu_rlmask(psumA, -5); + + //Saturation to 0 and 255 + sat = spu_cmpgt(sumA,(vsint16_t)vzero); + sumA = spu_and(sumA,(vsint16_t)sat); + sat = spu_cmpgt(sumA,vmax); + sumA = spu_sel(sumA,vmax,sat); + + const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu); + + const vuint8_t dst1 = *(vuint8_t *)dst; + + const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); + vuint8_t fsum; + OP_U8_SPU(fsum, dsum, dst1); + + *(vuint8_t *)dst=fsum; + + src += STRIDE_Y; + dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ + } +} + +/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ +static void PREFIX_h264_qpel8_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) { + register int i; + + const int16_t i20ss = 20; + const int16_t i5ss = 5; + const int16_t imax = 255; + + const vsint32_t vzero = spu_splats(0); + const vsint16_t v20ss = spu_splats(i20ss); + const vsint16_t v5ss = spu_splats(i5ss); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07}; + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + + const int permM2 = (unsigned int) (src-2) & 15; + const int permM1 = (unsigned int) (src-1) & 15; + const int permP0 = (unsigned int) (src) & 15; + const int permP1 = (unsigned int) (src+1) & 15; + const int 
permP2 = (unsigned int) (src+2) & 15; + const int permP3 = (unsigned int) (src+3) & 15; + + register int align = ((((unsigned long)src) - 2) % 16); + + src -= (2 * STRIDE_Y); + + for (i = 0 ; i < (h+5) ; i ++) { + vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + vuint8_t srcR1 = *(vuint8_t *)(src-2); + vuint8_t srcR2 = *(vuint8_t *)(src+14); + + switch (align) { + default: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); + } break; + case 11: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + srcP3 = srcR2; + } break; + case 12: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = srcR2; + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 13: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = 
spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = srcR2; + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 14: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = srcR2; + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 15: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = srcR2; + srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + } + + const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh); + const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh); + const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh); + const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh); + const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh); + const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh); + + const vsint16_t sum1A = spu_add(srcP0A, srcP1A); + const vsint16_t sum2A = spu_add(srcM1A, srcP2A); + const vsint16_t sum3A = spu_add(srcM2A, 
srcP3A); + + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); + const vsint16_t pp1A = spu_add(pp1A3, sum3A); + + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); + + const vsint16_t psumA = spu_sub(pp1A, pp2A); + + *(vsint16_t *)tmp = psumA; + + src += STRIDE_Y; + tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ + } + + const int32_t ni10si = -10; + const int16_t i1ss = 1; + const int32_t i512si = 512; + const int32_t ni16si = -16; + + const vsint32_t nv10si = spu_splats(ni10si); + const vsint16_t v1ss = spu_splats(i1ss); + const vsint32_t v512si = spu_splats(i512si); + const vsint32_t nv16si = spu_splats(ni16si); + + const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F}; + const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F}; + + const int shift_dst = (unsigned int) (dst) & 15; + /* 8x8 dest luma blocks are aligned or desaligned by 8*/ + vuint8_t dstmask; + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; + + if(shift_dst==0){ + dstmask = dst8mask1; + } + else{ + dstmask = dst8mask2; + } + + int16_t *tmpbis = tmp - (tmpStride * (h+5)); + + vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + + for (i = 0 ; i < h 
; i++) { + const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + + const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA); + const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA); + const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA); + + tmpM2ssA = tmpM1ssA; + tmpM1ssA = tmpP0ssA; + tmpP0ssA = tmpP1ssA; + tmpP1ssA = tmpP2ssA; + tmpP2ssA = tmpP3ssA; + + const vsint32_t pp1Ae = spu_mule(sum1A, v20ss); + const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss); + const vsint32_t pp2Ae = spu_mule(sum2A, v5ss); + const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss); + + const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si); + const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss); + + const vsint32_t pp1cAe = spu_add(pp1Ae, v512si); + const vsint32_t pp1cAo = spu_add(pp1Ao, v512si); + + const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae); + const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao); + + const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae); + const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao); + + const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si); + const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si); + + vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs); + vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs); + + //Saturation to 0 and 255 + sat = spu_cmpgt(ssume,(vsint16_t)vzero); + ssume = spu_and(ssume,(vsint16_t)sat); + sat = spu_cmpgt(ssume,vmax); + ssume = spu_sel(ssume,vmax,sat); + sat = spu_cmpgt(ssumo,(vsint16_t)vzero); + ssumo = spu_and(ssumo,(vsint16_t)sat); + sat = spu_cmpgt(ssumo,vmax); + ssumo = spu_sel(ssumo,vmax,sat); + + const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu); + + const vuint8_t sum = spu_shuffle(sumv, sumv, mperm); + + const vuint8_t dst1 = *(vuint8_t *)dst; + + const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); + vuint8_t fsum; + OP_U8_SPU(fsum, dsum, dst1); + + *(vuint8_t *)dst=fsum; + + dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ + + } +} + +static 
void PREFIX_h264_qpel4_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { + + register int i; + + const int16_t i20ss= 20; + const int16_t i5ss= 5; + const int16_t i16ss= 16; + const int16_t imax = 255; + + const vsint32_t vzero = spu_splats(0); + const vsint16_t v20ss = spu_splats(i20ss); + const vsint16_t v5ss = spu_splats(i5ss); + const vsint16_t v16ss = spu_splats(i16ss); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const int shift_src = (unsigned int) src & 15; + + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + + /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/ + const int shift_dst = (unsigned int) dst & 15; + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; + + switch(shift_dst){ + case 0: dstmask = dst4mask0; + break; + case 4: dstmask = dst4mask4; + break; + case 8: dstmask = dst4mask8; + break; + case 12: dstmask = dst4mask12; + break; + } + + uint8_t *srcbis = src - (STRIDE_Y * 2); + + const vuint8_t srcM2a = *(vuint8_t *)(srcbis); + const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcM1a = *(vuint8_t *)(srcbis); + const 
vuint8_t srcM1b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcP0a = *(vuint8_t *)(srcbis); + const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcP1a = *(vuint8_t *)(srcbis); + const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16)); + + srcbis += STRIDE_Y; + const vuint8_t srcP2a = *(vuint8_t *)(srcbis); + const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16)); + + srcbis += STRIDE_Y; + + vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); + vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); + vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); + vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); + vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); + + for (i = 0 ; i < h ; i++) { + const vuint8_t srcP3a = *(vuint8_t *)(srcbis); + const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16); + const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16)); + + const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); + srcbis += STRIDE_Y; + + const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA); + const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA); + const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA); + + srcM2ssA = srcM1ssA; + srcM1ssA = srcP0ssA; + srcP0ssA = srcP1ssA; + srcP1ssA = srcP2ssA; + srcP2ssA = srcP3ssA; + + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, 
(vsint16_t)pp1A2, mez); + const vsint16_t pp1A = spu_add(pp1A3, v16ss); + + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); + + const vsint16_t pp3A = spu_add(sum3A, pp1A); + const vsint16_t psumA = spu_sub(pp3A, pp2A); + vsint16_t sumA = spu_rlmask(psumA, -5); + + //Saturation to 0 and 255 + sat = spu_cmpgt(sumA,(vsint16_t)vzero); + sumA = spu_and(sumA,(vsint16_t)sat); + sat = spu_cmpgt(sumA,vmax); + sumA = spu_sel(sumA,vmax,sat); + + const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu); + + const vuint8_t dst1 = *(vuint8_t *)dst; + + const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); + vuint8_t fsum; + OP_U8_SPU(fsum, dsum, dst1); + + *(vuint8_t *)dst=fsum; + + dst += dstStride; + } +} + +static void PREFIX_h264_qpel4_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { + + register int i; + + const int16_t i20ss = 20; + const int16_t i5ss = 5; + const int16_t i16ss = 16; + const int16_t imax = 255; + + const vsint32_t vzero = spu_splats(0); + const vsint16_t v20ss = spu_splats(i20ss); + const vsint16_t v5ss = spu_splats(i5ss); + const vsint16_t v16ss = spu_splats(i16ss); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + + /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/ + const int shift_dst = (unsigned int) dst & 15; + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst4mask4= 
{0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; + + switch(shift_dst){ + case 0: dstmask = dst4mask0; + break; + case 4: dstmask = dst4mask4; + break; + case 8: dstmask = dst4mask8; + break; + case 12: dstmask = dst4mask12; + break; + } + + const int permM2 = (unsigned int) (src-2) & 15; + const int permM1 = (unsigned int) (src-1) & 15; + const int permP0 = (unsigned int) (src) & 15; + const int permP1 = (unsigned int) (src+1) & 15; + const int permP2 = (unsigned int) (src+2) & 15; + const int permP3 = (unsigned int) (src+3) & 15; + + register int align = ((((unsigned long)src) - 2) % 16); + + for (i = 0 ; i < h ; i ++) { + vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + vuint8_t srcR1 = *(vuint8_t *)(src-2); + vuint8_t srcR2 = *(vuint8_t *)(src+14); + + switch (align) { + default: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); + } break; + case 11: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + 
srcP3 = srcR2; + } break; + case 12: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = srcR2; + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 13: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = srcR2; + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 14: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = srcR2; + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 15: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = srcR2; + srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, 
permP3-16)); + } break; + } + + const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); + const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); + + const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); + const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); + + const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); + const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); + + const vsint16_t sum1A = spu_add(srcP0A, srcP1A); + const vsint16_t sum2A = spu_add(srcM1A, srcP2A); + const vsint16_t sum3A = spu_add(srcM2A, srcP3A); + + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); + const vsint16_t pp1A = spu_add(pp1A3, v16ss); + + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); + + const vsint16_t pp3A = spu_add(sum3A, pp1A); + + const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A); + + vsint16_t sumA = spu_rlmask(psumA, -5); + + //Saturation to 0 and 255 + sat = spu_cmpgt(sumA,(vsint16_t)vzero); + sumA = spu_and(sumA,(vsint16_t)sat); + sat = spu_cmpgt(sumA,vmax); + sumA = spu_sel(sumA,vmax,sat); + + const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu); + + const vuint8_t dst1 = *(vuint8_t *)dst; + + const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); + vuint8_t fsum; + OP_U8_SPU(fsum, dsum, dst1); + + *(vuint8_t *)dst=fsum; + + src += STRIDE_Y; + dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ + } +} + +static void PREFIX_h264_qpel4_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) { + register int i; + + const int16_t i20ss = 20; + const int16_t i5ss = 5; + const int16_t imax 
= 255; + + const vsint32_t vzero = spu_splats(0); + const vsint16_t v20ss = spu_splats(i20ss); + const vsint16_t v5ss = spu_splats(i5ss); + const vsint16_t vmax = (vsint16_t)spu_splats(imax); + vuint16_t sat; + + const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07}; + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; + + const int permM2 = (unsigned int) (src-2) & 15; + const int permM1 = (unsigned int) (src-1) & 15; + const int permP0 = (unsigned int) (src) & 15; + const int permP1 = (unsigned int) (src+1) & 15; + const int permP2 = (unsigned int) (src+2) & 15; + const int permP3 = (unsigned int) (src+3) & 15; + + register int align = ((((unsigned long)src) - 2) % 16); + + src -= (2 * STRIDE_Y); + + for (i = 0 ; i < (h+5) ; i ++) { + vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + vuint8_t srcR1 = *(vuint8_t *)(src-2); + vuint8_t srcR2 = *(vuint8_t *)(src+14); + + switch (align) { + default: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); + } break; + case 11: { + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = 
spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); + srcP3 = srcR2; + } break; + case 12: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); + srcP2 = srcR2; + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 13: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); + srcP1 = srcR2; + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 14: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); + srcP0 = srcR2; + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + case 15: { + vuint8_t srcR3 = *(vuint8_t *)(src+30); + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); + srcM1 = srcR2; + srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 
+ srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); + } break; + } + + const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh); + const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh); + const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh); + const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh); + const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh); + const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh); + + const vsint16_t sum1A = spu_add(srcP0A, srcP1A); + const vsint16_t sum2A = spu_add(srcM1A, srcP2A); + const vsint16_t sum3A = spu_add(srcM2A, srcP3A); + + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); + const vsint16_t pp1A = spu_add(pp1A3, sum3A); + + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); + + const vsint16_t psumA = spu_sub(pp1A, pp2A); + + *(vsint16_t *)tmp = psumA; + + src += STRIDE_Y; + tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ + } + + const int32_t ni10si = -10; + const int16_t i1ss = 1; + const int32_t i512si = 512; + const int32_t ni16si = -16; + + const vsint32_t nv10si = spu_splats(ni10si); + const vsint16_t v1ss = spu_splats(i1ss); + const vsint32_t v512si = spu_splats(i512si); + const vsint32_t nv16si = spu_splats(ni16si); + + const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F}; + const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F}; + + const int shift_dst = (unsigned int) (dst) & 15; + /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 
12*/ + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; + const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; + + switch(shift_dst){ + case 0: dstmask = dst4mask0; + break; + case 4: dstmask = dst4mask4; + break; + case 8: dstmask = dst4mask8; + break; + case 12: dstmask = dst4mask12; + break; + } + + int16_t *tmpbis = tmp - (tmpStride * (h+5)); + + vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + + for (i = 0 ; i < h ; i++) { + const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis); + tmpbis += tmpStride; + + const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA); + const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA); + const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA); + + tmpM2ssA = tmpM1ssA; + tmpM1ssA = tmpP0ssA; + tmpP0ssA = tmpP1ssA; + tmpP1ssA = tmpP2ssA; + tmpP2ssA = tmpP3ssA; + + const vsint32_t pp1Ae = spu_mule(sum1A, v20ss); + const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss); + const vsint32_t pp2Ae = spu_mule(sum2A, v5ss); + const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss); + + const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si); + const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss); + + const vsint32_t pp1cAe = spu_add(pp1Ae, v512si); + const vsint32_t pp1cAo = spu_add(pp1Ao, v512si); + + const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae); + const vsint32_t pp32Ao 
= spu_sub(pp3Ao, pp2Ao); + + const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae); + const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao); + + const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si); + const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si); + + vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs); + vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs); + + //Saturation to 0 and 255 + sat = spu_cmpgt(ssume,(vsint16_t)vzero); + ssume = spu_and(ssume,(vsint16_t)sat); + sat = spu_cmpgt(ssume,vmax); + ssume = spu_sel(ssume,vmax,sat); + sat = spu_cmpgt(ssumo,(vsint16_t)vzero); + ssumo = spu_and(ssumo,(vsint16_t)sat); + sat = spu_cmpgt(ssumo,vmax); + ssumo = spu_sel(ssumo,vmax,sat); + + const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu); + + const vuint8_t sum = spu_shuffle(sumv, sumv, mperm); + + const vuint8_t dst1 = *(vuint8_t *)dst; + + const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); + vuint8_t fsum; + OP_U8_SPU(fsum, dsum, dst1); + + *(vuint8_t *)dst=fsum; + + dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ + + } +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_mc_spu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_mc_spu.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2009 TUDelft + * + * Cell Parallel SPU - 2DWave Macroblock Decoding. 
+ */ + +/** + * @file libavcodec/cell/spu/h264_main_spu.c + * Cell Parallel SPU - 2DWave Macroblock Decoding + * @author C C Chi + * + * SIMD kernels + * H.264/AVC motion compensation + * @author Mauricio Alvarez + * @author Albert Paradis + */ + + +#include +#include +#include +#include + +#include "h264_mc_spu.h" +#include "h264_dma.h" +#include "h264_tables.h" +#include "h264_decode_mb_spu.h" + + +//biweight buffer +DECLARE_ALIGNED_16(uint8_t, tmp_y_ls[48*16]); +DECLARE_ALIGNED_16(uint8_t, tmp_cb_ls[32*8]); +DECLARE_ALIGNED_16(uint8_t, tmp_cr_ls[32*8]); + +//ref buffer (double buffered) +DECLARE_ALIGNED_16(uint8_t, mc_ref[2][16*(4+5)*48 + 2*16*(2+1)*32]); +uint8_t* ref_ptr; + +/** Motion Compensation functions*/ + +static void fill_mc_part(H264mc *mc, int n, int chroma_height, int x_offset, int y_offset, int itp, int weight, int list0, int list1){ + H264mc_part *mc_part = mc->mc_part + mc->npart; + mc_part->n =n; + mc_part->chroma_height =chroma_height; + mc_part->x_offset = x_offset; + mc_part->y_offset = y_offset; + mc_part->itp = itp; + mc_part->weight = weight; + mc_part->list0 = list0; + mc_part->list1 = list1; + + mc->npart++; +} + +void calc_mc_params(H264Mb* mb, H264mc *mc){ + int mb_type = mb->mb_type; + mc->npart=0; + + assert(!IS_INTRA(mb_type)); + if(IS_16X16(mb_type)){ + fill_mc_part(mc, 0, 8, 0, 0, 0, 0, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); + }else if(IS_16X8(mb_type)){ + fill_mc_part(mc, 0, 4, 0, 0, 0, 0, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); + fill_mc_part(mc, 8, 4, 0, 4, 0, 1, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); + }else if(IS_8X16(mb_type)){ + fill_mc_part(mc, 0, 8, 0, 0, 1, 2, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); + fill_mc_part(mc, 4, 8, 4, 0, 1, 2, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); + }else{ + int i; + assert(IS_8X8(mb_type)); + + for(i=0; i<4; i++){ + const int sub_mb_type= mb->sub_mb_type[i]; + const int n= 4*i; + int x_offset= (i&1)<<2; + int y_offset= (i&2)<<1; + + 
if(IS_SUB_8X8(sub_mb_type)){ + fill_mc_part(mc, n, 4, x_offset, y_offset, 1, 3, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + }else if(IS_SUB_8X4(sub_mb_type)){ + fill_mc_part(mc, n, 2, x_offset, y_offset, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + fill_mc_part(mc, n+2, 2, x_offset, y_offset+2, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + }else if(IS_SUB_4X8(sub_mb_type)){ + fill_mc_part(mc, n, 4, x_offset, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + fill_mc_part(mc, n+1, 4, x_offset+2, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + }else{ + int j; + assert(IS_SUB_4X4(sub_mb_type)); + for(j=0; j<4; j++){ + int sub_x_offset= x_offset + 2*(j&1); + int sub_y_offset= y_offset + (j&2); + fill_mc_part(mc, n+j, 2, sub_x_offset, sub_y_offset, 2, 6, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + } + } + } + } +} + +/** +* Returns a pointer to mc_buf +*/ +static void* alloc_mc_buf(int size){ + void* ptr = ref_ptr; + ref_ptr += size; + return ptr; +} + +#define TAG_OFFSET_MC MBD_mc_buf1 +static uint8_t* get_mc_data(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){ + assert(src_ea); + int unalign; + unsigned address_align; + + uint8_t* ea; + uint8_t* ref_ptr = alloc_mc_buf(blk_h*stride); + + ea = src_ea + pic_xoffset + pic_yoffset*linesize; + address_align = ((unsigned) ea) & 0xFFFFFFF0; + unalign = ((unsigned) ea) & 0xF; + get_dma_list(ref_ptr, (void *)address_align, stride, blk_h, linesize, idx + TAG_OFFSET_MC, 0); + return (ref_ptr + unalign); +} + +static uint8_t* get_mc_data_blocking(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){ + assert(src_ea); + int unalign; + unsigned address_align; + + uint8_t* ea; + uint8_t* ref_ptr = alloc_mc_buf(blk_h*stride); + + ea = src_ea + pic_xoffset + pic_yoffset*linesize; + address_align = ((unsigned) ea) & 0xFFFFFFF0; + unalign 
= ((unsigned) ea) & 0xF; + get_dma_list(ref_ptr, (void *)address_align, stride, blk_h, linesize, MBD_mc_buf1, 0); + wait_dma_id(MBD_mc_buf1); + return (ref_ptr + unalign); +} + +//#undef TAG_OFFSET_MC + +static void get_mc_components(H264Context_spu *h, H264Mb *mb, H264mc_part* mc_part, Picture_spu *pic, int n, int chroma_height, int list, int src_x_offset, int src_y_offset, int idx){ + assert(pic); + H264slice *s = h->s; + ref_data *ref = &mc_part->ref[list]; + const int mx= mb->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; + const int my= mb->mv_cache[list][ scan8[n] ][1] + src_y_offset*8; + + const int pic_width = 16*s->mb_width; + const int pic_height = 16*s->mb_height; + + int blk_h= chroma_height*2+5; + //int blk_w= 8*2+5; + + int blk_h_c= chroma_height+1; + //int blk_w_c= 9; + + int ymx= mx>>2; + int ymy= my>>2; + int cmy= my>>3; + int cmx= mx>>3; + + //truncate the motion vectors references + if(ymy>= pic_height+2){ + ymy=pic_height+1; + }else if(ymy <=-19){ + ymy=-18; + } + if(ymx>= pic_width+2){ + ymx= pic_width+1; + }else if(ymx<=-19){ + ymx=-19; + } + + if(cmy >= pic_height>>1){ + cmy = (pic_height>>1) -1; + }else if(cmy<=-9){ + cmy=-8; + } + if(cmx >= pic_width>>1){ + cmx = (pic_width>>1) -1; + }else if(cmx<=-9){ + cmx=-8; + } + if (!h->blocking){ + ref->data[0]=get_mc_data(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx); + ref->data[1]=get_mc_data(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); + ref->data[2]=get_mc_data(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); + } else { + ref->data[0]=get_mc_data_blocking(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx); + ref->data[1]=get_mc_data_blocking(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); + ref->data[2]=get_mc_data_blocking(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); + + } + +} + +static void get_ref_data(H264Context_spu *h, H264Mb *mb, H264mc_part *mc_part, int idx){ + H264slice *s = h->s; + int 
x_offset = mc_part->x_offset; + int y_offset = mc_part->y_offset; + int list0 = mc_part->list0; + int list1 = mc_part->list1; + int n = mc_part->n; + int chroma_height = mc_part->chroma_height; + Picture_spu *refpic; + + x_offset += 8*mb->mb_x; + y_offset += 8*mb->mb_y; + + if(list0){ + refpic= &s->ref_list[0][ mb->ref_cache[0][ scan8[n] ] ]; + get_mc_components(h, mb, mc_part, refpic, n, chroma_height, 0, x_offset, y_offset, idx); + } + if(list1){ + refpic= &s->ref_list[1][ mb->ref_cache[1][ scan8[n] ] ]; + get_mc_components(h, mb, mc_part, refpic, n, chroma_height, 1, x_offset, y_offset, idx); + } +} + +void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc){ + int idx = h->mc_idx; + int i; + + get_list = get_list_buf; + ref_ptr = mc_ref[idx]; + for(i=0; inpart; i++){ + get_ref_data(h, mb, &mc->mc_part[i], idx); + } +} + +static void mc_dir_part(H264Context_spu *h, H264mc_part* mc_part, int n, int chroma_height, int list, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, int stride_y, int stride_c){ + + H264Mb *mb = h->mb; + ref_data* ref = &mc_part->ref[list]; + const int mx= mb->mv_cache[list][ scan8[n] ][0]; //to determine the interpolation mode + const int my= mb->mv_cache[list][ scan8[n] ][1]; + const int luma_xy= (mx&3) + ((my&3)<<2); + uint8_t *src_y, *src_cb, *src_cr; + + src_y = ref->data[0] +2+2*STRIDE_Y; + src_cb = ref->data[1]; + src_cr = ref->data[2]; + + qpix_op[luma_xy](dest_y, src_y, stride_y, chroma_height*2); + chroma_op(dest_cb, src_cb, stride_c, chroma_height, mx&7, my&7); + chroma_op(dest_cr, src_cr, stride_c, chroma_height, mx&7, my&7); +} + + +static void mc_part_biweighted(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg){ + + H264Mb *mb = h->mb; + H264slice *s = h->s; + int n = mc_part->n; + int chroma_height = 
mc_part->chroma_height; + int itp = mc_part->itp; + int refn0 = mb->ref_cache[0][ scan8[n] ]; + int refn1 = mb->ref_cache[1][ scan8[n] ]; + qpel_mc_func *qpix_put= h->dsp.put_h264_qpel_pixels_tab[itp]; + h264_chroma_mc_func chroma_put= h->dsp.put_h264_chroma_pixels_tab[itp]; + + // don't optimize for luma-only case, since B-frames usually + // use implicit weights => chroma too. + mc_dir_part(h, mc_part, n, chroma_height, 0, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, stride_y, stride_c); + + mc_dir_part(h, mc_part, n, chroma_height, 1, tmp_y_ls, tmp_cb_ls, tmp_cr_ls, qpix_put, chroma_put, STRIDE_Y, STRIDE_C); + + if(s->use_weight == 2){ + int weight0 = s->implicit_weight[refn0][refn1][mb->mb_y&1]; + int weight1 = 64 - weight0; + luma_weight_avg( dest_y, tmp_y_ls, stride_y, STRIDE_Y, 5, weight0, weight1, 0); + chroma_weight_avg(dest_cb, tmp_cb_ls, stride_c, STRIDE_C, 5, weight0, weight1, 0); + chroma_weight_avg(dest_cr, tmp_cr_ls, stride_c, STRIDE_C, 5, weight0, weight1, 0); + }else{ + luma_weight_avg(dest_y, tmp_y_ls, stride_y, STRIDE_Y, s->luma_log2_weight_denom, s->luma_weight[refn0][0][0] , s->luma_weight[refn1][1][0], s->luma_weight[refn0][0][1] + s->luma_weight[refn1][1][1]); + + chroma_weight_avg(dest_cb, tmp_cb_ls, stride_c, STRIDE_C, s->chroma_log2_weight_denom, s->chroma_weight[refn0][0][0][0] , s->chroma_weight[refn1][1][0][0], s->chroma_weight[refn0][0][0][1] + s->chroma_weight[refn1][1][0][1]); + + chroma_weight_avg(dest_cr, tmp_cr_ls, stride_c, STRIDE_C, s->chroma_log2_weight_denom, s->chroma_weight[refn0][0][1][0] , s->chroma_weight[refn1][1][1][0], s->chroma_weight[refn0][0][1][1] + s->chroma_weight[refn1][1][1][1]); + } +} + +static void mc_part_weighted(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, int list1){ + + H264Mb *mb = h->mb; + H264slice *s = h->s; + + int n = mc_part->n; + int chroma_height = 
mc_part->chroma_height; + int itp = mc_part->itp; + qpel_mc_func *qpix_put= h->dsp.put_h264_qpel_pixels_tab[itp]; + h264_chroma_mc_func chroma_put= h->dsp.put_h264_chroma_pixels_tab[itp]; + + int list = list1 ? 1 : 0; + int refn = mb->ref_cache[list][ scan8[n] ]; + + mc_dir_part(h, mc_part, n, chroma_height, list, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, stride_y, stride_c); + + luma_weight_op(dest_y, stride_y, s->luma_log2_weight_denom, s->luma_weight[refn][list][0], s->luma_weight[refn][list][1]); + if(s->use_weight_chroma){ + chroma_weight_op(dest_cb, stride_c, s->chroma_log2_weight_denom, s->chroma_weight[refn][list][0][0], s->chroma_weight[refn][list][0][1]); + + chroma_weight_op(dest_cr, stride_c, s->chroma_log2_weight_denom, s->chroma_weight[refn][list][1][0], s->chroma_weight[refn][list][1][1]); + } +} + + +static void mc_part_std(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, int list0, int list1){ + int n = mc_part->n; + int chroma_height = mc_part->chroma_height; + int itp = mc_part->itp; + + qpel_mc_func *qpix_op= h->dsp.put_h264_qpel_pixels_tab[itp]; + h264_chroma_mc_func chroma_op= h->dsp.put_h264_chroma_pixels_tab[itp]; + + if(list0){ + mc_dir_part(h, mc_part, n, chroma_height, 0, dest_y, dest_cb, dest_cr, qpix_op, chroma_op, stride_y, stride_c); + + qpix_op= h->dsp.avg_h264_qpel_pixels_tab[itp]; + chroma_op= h->dsp.avg_h264_chroma_pixels_tab[itp]; + } + + if(list1){ + mc_dir_part(h, mc_part, n, chroma_height, 1, dest_y, dest_cb, dest_cr, qpix_op, chroma_op, stride_y, stride_c); + } +} + +static void mc_part(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ + H264slice *s = h->s; + + int weight = mc_part->weight; + + int x_offset = mc_part->x_offset; + int y_offset = mc_part->y_offset; + int list0 = mc_part->list0; + int list1 = mc_part->list1; + + dest_y += 2*x_offset + 2*y_offset*stride_y; + 
dest_cb += x_offset + y_offset*stride_c; + dest_cr += x_offset + y_offset*stride_c; + + if(list0 && list1 && s->use_weight !=0){ + h264_biweight_func *weight_avg = &h->dsp.biweight_h264_pixels_tab[weight]; + mc_part_biweighted(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, weight_avg[0], weight_avg[3]); + } + else if ((list0 || list1) && s->use_weight ==1){ + h264_weight_func *weight_op = &h->dsp.weight_h264_pixels_tab[weight]; + mc_part_weighted(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, weight_op[0], weight_op[3], list1); + } + else{ + mc_part_std(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, list0, list1); + } +} + +void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ + int i; + H264mc *mc =h->mc; + for(i=0; inpart; i++){ + mc_part(h, &mc->mc_part[i], dest_y, dest_cb, dest_cr, stride_y, stride_c); + } +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_mc_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_mc_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,53 @@ +#ifndef H264_MC_SPU_H +#define H264_MC_SPU_H + +//#include "types_spu.h" + +// motion compensation constants: +#define MB_TYPE_16x16 0x0008 +#define MB_TYPE_16x8 0x0010 +#define MB_TYPE_8x16 0x0020 +#define MB_TYPE_8x8 0x0040 +#define MB_TYPE_P0L0 0x1000 +#define IS_16X16(a) ((a)&MB_TYPE_16x16) +#define IS_16X8(a) ((a)&MB_TYPE_16x8) +#define IS_8X16(a) ((a)&MB_TYPE_8x16) +#define IS_8X8(a) ((a)&MB_TYPE_8x8) +#define IS_SUB_8X8(a) ((a)&MB_TYPE_16x16) //note reused +#define IS_SUB_8X4(a) ((a)&MB_TYPE_16x8) //note reused +#define IS_SUB_4X8(a) ((a)&MB_TYPE_8x16) //note reused +#define IS_SUB_4X4(a) ((a)&MB_TYPE_8x8) //note reused +#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list)))) + +#define FFMAX(a,b) ((a) > (b) ? (a) : (b)) +#define FFMIN(a,b) ((a) > (b) ? 
(b) : (a)) + +//Motion compensation buffer strides +#define STRIDE_Y 48 +#define STRIDE_C 32 + +typedef struct ref_data{ + uint8_t *data[3]; +}ref_data; + +typedef struct H264mc_part{ + int n; + int chroma_height; + int x_offset; + int y_offset; + int itp; + int weight; + int list0; + int list1; + int use_weight; + ref_data ref[2]; + +}H264mc_part; + +typedef struct H264mc{ + H264mc_part mc_part[16]; + int npart; +}H264mc; + + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_pred_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_pred_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,90 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 prediction functions. 
+ * @author Michael Niedermayer + */ + +#ifndef AVCODEC_H264PRED_H +#define AVCODEC_H264PRED_H + +//#include "libavutil/common.h" +//#include "dsputil.h" + +/** + * Prediction types + */ +//@{ +#define VERT_PRED 0 +#define HOR_PRED 1 +#define DC_PRED 2 +#define DIAG_DOWN_LEFT_PRED 3 +#define DIAG_DOWN_RIGHT_PRED 4 +#define VERT_RIGHT_PRED 5 +#define HOR_DOWN_PRED 6 +#define VERT_LEFT_PRED 7 +#define HOR_UP_PRED 8 + +#define LEFT_DC_PRED 9 +#define TOP_DC_PRED 10 +#define DC_128_PRED 11 + +#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN 12 +#define HOR_UP_PRED_RV40_NODOWN 13 +#define VERT_LEFT_PRED_RV40_NODOWN 14 + +#define DC_PRED8x8 0 +#define HOR_PRED8x8 1 +#define VERT_PRED8x8 2 +#define PLANE_PRED8x8 3 + +#define LEFT_DC_PRED8x8 4 +#define TOP_DC_PRED8x8 5 +#define DC_128_PRED8x8 6 + +#define ALZHEIMER_DC_L0T_PRED8x8 7 +#define ALZHEIMER_DC_0LT_PRED8x8 8 +#define ALZHEIMER_DC_L00_PRED8x8 9 +#define ALZHEIMER_DC_0L0_PRED8x8 10 +//@} + +/** + * Context for storing H.264 prediction functions + */ +typedef struct H264PredContext{ + void (*pred4x4 [9+3+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp? 
+ void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride); + void (*pred8x8 [4+3+4])(uint8_t *src, int stride); + void (*pred16x16[4+3])(uint8_t *src, int stride); + + void (*pred4x4_add [2])(uint8_t *pix/*align 4*/, const DCTELEM *block/*align 16*/, int stride); + void (*pred8x8l_add [2])(uint8_t *pix/*align 8*/, const DCTELEM *block/*align 16*/, int stride); + void (*pred8x8_add [3])(uint8_t *pix/*align 8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); + void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); +}H264PredContext; + +void ff_h264_pred_init(H264PredContext *h); +void ff_h264_pred_init_arm(H264PredContext *h); + + +#endif /* AVCODEC_H264PRED_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_tables.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_tables.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,26 @@ +#include +#include "h264_tables.h" + +uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP] = {0, }; + +int block_offset[16+4+4]; + +void ff_cropTbl_init(){ + int i; + for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; + for(i=0;i>3); + } + for(i=0; i<4; i++){ + block_offset[16+i]= + block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*uvlinesize*((scan8[i] - scan8[0])>>3); + } +} \ No newline at end of file diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_tables.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_tables.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,83 @@ +#ifndef H264_TABLES_H +#define H264_TABLES_H + +#define MAX_NEG_CROP 1024 + +extern uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP]; +extern int block_offset[16+4+4]; + +static const uint8_t scan8[16 + 2*4]={ + 4+1*8, 5+1*8, 4+2*8, 5+2*8, + 6+1*8, 7+1*8, 6+2*8, 7+2*8, + 4+3*8, 5+3*8, 4+4*8, 5+4*8, + 6+3*8, 7+3*8, 6+4*8, 7+4*8, + 1+1*8, 2+1*8, + 1+2*8, 2+2*8, + 1+4*8, 2+4*8, + 1+5*8, 2+5*8, +}; + +static const uint8_t 
ff_zigzag_direct[64] = { + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63 +}; + +static const uint8_t zigzag_scan[16]={ + 0+0*4, 1+0*4, 0+1*4, 0+2*4, + 1+1*4, 2+0*4, 3+0*4, 2+1*4, + 1+2*4, 0+3*4, 1+3*4, 2+2*4, + 3+1*4, 3+2*4, 2+3*4, 3+3*4, +}; + +static const uint8_t luma_dc_zigzag_scan[16]={ + 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64, + 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64, + 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64, + 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64, +}; + +static const uint8_t chroma_dc_scan[4]={ + (0+0*2)*16, (1+0*2)*16, + (0+1*2)*16, (1+1*2)*16, //FIXME +}; + +static const uint8_t rem6[52]={ +0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, +}; + +static const uint8_t div6[52]={ +0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, +}; + +static const uint8_t dequant4_coeff_init[6][3]={ + {10,13,16}, + {11,14,18}, + {13,16,20}, + {14,18,23}, + {16,20,25}, + {18,23,29}, +}; + +static const uint8_t dequant8_coeff_init_scan[16] = { + 0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1 +}; +static const uint8_t dequant8_coeff_init[6][6]={ + {20,18,32,19,25,24}, + {22,19,35,21,28,26}, + {26,23,42,24,33,31}, + {28,25,45,26,35,33}, + {32,28,51,30,40,38}, + {36,32,58,34,46,43}, +}; + + +void init_block_offset(int linesize, int uvlinesize); +void ff_cropTbl_init(); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/h264_types_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/h264_types_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,203 @@ +#ifndef H264_CELL_TYPES_H +#define 
H264_CELL_TYPES_H + +#include +#include + +typedef struct spe_pos{ + volatile int count; //number of mb processed + uint32_t pad[3]; +}spe_pos; + +//only the picture pointers are needed from the picture struct; +typedef struct Picture_spu { + uint8_t* data[3]; +} Picture_spu; + +///For Cell, might be idea to use this instead for everything +// struct that contains the pararms that change on slice +typedef struct H264slice{ + int deblocking_filter; + int linesize; + int uvlinesize; + int mb_width; + int mb_height; + + int use_weight; + int use_weight_chroma; + int luma_log2_weight_denom; + int chroma_log2_weight_denom; + + int16_t luma_weight[16][2][2]; + int16_t chroma_weight[16][2][2][2]; + int16_t implicit_weight[16][16][2]; + + // ref picture ptr + Picture_spu ref_list[2][16]; + int state; + int emu_edge_width; + int emu_edge_height; + + int slice_type; + int slice_type_nos; + int slice_alpha_c0_offset; + int slice_beta_offset; + + uint8_t chroma_qp_table[2][64]; + + H264Mb *blocks; + uint8_t *dst_y, *dst_cb, *dst_cr; + + //uint32_t pad[2]; // padding the structure for multiple of 16 bytes +}H264slice; + +typedef struct H264spe{ +#define EDIP 0 +#define EDB 1 +#define MBD 2 + int type; + int idx; + int spe_id; + int spe_total; + int mb_width; + int mb_stride; + int mb_height; + int linesize; + int uvlinesize; + //H264slice* slice_params; + void* src_spe; + void* tgt_spe; + + mutex_ea_t lock; + cond_ea_t cond; + atomic_ea_t cnt; + + mutex_ea_t rl_lock; + cond_ea_t rl_cond; + atomic_ea_t rl_cnt; +}H264spe; + +typedef struct H264Cabac_spu{ + int blocking; + + int top_cbp; + int left_cbp; + int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct + + uint32_t dequant4_buffer[6][52][16]; + uint32_t dequant8_buffer[2][52][64]; + uint32_t (*dequant4_coeff[6])[16]; + uint32_t (*dequant8_coeff[2])[64]; + + uint8_t (*non_zero_count_top)[32]; + uint8_t (*non_zero_count)[32]; + + uint8_t (*mvd_top[2])[2]; + uint8_t (*mvd[2])[2]; + + uint8_t 
*direct_top; + uint8_t *direct; + + uint8_t *chroma_pred_mode_top; + uint8_t *chroma_pred_mode; + + int8_t *intra4x4_pred_mode_top; + int8_t *intra4x4_pred_mode; + + uint16_t *cbp_top; + uint16_t *cbp; + + int8_t *qscale_top; + int8_t *qscale; + + int8_t *ref_index_top[2]; + int8_t *ref_index[2]; + + int16_t (*motion_val_top[2])[2]; + int16_t (*motion_val[2])[2]; + uint32_t *mb_type_top; + uint32_t *mb_type; + + int8_t *list1_ref_index[2]; + uint32_t *list1_mb_type; + DECLARE_ALIGNED_16(int16_t, list1_motion_val[2][4*4][2]); // fill for a macroblock when required + + int b_stride; + int mb_stride; + int mb_width; + int mb_height; + + uint8_t zigzag_scan[16]; + uint8_t zigzag_scan8x8[64]; + + uint8_t direct_cache[5*8]; + // Used to calculate loopfilter bS. + DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2]; + DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8]; + DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8]; + DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2]; + +} H264Cabac_spu; + +typedef struct EDSlice_spu{ + PPS pps; ///< current pps + + H264Mb *mbs; + + int state; + int qp_thresh; ///< QP threshold to skip loopfilter + + PictureInfo pic; + PictureInfo list1; +// Picture *ref_list[2][16]; ///Reordered version of default_ref_list according to picture reordering in slice header + int ref_count[2]; ///< counts frames or fields, depending on current mb mode + int slice_type; + int slice_type_nos; + int direct_8x8_inference_flag; + + uint8_t list_count; + uint32_t coded_pic_num; +///stuff only needed for nal/entropy decoding + H264Mb *m; + //GetBitContext gb; + const uint8_t *bytestream_start; + int byte_bufsize; + int transform_bypass; + int direct_spatial_mv_pred; + int map_col_to_list0[2][16]; + int dist_scale_factor[16]; + + int cabac_init_idc; + int ref2frm[2][64]; ///< reference to frame number lists, the first 2 are for -2,-1 + int qscale; + int chroma_qp[2]; //QPc + int last_qscale_diff; + +// Picture* release_ref[MAX_MMCO_COUNT]; +// int release_cnt; + 
+ +// int use_weight; +// int use_weight_chroma; +// int luma_log2_weight_denom; +// int chroma_log2_weight_denom; + +// int8_t luma_weight[16][2][2]; +// int8_t chroma_weight[16][2][2][2]; +// int8_t implicit_weight[16][16][2]; + + + +// int slice_alpha_c0_offset; +// int slice_beta_offset; + +// int nal_ref_idc; +// int nal_unit_type; +// uint8_t *rbsp_buffer; +// unsigned int rbsp_buffer_size; + + + +} EDSlice_spu; + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/mathops_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/mathops_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,137 @@ +/* + * simple math operations + * Copyright (c) 2001, 2002 Fabrice Bellard + * Copyright (c) 2006 Michael Niedermayer et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef AVCODEC_MATHOPS_H +#define AVCODEC_MATHOPS_H + +// #include "libavutil/common.h" +// #include "libavutil/internal.h" +// +// /* generic implementation */ +// +// #ifndef MULL +// # define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s)) +// #endif +// +// #ifndef MULH +// //gcc 3.4 creates an incredibly bloated mess out of this +// //# define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32) +// +// static av_always_inline int MULH(int a, int b){ +// return ((int64_t)(a) * (int64_t)(b))>>32; +// } +// #endif +// +// #ifndef UMULH +// static av_always_inline unsigned UMULH(unsigned a, unsigned b){ +// return ((uint64_t)(a) * (uint64_t)(b))>>32; +// } +// #endif +// +// #ifndef MUL64 +// # define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) +// #endif +// +// #ifndef MAC64 +// # define MAC64(d, a, b) ((d) += MUL64(a, b)) +// #endif +// +// #ifndef MLS64 +// # define MLS64(d, a, b) ((d) -= MUL64(a, b)) +// #endif +// +// /* signed 16x16 -> 32 multiply add accumulate */ +// #ifndef MAC16 +// # define MAC16(rt, ra, rb) rt += (ra) * (rb) +// #endif +// +// /* signed 16x16 -> 32 multiply */ +// #ifndef MUL16 +// # define MUL16(ra, rb) ((ra) * (rb)) +// #endif +// +// #ifndef MLS16 +// # define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb)) +// #endif + +/* median of 3: returns the middle value of a, b and c, reusing b as the result variable. The definition is skipped when mid_pred is already defined as a macro. NOTE(review): av_const is not defined in this header and the libavutil includes above are commented out -- confirm it is defined before this header is included. */ +#ifndef mid_pred +#define mid_pred mid_pred +static inline av_const int mid_pred(int a, int b, int c) +{ +#if 0 /* disabled branch-free variant; the >>31 sign masks presumably assume a 32-bit int with arithmetic right shift -- TODO confirm before re-enabling */ + int t= (a-b)&((a-b)>>31); + a-=t; + b+=t; + b-= (b-c)&((b-c)>>31); + b+= (a-b)&((a-b)>>31); + + return b; +#else /* comparison-based variant that is actually built */ + if(a>b){ /* a > b: the median is b unless c > b, in which case it is the smaller of a and c */ + if(c>b){ + if(c>a) b=a; + else b=c; + } + }else{ /* a <= b: the median is b unless c < b, in which case it is the larger of a and c */ + if(b>c){ + if(c>a) b=c; + else b=a; + } + } + return b; +#endif +} +#endif + +// #ifndef sign_extend +// static inline av_const int sign_extend(int val, unsigned bits) +// { +// return (val << (INT_BIT 
- bits)) >> (INT_BIT - bits); +// } +// #endif +// +// #ifndef zero_extend +// static inline av_const unsigned zero_extend(unsigned val, unsigned bits) +// { +// return (val << (INT_BIT - bits)) >> (INT_BIT - bits); +// } +// #endif +// +// #ifndef COPY3_IF_LT +// #define COPY3_IF_LT(x, y, a, b, c, d)\ +// if ((y) < (x)) {\ +// (x) = (y);\ +// (a) = (b);\ +// (c) = (d);\ +// } +// #endif +// +// #ifndef NEG_SSR32 +// # define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s))) +// #endif +// +// #ifndef NEG_USR32 +// # define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s))) +// #endif + +#endif /* AVCODEC_MATHOPS_H */ + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/rectangle_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/rectangle_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,92 @@ +/* + * rectangle filling function + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * useful rectangle filling function + * @author Michael Niedermayer + */ + +#ifndef AVCODEC_RECTANGLE_H +#define AVCODEC_RECTANGLE_H + +#include + +#define STRIDE_ALIGN 16 + + +/** + * fill a rectangle. 
+ * @param h height of the rectangle, should be a constant + * @param w width of the rectangle, should be a constant + * @param size the size of val (1, 2 or 4), should be a constant + */ +static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ + uint8_t *p= (uint8_t*)vp; + assert(size==1 || size==2 || size==4); + assert(w<=4); + + w *= size; + stride *= size; + + assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0); + assert((stride&(w-1))==0); + if(w==2){ + const uint16_t v= size==4 ? val : val*0x0101; + *(uint16_t*)(p + 0*stride)= v; + if(h==1) return; + *(uint16_t*)(p + 1*stride)= v; + if(h==2) return; + *(uint16_t*)(p + 2*stride)= v; + *(uint16_t*)(p + 3*stride)= v; + }else if(w==4){ + const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101; + *(uint32_t*)(p + 0*stride)= v; + if(h==1) return; + *(uint32_t*)(p + 1*stride)= v; + if(h==2) return; + *(uint32_t*)(p + 2*stride)= v; + *(uint32_t*)(p + 3*stride)= v; + }else if(w==8){ + const uint64_t v= size==2 ? 
val*0x0001000100010001ULL : val*0x0100000001ULL; + *(uint64_t*)(p + 0*stride)= v; + if(h==1) return; + *(uint64_t*)(p + 1*stride)= v; + if(h==2) return; + *(uint64_t*)(p + 2*stride)= v; + *(uint64_t*)(p + 3*stride)= v; + }else if(w==16){ + const uint64_t v= val*0x0100000001ULL; + *(uint64_t*)(p + 0+0*stride)= v; + *(uint64_t*)(p + 8+0*stride)= v; + *(uint64_t*)(p + 0+1*stride)= v; + *(uint64_t*)(p + 8+1*stride)= v; + if(h==2) return; + *(uint64_t*)(p + 0+2*stride)= v; + *(uint64_t*)(p + 8+2*stride)= v; + *(uint64_t*)(p + 0+3*stride)= v; + *(uint64_t*)(p + 8+3*stride)= v; + }else + assert(0); + assert(h==4); +} + +#endif /* AVCODEC_RECTANGLE_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/spe_ed.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/spe_ed.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,508 @@ +#define CELL_SPE + +#include +#include +#include +#include +#include "libavcodec/avcodec.h" +#include "h264_cabac_spu.h" +#include "cabac_spu.h" +#include "h264_types_spu.h" +#include "h264_tables.h" +#include "h264_dma.h" +#include "h264_tables.h" + +#define MB_WIDTH 240 +#define MB_STRIDE (MB_WIDTH+16) + +H264Cabac_spu hcabac; +CABACContext cabac; +DECLARE_ALIGNED_16(EDSlice_spu, slice[2]); +DECLARE_ALIGNED_16(H264Mb, mb[2]); +DECLARE_ALIGNED_16(H264spe, spe); + +DECLARE_ALIGNED_16(uint8_t, non_zero_count_table[2][MB_STRIDE][32]); +DECLARE_ALIGNED_16(uint8_t, mvd_table[2][2][8*MB_STRIDE][2]); +DECLARE_ALIGNED_16(uint8_t, direct_table[2][4*MB_STRIDE]); +DECLARE_ALIGNED_16(uint8_t, chroma_pred_mode_table[2][MB_STRIDE]); +DECLARE_ALIGNED_16(uint8_t, intra4x4_pred_mode_table[2][8*MB_STRIDE]); +DECLARE_ALIGNED_16(uint16_t,cbp_table[2][MB_STRIDE]); +DECLARE_ALIGNED_16(uint8_t, qscale_table[2][MB_STRIDE]); + +DECLARE_ALIGNED_16(uint32_t, mb_type_table[2][MB_STRIDE]); +DECLARE_ALIGNED_16(int8_t, ref_index_table[2][2][4*MB_STRIDE]); +DECLARE_ALIGNED_16(int16_t, motion_val_table[2][2][4*4*MB_WIDTH][2]); + +DECLARE_ALIGNED(128, uint8_t, 
bytestream_ls[4096]); +DECLARE_ALIGNED_16(uint32_t, list1_mb_type_table[2][MB_STRIDE]); +DECLARE_ALIGNED_16(int8_t, list1_ref_index_table[2][2][4*MB_STRIDE]); + +DECLARE_ALIGNED_16(spe_pos, dma_temp); //dma temp for sending +//mb position of neighbouring spes +DECLARE_ALIGNED_16(volatile spe_pos, src_spe); //written by SPE_ID -1 +static int total_lines; + +static inline int dep_resolved(H264spe *p){ + int spe_id = p->spe_id; + volatile int lines_proc = src_spe.count; + if (spe_id==0) + return (total_lines < lines_proc-1 +p->mb_height)? 1:0; + else + return (total_lines < lines_proc-1)? 1:0; +} + +static void update_tgt_spe_dep(H264spe *p, int end){ + // if (end ){ + total_lines++; + spe_pos* dma_spe = &dma_temp; + spe_pos* tgt_spe = p->tgt_spe + (unsigned) &src_spe; //located in target spe local store + dma_spe->count = end? total_lines+1: total_lines; + spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), ED_put); + // } + +} + +static int init_cabac(H264spe *p, H264Cabac_spu *hc){ + hc->mb_height = p->mb_height; + hc->mb_width = p->mb_width; + hc->b_stride = 4*p->mb_width; + hc->mb_stride = p->mb_stride; + + for(int i=0; i<16; i++){ + #define T(x) (x>>2) | ((x<<2) & 0xF) + hc->zigzag_scan[i] = T(zigzag_scan[i]); + #undef T + } + for(int i=0; i<64; i++){ + #define T(x) (x>>3) | ((x&7)<<3) + hc->zigzag_scan8x8[i] = T(ff_zigzag_direct[i]); + #undef T + } +} + +static void reset_cabac_buffers(){ + memset(intra4x4_pred_mode_table, 0, sizeof(intra4x4_pred_mode_table)); + memset(mvd_table, 0, sizeof(mvd_table)); + memset(direct_table, 0, sizeof(direct_table)); + memset(chroma_pred_mode_table, 0, sizeof(chroma_pred_mode_table)); + memset(cbp_table, 0, sizeof(cbp_table)); + memset(qscale_table, 0, sizeof(qscale_table)); + memset(mb_type_table, 0, sizeof(mb_type_table)); + memset(ref_index_table, 0, sizeof(ref_index_table)); + memset(motion_val_table, 0, sizeof(motion_val_table)); +} + +static void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, 
int bufsize){ + int align = (unsigned) buf & 0xF; + int dma_size; + + c->bytestream_ea_start= + c->bytestream_ea= buf; + c->bytestream_ea_end= buf + bufsize; + c->bufsize = bufsize; + + if (bufsize + align >= sizeof(bytestream_ls)){ + dma_size = sizeof(bytestream_ls); + c->bufsize = c->bufsize +align - sizeof(bytestream_ls); + }else{ + int align_end = (bufsize+align) &0xF; + if (align_end) + dma_size = bufsize+align + 16-align_end; + else + dma_size = bufsize+align; + c->bufsize = 0; + } +// printf("%d\n", dma_size); + c->bytestream_end = &bytestream_ls[dma_size]; + c->bytestream_start= c->bytestream = &bytestream_ls[align]; + spu_dma_get(bytestream_ls, (unsigned) buf - align, dma_size, ED_get ); + c->bytestream_ea_start= + c->bytestream_ea= buf + dma_size -align; + + wait_dma_id(ED_get); + + if (align %2){ + c->low = (*c->bytestream++)<<18; + c->low+= (*c->bytestream++)<<10; + c->low+= ((*c->bytestream++)<<2) + 2; + }else { + c->low = (*c->bytestream++)<<18; + c->low+= (*c->bytestream++)<<10; + c->low+= (2<<8); + } + + c->range= 0x1FE; + bytecount=0; +} + +static void init_dequant8_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){ + int i,q,x; + const int transpose = HAVE_ALTIVEC; + hc->dequant8_coeff[0] = hc->dequant8_buffer[0]; + hc->dequant8_coeff[1] = hc->dequant8_buffer[1]; + + for(i=0; i<2; i++){ + if(i && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){ + hc->dequant8_coeff[1] = hc->dequant8_buffer[0]; + break; + } + + for(q=0; q<52; q++){ + int shift = div6[q]; + int idx = rem6[q]; + for(x=0; x<64; x++) + hc->dequant8_coeff[i][q][transpose ? 
(x>>3)|((x&7)<<3) : x] = + ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] * + s->pps.scaling_matrix8[i][x]) << shift; + } + } +} + +static void init_dequant4_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){ + int i,j,q,x; + const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; + for(i=0; i<6; i++ ){ + hc->dequant4_coeff[i] = hc->dequant4_buffer[i]; + for(j=0; jpps.scaling_matrix4[j], s->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){ + hc->dequant4_coeff[i] = hc->dequant4_buffer[j]; + break; + } + } + if(jdequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] = + ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] * + s->pps.scaling_matrix4[i][x]) << shift; + } + } +} + +static void init_dequant_tables(EDSlice_spu *s, H264Cabac_spu *hc){ + int i,x; + + init_dequant4_coeff_table(s, hc); + if(s->pps.transform_8x8_mode) + init_dequant8_coeff_table(s, hc); + if(s->transform_bypass){ + for(i=0; i<6; i++) + for(x=0; x<16; x++) + hc->dequant4_coeff[i][0][x] = 1<<6; + if(s->pps.transform_8x8_mode) + for(i=0; i<2; i++) + for(x=0; x<64; x++) + hc->dequant8_coeff[i][0][x] = 1<<6; + } +} + +static void init_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s){ + hc->non_zero_count_top = non_zero_count_table[0]; + hc->non_zero_count = non_zero_count_table[1]; + hc->mvd_top[0] = mvd_table[0][0]; + hc->mvd[0] = mvd_table[0][1]; + hc->mvd_top[1] = mvd_table[1][0]; + hc->mvd[1] = mvd_table[1][1]; + hc->direct_top = direct_table[0]; + hc->direct = direct_table[1]; + hc->chroma_pred_mode_top = chroma_pred_mode_table[0]; + hc->chroma_pred_mode = chroma_pred_mode_table[1]; + hc->intra4x4_pred_mode_top = intra4x4_pred_mode_table[0]; + hc->intra4x4_pred_mode = intra4x4_pred_mode_table[1]; + hc->cbp_top = cbp_table[0]; + hc->cbp = cbp_table[1]; + hc->qscale_top = qscale_table[0] +1; + hc->qscale = qscale_table[1] +1; + + hc->mb_type_top = mb_type_table[0]+1; + hc->mb_type = mb_type_table[1]+1; + hc->ref_index_top[0] = ref_index_table[0][0]; + 
hc->ref_index_top[1] = ref_index_table[1][0]; + hc->ref_index[0] = ref_index_table[0][1]; + hc->ref_index[1] = ref_index_table[1][1]; + hc->motion_val_top[0] = motion_val_table[0][0]; + hc->motion_val_top[1] = motion_val_table[1][0]; + hc->motion_val[0] = motion_val_table[0][1]; + hc->motion_val[1] = motion_val_table[1][1]; + + int mb_stride = hc->mb_stride; + + if (s->slice_type_nos == FF_B_TYPE){ + while(!dep_resolved(&spe)); + spu_dma_get(list1_mb_type_table[0], (unsigned) (s->list1.mb_type -1), mb_stride*sizeof(uint32_t), ED_get); + spu_dma_get(list1_ref_index_table[0][0], (unsigned) s->list1.ref_index[0], mb_stride*4*sizeof(int8_t), ED_get); + spu_dma_get(list1_ref_index_table[0][1], (unsigned) s->list1.ref_index[1], mb_stride*4*sizeof(int8_t), ED_get); + wait_dma_id(ED_get); + spu_dma_get(list1_mb_type_table[1], (unsigned) (s->list1.mb_type -1 + mb_stride), mb_stride*sizeof(uint32_t), ED_get); + spu_dma_get(list1_ref_index_table[1][0], (unsigned) (s->list1.ref_index[0] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); + spu_dma_get(list1_ref_index_table[1][1], (unsigned) (s->list1.ref_index[1] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); + hc->list1_mb_type = list1_mb_type_table[0]+1; + hc->list1_ref_index[0] = list1_ref_index_table[0][0]; + hc->list1_ref_index[1] = list1_ref_index_table[0][1]; + } + +} + +static void update_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s, int line){ + int mb_stride = hc->mb_stride; + int mb_width = hc->mb_width; + int top = (line+1)%2; + int cur = line%2; + int bottom = (line+1)%2; //same as top, but to identify prebuffering of next line. 
+ + hc->non_zero_count_top = non_zero_count_table[top]; + hc->non_zero_count = non_zero_count_table[cur]; + hc->mvd_top[0] = mvd_table[0][top]; + hc->mvd[0] = mvd_table[0][cur]; + hc->mvd_top[1] = mvd_table[1][top]; + hc->mvd[1] = mvd_table[1][cur]; + hc->direct_top = direct_table[top]; + hc->direct = direct_table[cur]; + hc->chroma_pred_mode_top = chroma_pred_mode_table[top]; + hc->chroma_pred_mode = chroma_pred_mode_table[cur]; + hc->intra4x4_pred_mode_top = intra4x4_pred_mode_table[top]; + hc->intra4x4_pred_mode = intra4x4_pred_mode_table[cur]; + hc->cbp_top = cbp_table[top]; + hc->cbp = cbp_table[cur]; + hc->qscale_top = qscale_table[top] +1; + hc->qscale = qscale_table[cur] +1; + + hc->mb_type_top = mb_type_table[top]+1; + hc->mb_type = mb_type_table[cur]+1; + hc->ref_index_top[0] = ref_index_table[0][top]; + hc->ref_index_top[1] = ref_index_table[1][top]; + hc->ref_index[0] = ref_index_table[0][cur]; + hc->ref_index[1] = ref_index_table[1][cur]; + hc->motion_val_top[0] = motion_val_table[0][top]; + hc->motion_val_top[1] = motion_val_table[1][top]; + hc->motion_val[0] = motion_val_table[0][cur]; + hc->motion_val[1] = motion_val_table[1][cur]; + + wait_dma_id(ED_put); + + spu_dma_put(mb_type_table[top], (unsigned) (s->pic.mb_type -1 + line*mb_stride), mb_stride*sizeof(uint32_t), ED_put); + spu_dma_put(ref_index_table[0][top], (unsigned) (s->pic.ref_index[0] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put); + spu_dma_put(ref_index_table[1][top], (unsigned) (s->pic.ref_index[1] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put); + spu_dma_put(motion_val_table[0][top], (unsigned) (s->pic.motion_val[0]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put); + spu_dma_put(motion_val_table[1][top], (unsigned) (s->pic.motion_val[1]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put); + + if (s->slice_type_nos == FF_B_TYPE){ + update_tgt_spe_dep(&spe, 0); + wait_dma_id(ED_get); + + if (line + 2 < hc->mb_height){ + while(!dep_resolved(&spe)); + 
spu_dma_get(list1_mb_type_table[cur], (unsigned) (s->list1.mb_type -1 + (line+2)*mb_stride), mb_stride*sizeof(uint32_t), ED_get); + spu_dma_get(list1_ref_index_table[cur][0], (unsigned) (s->list1.ref_index[0] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); + spu_dma_get(list1_ref_index_table[cur][1], (unsigned) (s->list1.ref_index[1] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); + } + hc->list1_mb_type = list1_mb_type_table[bottom]+1; + hc->list1_ref_index[0] = list1_ref_index_table[bottom][0]; + hc->list1_ref_index[1] = list1_ref_index_table[bottom][1]; + } + +} + +// void printmbdiff(EDSlice_spu *s, H264Cabac_spu *hc, H264Mb *mp, H264Mb *ms){ +// +// printf("mb_x %d, %d\n", mp->mb_x, ms->mb_x); +// printf("mb_y %d, %d\n", mp->mb_y, ms->mb_y); +// printf("mb_xy %d, %d\n", mp->mb_xy, ms->mb_xy); +// printf("top_mb_xy %d, %d\n", mp->top_mb_xy, ms->top_mb_xy); +// printf("left_mb_xy %d, %d\n", mp->left_mb_xy, ms->left_mb_xy); +// printf("chroma_pred_mode %d, %d\n", mp->chroma_pred_mode, ms->chroma_pred_mode); +// printf("intra16x16_pred_mode %d, %d\n", mp->intra16x16_pred_mode, ms->intra16x16_pred_mode); +// printf("topleft_samples %d, %d\n", mp->topleft_samples_available, ms->topleft_samples_available); +// printf("topright_samples %d, %d\n", mp->topright_samples_available, ms->topright_samples_available); +// printf("top_samples %d, %d\n", mp->top_samples_available, ms->top_samples_available); +// printf("left_samples %d, %d\n", mp->left_samples_available, ms->left_samples_available); +// +// if (memcmp(mp->intra4x4_pred_mode_cache, ms->intra4x4_pred_mode_cache, 40)){ +// for (int i=0; i<5; i++){ +// for (int j=0; j<8; j++){ +// printf("%d, %d\t", mp->intra4x4_pred_mode_cache[i*8+j],ms->intra4x4_pred_mode_cache[i*8+j]); +// } +// printf("\n"); +// } +// } +// +// if (memcmp(mp->non_zero_count_cache, ms->non_zero_count_cache, 48)){ +// for (int i=0; i<6; i++){ +// for (int j=0; j<8; j++){ +// printf("%u, %u\t", 
mp->non_zero_count_cache[i*8+j],ms->non_zero_count_cache[i*8+j]); +// } +// printf("\n"); +// } +// } +// +// if (memcmp(mp->sub_mb_type, ms->sub_mb_type, 8)){ +// for (int i=0; i<4; i++){ +// printf("%u, %u\t", mp->sub_mb_type[i], mp->sub_mb_type[i]); +// printf("\n"); +// } +// } +// +// if (memcmp(mp->mv_cache, ms->mv_cache, 320)){ +// for (int k=0; k<2; k++){ +// for (int i=0; i<5; i++){ +// for (int j=0; j<8; j++){ +// printf("%d, %d, %d, %d\t", mp->mv_cache[k][i*8+j][0], mp->mv_cache[k][i*8+j][1], ms->mv_cache[k][i*8+j][0], ms->mv_cache[k][i*8+j][1]); +// } +// printf("\n"); +// } +// } +// } +// +// if (memcmp(mp->ref_cache, ms->ref_cache, 80)){ +// for (int k=0; k<2; k++){ +// for (int i=0; i<5; i++){ +// for (int j=0; j<8; j++){ +// printf("%d, %d\t", mp->ref_cache[k][i*8+j], ms->ref_cache[k][i*8+j]); +// } +// printf("\n"); +// } +// } +// } +// +// printf("cbp %d, %d\n", mp->cbp, ms->cbp); +// for (int i=0; imb_stride; i++){ +// printf("%d, ", hc->cbp[i]); fflush(0); +// } +// printf("\n"); +// +// printf("mb_type %x, %x\n", mp->mb_type, ms->mb_type); +// printf("mb_type IS_INTRA %d, IS_INTRA16x16 %d, IS_DIRECT %d\n", IS_INTRA(ms->mb_type), IS_INTRA16x16(ms->mb_type), IS_DIRECT(ms->mb_type) ); +// printf("left_type %d, %d\n", mp->left_type, ms->left_type); +// printf("top_type %d, %d\n", mp->top_type, ms->top_type); +// printf("qscale_mb_xy %d, %d\n", mp->qscale_mb_xy, ms->qscale_mb_xy); +// printf("qscale_left_mb_xy %d, %d\n", mp->qscale_left_mb_xy, ms->qscale_left_mb_xy); +// printf("qscale_top_mb_xy %d, %d\n", mp->qscale_top_mb_xy, ms->qscale_top_mb_xy); +// // for (int i=0; imb_stride; i++){ +// // printf("%d, ", qscale_table[0][i]); fflush(0); +// // } +// +// if (memcmp(mp->mb, ms->mb, 768)){ +// for (int i=0; i<16; i++){ +// for (int j=0; j<16; j++){ +// printf("%d, %d\t", mp->mb[j + i*16], ms->ref_cache[j + i*16]); +// } +// printf("\n"); +// } +// for (int i=0; i<8; i++){ +// for (int j=0; j<8; j++){ +// printf("%d, %d\t", mp->mb[256 + j + i*8], 
ms->ref_cache[j + i*8]); +// } +// printf("\n"); +// } +// for (int i=0; i<8; i++){ +// for (int j=0; j<8; j++){ +// printf("%d, %d\t", mp->mb[320+ j + i*8], ms->ref_cache[j + i*8]); +// } +// printf("\n"); +// } +// } +// +// if (memcmp(mp->bS, ms->bS, 32)){ +// for (int k=0; k<2; k++){ +// for (int i=0; i<4; i++){ +// for (int j=0; j<4; j++){ +// printf("%d, %d\t", mp->bS[k][i][j], mp->mv_cache[k][i][j]); +// } +// printf("\n"); +// } +// } +// } +// if (memcmp(mp->edges, ms->edges, 4)){ +// printf("edges %d, %d, %d, %d\n", mp->edges[0], ms->edges[0], mp->edges[1], ms->edges[1]); +// printf("deblock %d, %d\n", mp->deblock_mb, ms->deblock_mb); +// } +// +// printf("dequant4_coeff_y %d, %d\n", mp->dequant4_coeff_y, ms->dequant4_coeff_y); +// printf("dequant4_coeff_cb %d, %d\n", mp->dequant4_coeff_cb, ms->dequant4_coeff_cb); +// printf("dequant4_coeff_cr %d, %d\n", mp->dequant4_coeff_cr, ms->dequant4_coeff_cr); +// } +// DECLARE_ALIGNED_16(H264Mb, tmp); + + +int main(unsigned long long id, unsigned long long argp){ + EDSlice_spu *s; + H264Cabac_spu *hc = &hcabac; + CABACContext *c = &cabac; + H264spe *p = &spe; + + spu_write_out_mbox((unsigned) slice); + spu_dma_get(p, (unsigned) argp, sizeof(H264spe), ED_spe); //ID_slice is used out of convienience + wait_dma_id(ED_spe); + + ff_init_cabac_states(); + init_cabac(p, hc); + hc->blocking=0; + for(;;){ + spu_read_in_mbox(); + s = &slice[0]; + reset_cabac_buffers(); + init_entropy_buf(hc, s); + + if (hc->blocking) wait_dma_id(ED_get); + //printf("framesize %d\n", s->byte_bufsize);fflush(0); + init_dequant_tables(s, hc); + ff_init_cabac_decoder( c, s->bytestream_start, s->byte_bufsize ); + ff_h264_init_cabac_states(s, c); + + int mb_slot=0; + for(int j=0; jmb_height; j++){ + for(int i=0; imb_width; i++){ + int eos,ret; + H264Mb *m = &mb[mb_slot]; + m->mb_x=i; + m->mb_y=j; + s->m = m; + + ret = ff_h264_decode_mb_cabac(hc, s, c); + +// spu_dma_get(&tmp, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_get); +// 
wait_dma_id(ED_get); +// if (memcmp(&tmp, m, sizeof(H264Mb))){ +// printf("coded pic num %d\n", s->coded_pic_num); +// printmbdiff(s, hc,&tmp, m); +// return 0; +// } + //printf("qscale %d\n", m->qscale_mb_xy); + if (!hc->blocking){ + if (mb_slot){ + spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb1); + wait_dma_id(ED_putmb0); + }else { + spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0); + wait_dma_id(ED_putmb1); + } + mb_slot++; mb_slot%=2; + }else { + spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0); + wait_dma_id(ED_putmb0); + } + + + eos = get_cabac_terminate( c); + + if( ret < 0) { + fprintf(stderr, "error at %d bytecount\n", bytecount); + return -1; + } + } + update_entropy_buf(hc, s, j); + if (hc->blocking){ wait_dma_id(ED_get); wait_dma_id(ED_put);} + } + wait_dma_id(ED_put); + spu_write_out_mbox(1); + + } + + return 0; + + +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/spe_mbd.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/spe_mbd.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2009 TUDelft + * + * Cell Parallel SPU - 2DWave Macroblock Decoding. 
+ */ + +/** + * @file libavcodec/cell/spu/h264_main_spu.c + * Cell Parallel SPU - 2DWave Macroblock Decoding + * @author C C Chi + * + * SIMD kernels + * H.264/AVC motion compensation + * @author Mauricio Alvarez + * @author Albert Paradis + */ + + +/* Enable this lines to enable simulator statistic or generate traces */ + +//#define ENABLE_SIMULATOR +//#define ENABLE_PARAVER_TRACING_CELL + +#ifdef ENABLE_SIMULATOR + #include "/opt/ibm/systemsim-cell/include/callthru/spu/profile.h" +#endif + +#ifdef ENABLE_TRACES + #include "spu_trace.h" +#endif +#include +#include +#include +#include +#include +#include +#include + +//#include "dsputil_cell.h" +#include "types_spu.h" +#include "h264_intra_spu.h" +#include "h264_decode_mb_spu.h" +#include "h264_mc_spu.h" +#include "h264_tables.h" +#include "h264_dma.h" + + +/** functions for supporting tracing with paraver for the SPU + * + */ +inline void trace_init_SPU(){ +#ifdef ENABLE_PARAVER_TRACING_CELL + SPUtrace_init (); +#endif +} + +inline void trace_fini_SPU(){ +#ifdef ENABLE_PARAVER_TRACING_CELL + SPUtrace_fini (); +#endif +} + +inline void trace_event_SPU(int event, int id){ +#ifdef ENABLE_PARAVER_TRACING_CELL + SPUtrace_event (event, id); +#else + (void) event; + (void) id; +#endif +} + +// for simulator statistic +inline void clear_statistic(){ +#ifdef ENABLE_SIMULATOR + prof_clear(); +#endif +} + +inline void start_statistic(){ +#ifdef ENABLE_SIMULATOR + prof_start(); +#endif +} + +inline void stop_statistic(){ +#ifdef ENABLE_SIMULATOR + prof_stop(); +#endif +} + +H264Context_spu h_context; // struct that contain all the params to decode a macroblock + +DECLARE_ALIGNED_16(spe_pos, dma_temp); //dma temp for sending +//mb position of neighbouring spes +DECLARE_ALIGNED_16(volatile spe_pos, src_spe); //written by SPE_ID -1 +//DECLARE_ALIGNED_16(spe_pos, tgt_spe); //written by SPE_ID +1 + +/** +* Initializes the buffering of the mb data and associated mc data. 
The init_mb_buffer needs to +* be called before any get_next_mb and only once at the beginning of the slice. +* +* Note: init_mc_buffer and get_next_mb expect the width of the picture to be more than 2 mb's +*/ +#define TAG_OFFSET_MB MBD_buf1 +#define TAG_OFFSET_MC MBD_mc_buf1 +static void init_mb_buffer(H264Context_spu* h){ + H264slice *s = h->s; + H264Mb *next_mb; + int mb_height = s->mb_height; + int mb_width = s->mb_width; + + h->mc_idx =0; + + h->mb_dec = 0; + h->mb_mc = 0; + h->mb_dma = 0; + + h->curr_line %= mb_height; + h->next_mb_idx = h->curr_line * mb_width; + h->mb_id = h->curr_line * mb_width; + h->n_mc= h->curr_line * mb_width; + + next_mb = s->blocks + h->mb_id; + spu_dma_get(&h->mb_buf[h->mb_dma], (unsigned) next_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB); + h->mb_dma++; + h->mb_id++; + + next_mb = s->blocks + h->mb_id; + spu_dma_get(&h->mb_buf[h->mb_dma], (unsigned) next_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB); + h->mb_dma++; + h->mb_id++; + wait_dma_id(0 + TAG_OFFSET_MB); + + H264Mb *mb = &h->mb_buf[0]; + H264mc *mc = &h->mc_buf[0]; + if(!IS_INTRA(mb->mb_type)){ + calc_mc_params(mb, mc); + fill_ref_buf(h, mb, mc); + } + h->n_mc++; + h->mb_mc++; +} + +static void *get_next_mb(H264Context_spu *h){ + H264slice *s = h->s; + H264spe *spe = &h->spe; + H264Mb *mb_buf = h->mb_buf; + H264mc *mc_buf = h->mc_buf; + H264Mb *next_mb; + H264Mb *next_dma_mb; + + if (h->curr_line >= s->mb_height) + return NULL; + + if (h->mb_id < h->mb_total){ + next_dma_mb = s->blocks + h->mb_id; + spu_dma_get(&mb_buf[h->mb_dma], (unsigned) next_dma_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB); + h->mb_dma = (h->mb_dma+1)%3; + h->mb_id++; + if (h->mb_id%s->mb_width ==0){ + h->mb_id+=(spe->spe_total-1)*s->mb_width; + } + } + + h->mc = &mc_buf[h->mc_idx]; + wait_dma_id(h->mc_idx + TAG_OFFSET_MC); + h->mc_idx = (h->mc_idx+1)%2; + if (h->n_mc < h->mb_total){ + wait_dma_id(h->mb_mc + TAG_OFFSET_MB); + H264Mb *mb = &mb_buf[h->mb_mc]; + H264mc *mc = &mc_buf[h->mc_idx]; + 
if(!IS_INTRA(mb->mb_type)){ + calc_mc_params(mb, mc); + fill_ref_buf(h, mb, mc); + } + h->n_mc++; + if (h->n_mc%s->mb_width ==0){ + h->n_mc+=(spe->spe_total-1)*s->mb_width; + } + } + h->next_mb_idx++; + if (h->next_mb_idx % s->mb_width ==0){ + h->next_mb_idx+=(spe->spe_total-1)*s->mb_width; + h->curr_line+=spe->spe_total; + } + + h->mb_mc = (h->mb_mc+1)%3; + next_mb = &mb_buf[h->mb_dec]; + h->mb_dec = (h->mb_dec+1)%3; + return next_mb; +} + +static void *get_next_mb_blocking(H264Context_spu *h){ + H264slice *s = h->s; + H264spe *spe = &h->spe; + H264Mb *mb_buf = h->mb_buf; + H264mc *mc_buf = h->mc_buf; + H264Mb *next_mb; + H264Mb *next_dma_mb; + + if (h->mb_id >= h->mb_total) + return NULL; + + //printf("%d\n", h->mb_id); + next_dma_mb = s->blocks + h->mb_id; + spu_dma_get(&mb_buf[0], (unsigned) next_dma_mb, sizeof(H264Mb), MBD_buf1); + //h->mb_dma = (h->mb_dma+1)%3; + h->mb_id++; + if (h->mb_id%s->mb_width ==0){ + h->mb_id+=(spe->spe_total-1)*s->mb_width; + } + wait_dma_id(MBD_buf1); + + h->mc = &mc_buf[0]; + //h->mc_idx = (h->mc_idx+1)%2; + //if (h->n_mc < h->mb_total){ + H264Mb *mb = &mb_buf[0]; + H264mc *mc = &mc_buf[0]; + if(!IS_INTRA(mb->mb_type)){ + calc_mc_params(mb, mc); + fill_ref_buf(h, mb, mc); + } + //h->n_mc++; + /*if (h->n_mc%s->mb_width ==0){ + h->n_mc+=(spe->spe_total-1)*s->mb_width; + }*/ +// wait_dma_id(MBD_mc_buf1); + +// h->next_mb_idx++; +// if (h->next_mb_idx % s->mb_width ==0){ +// h->next_mb_idx+=(spe->spe_total-1)*s->mb_width; +// h->curr_line+=spe->spe_total; +// } + +// h->mb_mc = (h->mb_mc+1)%3; + next_mb = &mb_buf[0]; +// h->mb_dec = (h->mb_dec+1)%3; + return next_mb; +} + + +#undef TAG_OFFSET_MB +#undef TAG_OFFSET_MC +static inline int dep_resolved(H264Context_spu *h){ + H264slice *s = h->s; + int spe_id = h->spe.spe_id; + volatile int mb_proc_dep = src_spe.count; + if (spe_id==0) + return (h->mb_proc < mb_proc_dep-1 +s->mb_width)? 1:0; + else + return (h->mb_proc < mb_proc_dep-1)? 
1:0; +} + +void update_tgt_spe_dep(H264Context_spu *h, int end){ + H264Mb *mb = h->mb; + H264slice *s = h->s; + H264spe *spe = &h->spe; + int mb_x = mb->mb_x; + + if (end || (mb_x%2==0 && mb_x!=0) || mb_x==s->mb_width-1){ + spe_pos* dma_spe = &dma_temp; + spe_pos* tgt_spe = (spe_pos*) ((unsigned) spe->tgt_spe + (unsigned) &src_spe); //located in target spe local store + dma_spe->count = end? h->mb_proc+1: h->mb_proc; + spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), MBD_put); + } + h->mb_proc++; +} + + +int main(unsigned long long id, unsigned long long argp) +{ + (void) id; + H264Context_spu* h = &h_context; + H264spe *spe_params = (H264spe *) (unsigned) argp; + + spu_dma_get(&h->spe, (unsigned) spe_params, sizeof(H264spe), MBD_slice); //ID_slice is used out of convienience + wait_dma_id(MBD_slice); + + //clear_statistic(); + dsputil_h264_init_cell(&h->dsp); + ff_cropTbl_init(); + init_pred_ptrs(&h->hpc); + + //send slice_buf to ppe + spu_write_out_mbox((unsigned) h->slice_buf); + h->sl_idx=0; + // initialize tracing with paraver + //trace_init_SPU(); + h->frames =0; + src_spe.count =0; + h->mb_proc = 0; + + h->mb_id=0; + h->mc_idx=0; + h->mb_dec=0; + h->mb_mc=0; + h->mb_dma=0; + h->next_mb_idx=0; + + h->blocking=0; + + + H264spe* p = &h->spe; + h->curr_line =p->spe_id; + h->mb_total = p->mb_height*p->mb_width; + int stride_y = 32; + int stride_c = 16; + //init block_offset array + init_block_offset(stride_y, stride_c); + for(;;){ + spu_read_in_mbox(); + + h->s = &h->slice_buf[h->sl_idx]; + h->sl_idx++; h->sl_idx%=2; + + if (h->s->state< 0){ + break; + } + + { + if(!h->blocking){ + init_mb_buffer(h); + while((h->mb=(H264Mb *)get_next_mb(h))){ + while(!dep_resolved(h)); + //printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p- >spe_id); + hl_decode_mb_internal(h, stride_y, stride_c); + } + update_tgt_spe_dep(h, 1); + }else{ + h->mb_id=0; + while((h->mb=(H264Mb *)get_next_mb_blocking(h))){ + while(!dep_resolved(h)); + 
//printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p- >spe_id); + hl_decode_mb_internal(h, stride_y, stride_c); + } + update_tgt_spe_dep(h, 1); + } + + } + + h->frames++; + + if (p->spe_id == ((h->frames*p->mb_height -1)%p->spe_total)){ + //printf("spe %d, %d\n", atomic_read(p->rl_cnt), h->frames); + //MBSlice is copied beforehand. + //only inc cnt. + atomic_inc(p->rl_cnt); + } + { + atomic_dec(p->cnt); + } + } + + return 0; +} + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/cell/types_spu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/cell/types_spu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2006 Guillaume Poirier + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef TYPES_SPU_H +#define TYPES_SPU_H + +/*********************************************************************** + * Scalar types + **********************************************************************/ + typedef signed char int8_t; + typedef signed short int16_t; + typedef signed int int32_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + typedef unsigned long long uint64_t; + +// typedef short DCTELEM; // transform coeficients of dct + +/*********************************************************************** + * Vector types + **********************************************************************/ + typedef vector signed int vsint32_t; + typedef vector unsigned int vuint32_t; + typedef vector signed short vsint16_t; + typedef vector unsigned short vuint16_t; + typedef vector signed char vsint8_t; + typedef vector unsigned char vuint8_t; + +/*********************************************************************** + * Functions + **********************************************************************/ + typedef void (*qpel_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h); + typedef void (*h264_chroma_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h, int x, int y); + typedef void (*h264_idct_func)(uint8_t *dst, short *block, int stride); + typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); + typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd, + int weights, int offset); + typedef void(* intra_pred4x4)(uint8_t *src, uint8_t *topright, int stride); + typedef void(* intra_pred16x16)(uint8_t *src, int stride); + typedef void(* 
intra_pred8x8)(uint8_t *src, int stride); + typedef void(* intra_pred8x8l)(uint8_t *src, int topleft, int topright, int stride); + + +#define AVV(x...) {x} + + +#endif // AVCODEC_TYPES_SPU_H + + + + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/dsputil.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/dsputil.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,1057 @@ +/* + * DSP utils + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * DSP utils + */ + +#include "libavutil/log.h" +#include "dsputil.h" +#include "simple_idct.h" +#include "mathops.h" +#include "config.h" + +uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; +uint32_t ff_squareTbl[512] = {0, }; + +const uint8_t ff_zigzag_direct[64] = { + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63 +}; + + +#define PIXOP2(OPNAME, OP) \ +static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + int i;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + l1= (c&0x03030303UL)\ + + (d&0x03030303UL);\ + h1= ((c&0xFCFCFCFCUL)>>2)\ + + ((d&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + a= AV_RN32(&src1[i*src_stride1+4]);\ + b= AV_RN32(&src2[i*src_stride2+4]);\ + c= AV_RN32(&src3[i*src_stride3+4]);\ + d= AV_RN32(&src4[i*src_stride4+4]);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + l1= (c&0x03030303UL)\ + + (d&0x03030303UL);\ + h1= ((c&0xFCFCFCFCUL)>>2)\ + + ((d&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + }\ +}\ +\ +static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ +}\ +\ +static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, 
line_size, line_size, h);\ +}\ +\ +static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ +}\ +\ +static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ +}\ +\ +static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ + int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ + int i;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + l1= (c&0x03030303UL)\ + + (d&0x03030303UL);\ + h1= ((c&0xFCFCFCFCUL)>>2)\ + + ((d&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + a= AV_RN32(&src1[i*src_stride1+4]);\ + b= AV_RN32(&src2[i*src_stride2+4]);\ + c= AV_RN32(&src3[i*src_stride3+4]);\ + d= AV_RN32(&src4[i*src_stride4+4]);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x01010101UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + l1= (c&0x03030303UL)\ + + (d&0x03030303UL);\ + h1= ((c&0xFCFCFCFCUL)>>2)\ + + ((d&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + }\ +}\ +static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ + int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ + OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ + OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ +}\ +static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ + int dst_stride, 
int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ + OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ + OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ +}\ +\ +static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i, a0, b0, a1, b1;\ + a0= pixels[0];\ + b0= pixels[1] + 2;\ + a0 += b0;\ + b0 += pixels[2];\ +\ + pixels+=line_size;\ + for(i=0; i>2; /* FIXME non put */\ + block[1]= (b1+b0)>>2;\ +\ + pixels+=line_size;\ + block +=line_size;\ +\ + a0= pixels[0];\ + b0= pixels[1] + 2;\ + a0 += b0;\ + b0 += pixels[2];\ +\ + block[0]= (a1+a0)>>2;\ + block[1]= (b1+b0)>>2;\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + const uint32_t a= AV_RN32(pixels );\ + const uint32_t b= AV_RN32(pixels+1);\ + uint32_t l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + uint32_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + a= AV_RN32(pixels );\ + b= AV_RN32(pixels+1);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int j;\ + for(j=0; j<2; j++){\ + int i;\ + const uint32_t a= AV_RN32(pixels );\ + const uint32_t b= AV_RN32(pixels+1);\ + uint32_t l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + 
+ 0x02020202UL;\ + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + uint32_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + a= AV_RN32(pixels );\ + b= AV_RN32(pixels+1);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ + pixels+=4-line_size*(h+1);\ + block +=4-line_size*h;\ + }\ +}\ +\ +static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int j;\ + for(j=0; j<2; j++){\ + int i;\ + const uint32_t a= AV_RN32(pixels );\ + const uint32_t b= AV_RN32(pixels+1);\ + uint32_t l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x01010101UL;\ + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + uint32_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + a= AV_RN32(pixels );\ + b= AV_RN32(pixels+1);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x01010101UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ + pixels+=4-line_size*(h+1);\ + block +=4-line_size*h;\ + }\ +}\ +\ +CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\ + +#define op_avg(a, b) a = rnd_avg32(a, b) + +#define op_put(a, b) a = b + +PIXOP2(avg, op_avg) +PIXOP2(put, op_put) +#undef op_avg +#undef op_put + + +#define H264_CHROMA_MC(OPNAME, OP)\ +static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ + const int A=(8-x)*(8-y);\ + const int B=( x)*(8-y);\ + 
const int C=(8-x)*( y);\ + const int D=( x)*( y);\ + int i;\ + \ + assert(x<8 && y<8 && x>=0 && y>=0);\ +\ + if(D){\ + for(i=0; i=0 && y>=0);\ +\ + if(D){\ + for(i=0; i=0 && y>=0);\ +\ + if(D){\ + for(i=0; i>6)+1)>>1) +#define op_put(a, b) a = (((b) + 32)>>6) + +H264_CHROMA_MC(put_ , op_put) +H264_CHROMA_MC(avg_ , op_avg) +#undef op_avg +#undef op_put + + +#define H264_LOWPASS(OPNAME, OP, OP2) \ +static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + const int h=2;\ + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ + int i;\ + for(i=0; i>5]+1)>>1) +#define op_put(a, b) a = cm[((b) + 16)>>5] +#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1) +#define op2_put(a, b) a = cm[((b) + 512)>>10] + +H264_LOWPASS(put_ , op_put, op2_put) +H264_LOWPASS(avg_ , op_avg, op2_avg) +H264_MC(put_, 2) +H264_MC(put_, 4) +H264_MC(put_, 8) +H264_MC(put_, 16) +H264_MC(avg_, 4) +H264_MC(avg_, 8) +H264_MC(avg_, 16) + +#undef op_avg +#undef op_put +#undef op2_avg +#undef op2_put + +static void clear_block_c(DCTELEM *block) +{ + memset(block, 0, sizeof(DCTELEM)*64); +} + +/** + * memset(blocks, 0, sizeof(DCTELEM)*6*64) + */ +static void clear_blocks_c(DCTELEM *blocks) +{ + memset(blocks, 0, sizeof(DCTELEM)*6*64); +} + +static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; } + +/* init static data */ +av_cold void dsputil_static_init(void) +{ + int i; + + for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; + for(i=0;i= 4.2.\n" + "Do not report crashes to FFmpeg developers.\n"); +#endif + did_fail=1; + } + return -1; + } + return 0; +} + +av_cold void dsputil_init(DSPContext* c) +{ + (void) avg_pixels2_c; // kill a warning, avg_pixels2_c is a macro created function. 
+ ff_check_alignment(); + dsputil_static_init(); + + c->idct_put= ff_simple_idct_put; + c->idct_add= ff_simple_idct_add; + c->idct = ff_simple_idct; + + c->clear_block = clear_block_c; + c->clear_blocks = clear_blocks_c; + +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \ + c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \ + c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \ + c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \ + c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \ + c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \ + c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \ + c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \ + c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \ + c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \ + c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \ + c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \ + c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \ + c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \ + c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \ + c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c + + + dspfunc(put_h264_qpel, 0, 16); + dspfunc(put_h264_qpel, 1, 8); + dspfunc(put_h264_qpel, 2, 4); + dspfunc(put_h264_qpel, 3, 2); + dspfunc(avg_h264_qpel, 0, 16); + dspfunc(avg_h264_qpel, 1, 8); + dspfunc(avg_h264_qpel, 2, 4); + +#undef dspfunc + c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c; + c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c; + c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c; + c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c; + c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c; + c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c; + + + c->prefetch= just_return; + + if (HAVE_MMX) dsputil_init_mmx (c); + if (ARCH_ARM) dsputil_init_arm (c); + if (HAVE_ALTIVEC) dsputil_init_ppc (c); //fixme PPC prefetch +} + diff 
-r 11d15c47beaf -r 897f711a7157 libavcodec/dsputil.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/dsputil.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,465 @@ +/* + * DSP utils + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * DSP utils. 
+ * note, many functions in here may use MMX which trashes the FPU state, it is + * absolutely necessary to call emms_c() between dsp & float/double code + */ + +#ifndef AVCODEC_DSPUTIL_H +#define AVCODEC_DSPUTIL_H + +#include "libavutil/intreadwrite.h" +#include "avcodec.h" +#include "h264_idct.h" +// +void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, + const float *win, float add_bias, int len); +void ff_float_to_int16_c(int16_t *dst, const float *src, long len); +void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels); + +/* encoding scans */ +extern const uint8_t ff_alternate_horizontal_scan[64]; +extern const uint8_t ff_alternate_vertical_scan[64]; +extern const uint8_t ff_zigzag_direct[64]; +extern const uint8_t ff_zigzag248_direct[64]; + +/* pixel operations */ +#define MAX_NEG_CROP 1024 + +/* temporary */ +extern uint32_t ff_squareTbl[512]; +extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP]; + +/* VP3 DSP functions */ +void ff_vp3_idct_c(DCTELEM *block/* align 16*/); +void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); +void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); +void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/); + +void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values); +void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values); + +/* VP6 DSP functions */ +void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride, + const int16_t *h_weights, const int16_t *v_weights); + +/* Bink functions */ +void ff_bink_idct_c (DCTELEM *block); +void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block); +void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block); + +/* CAVS functions */ +void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride); +void ff_avg_cavs_qpel8_mc00_c(uint8_t 
*dst, uint8_t *src, int stride); +void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride); +void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride); + +/* VC1 functions */ +void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd); +void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd); + +/* EA functions */ +void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block); + +/* 1/2^n downscaling functions from imgconvert.c */ +void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); +void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); +void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); +void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); + +void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height); + +/* minimum alignment rules ;) +If you notice errors in the align stuff, need more alignment for some ASM code +for some CPU or need to use a function with less aligned data then send a mail +to the ffmpeg-devel mailing list, ... + +!warning These alignments might not match reality, (missing attribute((align)) +stuff somewhere possible). +I (Michael) did not check them, these are just the alignments which I think +could be reached easily ... 
+ +!future video codecs might need functions with less strict alignment +*/ + +/* +void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size); +void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride); +void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size); +void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size); +void clear_blocks_c(DCTELEM *blocks); +*/ + +/* add and put pixel (decoding) */ +// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16 +//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller then 4 +typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h); +typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h); +typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride); +typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); + +typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h); + +#define DEF_OLD_QPEL(name)\ +void ff_put_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\ +void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\ +void ff_avg_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride); + +DEF_OLD_QPEL(qpel16_mc11_old_c) +DEF_OLD_QPEL(qpel16_mc31_old_c) +DEF_OLD_QPEL(qpel16_mc12_old_c) +DEF_OLD_QPEL(qpel16_mc32_old_c) +DEF_OLD_QPEL(qpel16_mc13_old_c) +DEF_OLD_QPEL(qpel16_mc33_old_c) +DEF_OLD_QPEL(qpel8_mc11_old_c) +DEF_OLD_QPEL(qpel8_mc31_old_c) +DEF_OLD_QPEL(qpel8_mc12_old_c) +DEF_OLD_QPEL(qpel8_mc32_old_c) +DEF_OLD_QPEL(qpel8_mc13_old_c) +DEF_OLD_QPEL(qpel8_mc33_old_c) + +#define 
CALL_2X_PIXELS(a, b, n)\ +static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + b(block , pixels , line_size, h);\ + b(block+n, pixels+n, line_size, h);\ +} + +/* motion estimation */ +// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller then 2 +// although currently h<4 is not used as functions with width <8 are neither used nor implemented +typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/; + +/** + * Scantable. + */ +typedef struct ScanTable{ + const uint8_t *scantable; + uint8_t permutated[64]; + uint8_t raster_end[64]; +#if ARCH_PPC + /** Used by dct_quantize_altivec to find last-non-zero */ + DECLARE_ALIGNED(16, uint8_t, inverse)[64]; +#endif +} ScanTable; + +void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable); + +void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, + int block_w, int block_h, + int src_x, int src_y, int w, int h); + + +/** + * DSPContext. 
+ */ +typedef struct DSPContext { + /* pixel ops : interface with DCT */ + void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size); + void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride); + void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); + void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); + void (*put_pixels_nonclamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); + void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); + void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size); + void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size); + + void (*clear_block)(DCTELEM *block/*align 16*/); + void (*clear_blocks)(DCTELEM *blocks/*align 16*/); + + + /** + * Halfpel motion compensation with rounding (a+b+1)>>1. + * this is an array[4][4] of motion compensation functions for 4 + * horizontal blocksizes (8,16) and the 4 halfpel positions
+ * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] + * @param block destination where the result is stored + * @param pixels source + * @param line_size number of bytes in a horizontal line of block + * @param h height + */ + op_pixels_func put_pixels_tab[4][4]; + + /** + * Halfpel motion compensation with rounding (a+b+1)>>1. + * This is an array[4][4] of motion compensation functions for 4 + * horizontal blocksizes (8,16) and the 4 halfpel positions
+ * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] + * @param block destination into which the result is averaged (a+b+1)>>1 + * @param pixels source + * @param line_size number of bytes in a horizontal line of block + * @param h height + */ + op_pixels_func avg_pixels_tab[4][4]; + + /** + * Halfpel motion compensation with no rounding (a+b)>>1. + * this is an array[2][4] of motion compensation functions for 2 + * horizontal blocksizes (8,16) and the 4 halfpel positions
+ * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] + * @param block destination where the result is stored + * @param pixels source + * @param line_size number of bytes in a horizontal line of block + * @param h height + */ + op_pixels_func put_no_rnd_pixels_tab[4][4]; + + /** + * Halfpel motion compensation with no rounding (a+b)>>1. + * this is an array[2][4] of motion compensation functions for 2 + * horizontal blocksizes (8,16) and the 4 halfpel positions
+ * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] + * @param block destination into which the result is averaged (a+b)>>1 + * @param pixels source + * @param line_size number of bytes in a horizontal line of block + * @param h height + */ + op_pixels_func avg_no_rnd_pixels_tab[4][4]; + + void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h); + + + qpel_mc_func put_qpel_pixels_tab[2][16]; + qpel_mc_func avg_qpel_pixels_tab[2][16]; + qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16]; + qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16]; + qpel_mc_func put_mspel_pixels_tab[8]; + + /** + * h264 Chroma MC + */ + h264_chroma_mc_func put_h264_chroma_pixels_tab[3]; + h264_chroma_mc_func avg_h264_chroma_pixels_tab[3]; + /* This is really one func used in VC-1 decoding */ + h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3]; + h264_chroma_mc_func avg_no_rnd_vc1_chroma_pixels_tab[3]; + + qpel_mc_func put_h264_qpel_pixels_tab[4][16]; + qpel_mc_func avg_h264_qpel_pixels_tab[4][16]; + + qpel_mc_func put_2tap_qpel_pixels_tab[4][16]; + qpel_mc_func avg_2tap_qpel_pixels_tab[4][16]; + + + /* (I)DCT */ + void (*fdct)(DCTELEM *block/* align 16*/); + void (*fdct248)(DCTELEM *block/* align 16*/); + + /* IDCT really*/ + void (*idct)(DCTELEM *block/* align 16*/); + + /** + * block -> idct -> clip to unsigned 8 bit -> dest. + * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...) + * @param line_size size in bytes of a horizontal line of dest + */ + void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); + + /** + * block -> idct -> add dest -> clip to unsigned 8 bit -> dest. 
+ * @param line_size size in bytes of a horizontal line of dest + */ + void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); + + void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w); +#define EDGE_WIDTH 32 + + void (*prefetch)(void *mem, int stride, int h); + +} DSPContext; + +void dsputil_static_init(void); +void dsputil_init(DSPContext* p); + +int ff_check_alignment(void); + +/** + * permute block according to permuatation. + * @param last last non zero element in scantable order + */ +void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last); + +void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type); + +#define BYTE_VEC32(c) ((c)*0x01010101UL) + +static inline uint32_t rnd_avg32(uint32_t a, uint32_t b) +{ + return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1); +} + +static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b) +{ + return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1); +} + + +/** + * Empty mmx state. + * this must be called between any dsp function and float/double code. 
+ * for example sin(); dsp->idct_put(); emms_c(); cos() + */ +#define emms_c() + +/* should be defined by architectures supporting + one or more MultiMedia extension */ +int mm_support(void); +extern int mm_flags; + +void dsputil_init_arm(DSPContext* c); +void dsputil_init_mmx(DSPContext* c); +void dsputil_init_ppc(DSPContext* c); + +void ff_dsputil_init_dwt(DSPContext *c); + +#if HAVE_MMX + +#undef emms_c + +static inline void emms(void) +{ + __asm__ volatile ("emms;":::"memory"); +} + + +#define emms_c() \ +{\ + if (mm_flags & FF_MM_MMX)\ + emms();\ +} + +#elif ARCH_ARM + +#if HAVE_NEON +# define STRIDE_ALIGN 16 +#endif + +#elif ARCH_PPC || ARCH_PPC64 || ARCH_CELL + +#define STRIDE_ALIGN 16 + +#endif + +#ifndef STRIDE_ALIGN +# define STRIDE_ALIGN 8 +#endif + +#define WRAPPER8_16(name8, name16)\ +static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\ + return name8(s, dst , src , stride, h)\ + +name8(s, dst+8 , src+8 , stride, h);\ +} + +#define WRAPPER8_16_SQ(name8, name16)\ +static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\ + int score=0;\ + score +=name8(s, dst , src , stride, 8);\ + score +=name8(s, dst+8 , src+8 , stride, 8);\ + if(h==16){\ + dst += 8*stride;\ + src += 8*stride;\ + score +=name8(s, dst , src , stride, 8);\ + score +=name8(s, dst+8 , src+8 , stride, 8);\ + }\ + return score;\ +} + +static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h) +{ + int i; + for(i=0; i + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * bitstream reader API header. + */ + +#ifndef AVCODEC_GET_BITS_H +#define AVCODEC_GET_BITS_H + +#include +#include +#include +#include "libavutil/bswap.h" +#include "libavutil/common.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/log.h" +#include "mathops.h" + + +typedef struct GetBitContext { + uint8_t *rbsp; + unsigned int rbsp_size; + uint8_t *raw; + const uint8_t *buffer, *buffer_end; + unsigned int alloc_size; + unsigned int buf_size; + uint32_t *buffer_ptr; + uint32_t cache0; + uint32_t cache1; + int bit_count; + int size_in_bits; +} GetBitContext; + +/* Bitstream reader API docs: +name + arbitrary name which is used as prefix for the internal variables + +gb + getbitcontext + +OPEN_READER(name, gb) + loads gb into local variables + +CLOSE_READER(name, gb) + stores local vars in gb + +UPDATE_CACHE(name, gb) + refills the internal cache from the bitstream + after this call at least MIN_CACHE_BITS will be available, + +GET_CACHE(name, gb) + will output the contents of the internal cache, next bit is MSB of 32 or 64 bit (FIXME 64bit) + +SHOW_UBITS(name, gb, num) + will return the next num bits + +SHOW_SBITS(name, gb, num) + will return the next num bits and do sign extension + +SKIP_BITS(name, gb, num) + will skip over the next num bits + note, this is equivalent to SKIP_CACHE; SKIP_COUNTER + +SKIP_CACHE(name, gb, num) + will remove the next num bits from the cache (note SKIP_COUNTER MUST be called before UPDATE_CACHE / CLOSE_READER) + +SKIP_COUNTER(name, gb, 
num) + will increment the internal bit counter (see SKIP_CACHE & SKIP_BITS) + +LAST_SKIP_CACHE(name, gb, num) + will remove the next num bits from the cache if it is needed for UPDATE_CACHE otherwise it will do nothing + +LAST_SKIP_BITS(name, gb, num) + is equivalent to LAST_SKIP_CACHE; SKIP_COUNTER + +for examples see get_bits, show_bits, skip_bits, get_vlc +*/ + +#define MIN_CACHE_BITS 32 + +#define OPEN_READER(name, gb)\ + int name##_bit_count=(gb)->bit_count;\ + uint32_t name##_cache0= (gb)->cache0;\ + uint32_t name##_cache1= (gb)->cache1;\ + uint32_t * name##_buffer_ptr=(gb)->buffer_ptr;\ + +#define CLOSE_READER(name, gb)\ + (gb)->bit_count= name##_bit_count;\ + (gb)->cache0= name##_cache0;\ + (gb)->cache1= name##_cache1;\ + (gb)->buffer_ptr= name##_buffer_ptr;\ + +#define UPDATE_CACHE(name, gb)\ + if(name##_bit_count > 0){\ + const uint32_t next= be2me_32( *name##_buffer_ptr );\ + name##_cache0 |= NEG_USR32(next,name##_bit_count);\ + name##_cache1 |= next<buffer_ptr - s->buffer)*8 - 32 + s->bit_count; +} + +static inline void skip_bits_long(GetBitContext *s, int n){ + OPEN_READER(re, s) + re_bit_count += n; + re_buffer_ptr += re_bit_count>>5; + re_bit_count &= 31; + re_cache0 = be2me_32( re_buffer_ptr[-1] ) << re_bit_count; + re_cache1 = 0; + UPDATE_CACHE(re, s) + CLOSE_READER(re, s) +} + +/** + * read mpeg1 dc style vlc (sign bit + mantisse with no MSB). 
+ * if MSB not set it is negative + * @param n length in bits + * @author BERO + */ +static inline int get_xbits(GetBitContext *s, int n){ + register int sign; + register int32_t cache; + OPEN_READER(re, s) + UPDATE_CACHE(re, s) + cache = GET_CACHE(re,s); + sign=(~cache)>>31; + LAST_SKIP_BITS(re, s, n) + CLOSE_READER(re, s) + return (NEG_USR32(sign ^ cache, n) ^ sign) - sign; +} + +static inline int get_sbits(GetBitContext *s, int n){ + register int tmp; + OPEN_READER(re, s) + UPDATE_CACHE(re, s) + tmp= SHOW_SBITS(re, s, n); + LAST_SKIP_BITS(re, s, n) + CLOSE_READER(re, s) + return tmp; +} + +/** + * reads 1-17 bits. + * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't + */ +static inline unsigned int get_bits(GetBitContext *s, int n){ + register int tmp; + OPEN_READER(re, s) + UPDATE_CACHE(re, s) + tmp= SHOW_UBITS(re, s, n); + LAST_SKIP_BITS(re, s, n) + CLOSE_READER(re, s) + return tmp; +} + +/** + * shows 1-17 bits. + * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't + */ +static inline unsigned int show_bits(GetBitContext *s, int n){ + register int tmp; + OPEN_READER(re, s) + UPDATE_CACHE(re, s) + tmp= SHOW_UBITS(re, s, n); +// CLOSE_READER(re, s) + return tmp; +} + +static inline void skip_bits(GetBitContext *s, int n){ + //Note gcc seems to optimize this to s->index+=n for the ALT_READER :)) + OPEN_READER(re, s) + UPDATE_CACHE(re, s) + LAST_SKIP_BITS(re, s, n) + CLOSE_READER(re, s) +} + +static inline unsigned int get_bits1(GetBitContext *s){ + return get_bits(s, 1); +} + +static inline unsigned int show_bits1(GetBitContext *s){ + return show_bits(s, 1); +} + +static inline void skip_bits1(GetBitContext *s){ + skip_bits(s, 1); +} + +/** + * reads 0-32 bits. 
+ */ +static inline unsigned int get_bits_long(GetBitContext *s, int n){ + if(n<=MIN_CACHE_BITS) return get_bits(s, n); + else{ + int ret= get_bits(s, 16) << (n-16); + return ret | get_bits(s, n-16); + } +} + +/** + * reads 0-32 bits as a signed integer. + */ +static inline int get_sbits_long(GetBitContext *s, int n) { + return sign_extend(get_bits_long(s, n), n); +} + +/** + * shows 0-32 bits. + */ +static inline unsigned int show_bits_long(GetBitContext *s, int n){ + if(n<=MIN_CACHE_BITS) return show_bits(s, n); + else{ + GetBitContext gb= *s; + return get_bits_long(&gb, n); + } +} + +static inline int check_marker(GetBitContext *s, const char *msg) +{ + int bit= get_bits1(s); + if(!bit) + av_log(AV_LOG_INFO, "Marker bit missing %s\n", msg); + + return bit; +} + +/** + * init GetBitContext. + * @param buffer bitstream buffer, must be FF_INPUT_BUFFER_PADDING_SIZE bytes larger then the actual read bits + * because some optimized bitstream readers read 32 or 64 bit at once and could read over the end + * @param bit_size the size of the buffer in bits + * + * While GetBitContext stores the buffer size, for performance reasons you are + * responsible for checking for the buffer end yourself (take advantage of the padding)! + */ +static inline void init_get_bits(GetBitContext *s, + const uint8_t *buffer, int bit_size) +{ + int buffer_size= (bit_size+7)>>3; + if(buffer_size < 0 || bit_size < 0) { + buffer_size = bit_size = 0; + buffer = NULL; + } + + s->buffer= buffer; + s->size_in_bits= bit_size; + s->buffer_end= buffer + buffer_size; + + s->buffer_ptr = (uint32_t*)((intptr_t)buffer&(~3)); + s->bit_count = 32 + 8*((intptr_t)buffer&3); + skip_bits_long(s, 0); +} + +static inline void align_get_bits(GetBitContext *s) +{ + int n= (-get_bits_count(s)) & 7; + if(n) skip_bits(s, n); +} + +#define tprintf(p, ...) 
{} + +static inline int get_bits_left(GetBitContext *gb) +{ + return gb->size_in_bits - get_bits_count(gb); +} + +#endif /* AVCODEC_GET_BITS_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/golomb.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/golomb.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,184 @@ +/* + * exp golomb vlc stuff + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * @brief + * exp golomb vlc stuff + * @author Michael Niedermayer + */ + +#include "libavutil/common.h" + +const uint8_t ff_log2_tab[256]={ + 0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 +}; + +const uint8_t ff_golomb_vlc_len[512]={ +14,13,12,12,11,11,11,11,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 
+7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, +5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, +5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; + +const uint8_t ff_ue_golomb_vlc_code[512]={ +31,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30, + 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +const int8_t ff_se_golomb_vlc_code[512]={ + 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 8, -8, 9, -9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15, + 4, 4, 4, 4, -4, -4, -4, -4, 5, 5, 5, 5, -5, -5, -5, -5, 6, 6, 6, 6, -6, -6, -6, -6, 7, 7, 7, 7, -7, -7, -7, -7, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + + +const uint8_t ff_ue_golomb_len[256]={ + 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,11, +11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,13, +13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, +13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,15, +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,17, +}; + +const uint8_t ff_interleaved_golomb_vlc_len[256]={ +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +}; + +const uint8_t ff_interleaved_ue_golomb_vlc_code[256]={ + 15,16,7, 7, 17,18,8, 8, 3, 3, 3, 3, 3, 3, 3, 3, + 19,20,9, 9, 21,22,10,10,4, 4, 4, 4, 4, 4, 4, 4, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 23,24,11,11,25,26,12,12,5, 5, 5, 5, 5, 5, 5, 5, + 27,28,13,13,29,30,14,14,6, 6, 6, 6, 6, 6, 6, 6, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +const int8_t ff_interleaved_se_golomb_vlc_code[256]={ + 8, -8, 4, 4, 9, -9, -4, -4, 2, 2, 2, 2, 2, 2, 2, 2, + 10,-10, 5, 5, 11,-11, -5, -5, -2, -2, -2, -2, -2, -2, -2, -2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 12,-12, 6, 6, 13,-13, -6, -6, 3, 3, 3, 3, 3, 3, 3, 3, + 14,-14, 7, 7, 15,-15, -7, -7, -3, -3, -3, -3, -3, -3, -3, -3, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +const uint8_t ff_interleaved_dirac_golomb_vlc_code[256]={ +0, 1, 0, 0, 2, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, +4, 5, 2, 2, 6, 7, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +8, 9, 4, 4, 10,11,5, 5, 2, 2, 2, 2, 2, 2, 2, 2, +12,13,6, 6, 14,15,7, 7, 3, 3, 3, 3, 3, 3, 3, 3, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,}; diff -r 11d15c47beaf -r 897f711a7157 libavcodec/golomb.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/golomb.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,410 @@ +/* + * exp golomb vlc stuff + * Copyright (c) 2003 Michael Niedermayer + * Copyright (c) 2004 Alex Beregszaszi + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * @brief + * exp golomb vlc stuff + * @author Michael Niedermayer and Alex Beregszaszi + */ + +#ifndef AVCODEC_GOLOMB_H +#define AVCODEC_GOLOMB_H + +#include +#include "get_bits.h" + +#define INVALID_VLC 0x80000000 /* error sentinel; svq3_get_se_golomb() returns it when (buf & 0xAAAAAAAA) == 0 */ + +extern const uint8_t ff_golomb_vlc_len[512]; /* the 512-entry tables are indexed below with buf >> (32 - 9) */ +extern const uint8_t ff_ue_golomb_vlc_code[512]; +extern const int8_t ff_se_golomb_vlc_code[512]; +extern const uint8_t ff_ue_golomb_len[256]; + +extern const uint8_t ff_interleaved_golomb_vlc_len[256]; /* the 256-entry interleaved tables are indexed below with buf >> (32 - 8) */ +extern const uint8_t ff_interleaved_ue_golomb_vlc_code[256]; +extern const int8_t ff_interleaved_se_golomb_vlc_code[256]; +extern const uint8_t ff_interleaved_dirac_golomb_vlc_code[256]; + + + /** + * read unsigned exp golomb code. when buf >= 1<<27 both the number of bits to skip and the decoded value come from the 512-entry tables; otherwise the shift is derived from av_log2_c(buf) and 32 - log bits are skipped. NOTE(review): buf == 0 reaches av_log2_c() without a check - confirm callers cannot hit this. + */ +static inline int get_ue_golomb(GetBitContext *gb){ + unsigned int buf; + int log; + + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf=GET_CACHE(re, gb); /* assumes GET_CACHE() yields 32 left-aligned bits - TODO confirm in get_bits.h */ + + if(buf >= (1<<27)){ /* table path */ + buf >>= 32 - 9; + LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); + CLOSE_READER(re, gb); + + return ff_ue_golomb_vlc_code[buf]; + }else{ /* computed path for values of buf below 1<<27 */ + log= 2*av_log2_c(buf) - 31; + buf>>= log; + buf--; /* stored value is the shifted cache minus one */ + LAST_SKIP_BITS(re, gb, 32 - log); + CLOSE_READER(re, gb); + + return buf; + } +} + + /** + * read unsigned exp golomb code, constraint to a max of 31. + * the return value is undefined if the stored value exceeds 31. 
+ */ +static inline int get_ue_golomb_31(GetBitContext *gb){ /* table-only variant of get_ue_golomb(): always indexes the 512-entry tables, there is no computed branch */ + unsigned int buf; + + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf=GET_CACHE(re, gb); + + buf >>= 32 - 9; + LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); + CLOSE_READER(re, gb); + + return ff_ue_golomb_vlc_code[buf]; +} + +static inline int svq3_get_ue_golomb(GetBitContext *gb){ /* read an unsigned code using the ff_interleaved_* tables; presumably the interleaved exp golomb layout of SVQ3/Dirac - confirm against the format spec */ + uint32_t buf; + + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf=GET_CACHE(re, gb); + + if(buf&0xAA800000){ /* any of bits 31,29,27,25,23 set: resolve with a single table lookup */ + buf >>= 32 - 8; + LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]); + CLOSE_READER(re, gb); + + return ff_interleaved_ue_golomb_vlc_code[buf]; + }else{ + int ret = 1; /* accumulator; 1 is subtracted again before returning */ + + while (1) { /* consume the code 8 cache bits at a time */ + buf >>= 32 - 8; + LAST_SKIP_BITS(re, gb, FFMIN(ff_interleaved_golomb_vlc_len[buf], 8)); /* never skip more than the 8 bits examined */ + + if (ff_interleaved_golomb_vlc_len[buf] != 9){ /* a table length other than 9 ends the loop */ + ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1; + ret |= ff_interleaved_dirac_golomb_vlc_code[buf]; + break; + } + ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf]; /* length 9: append the table value in 4 bits and refill */ + UPDATE_CACHE(re, gb); + buf = GET_CACHE(re, gb); + } + + CLOSE_READER(re, gb); + return ret - 1; + } +} + +/** + * read unsigned truncated exp golomb code. returns 0 without reading any bit when range is 1, one inverted bit when range is 2, and a get_ue_golomb() value otherwise. range must be >= 1 (asserted). + */ +static inline int get_te0_golomb(GetBitContext *gb, int range){ + assert(range >= 1); + + if(range==1) return 0; + else if(range==2) return get_bits1(gb)^1; + else return get_ue_golomb(gb); +} + +/** + * read unsigned truncated exp golomb code. same as get_te0_golomb() except that there is no range==1 shortcut, so range 1 also goes through get_ue_golomb(). + */ +static inline int get_te_golomb(GetBitContext *gb, int range){ + assert(range >= 1); + + if(range==2) return get_bits1(gb)^1; + else return get_ue_golomb(gb); +} + + +/** + * read signed exp golomb code. 
+ */ +static inline int get_se_golomb(GetBitContext *gb){ /* when buf >= 1<<27 the value comes from ff_se_golomb_vlc_code; otherwise it is computed and the low bit of the shifted cache selects the sign */ + unsigned int buf; + int log; + + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf=GET_CACHE(re, gb); + + if(buf >= (1<<27)){ /* table path */ + buf >>= 32 - 9; + LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); + CLOSE_READER(re, gb); + + return ff_se_golomb_vlc_code[buf]; + }else{ /* computed path; NOTE(review): buf == 0 reaches av_log2_c() without a check */ + log= 2*av_log2_c(buf) - 31; + buf>>= log; + + LAST_SKIP_BITS(re, gb, 32 - log); + CLOSE_READER(re, gb); + + if(buf&1) buf= -(buf>>1); /* low bit set: negated half */ + else buf= (buf>>1); /* low bit clear: plain half */ + + return buf; + } +} + +static inline int svq3_get_se_golomb(GetBitContext *gb){ /* signed read using the ff_interleaved_* tables; returns INVALID_VLC when the second path finds no bit under the 0xAAAAAAAA mask */ + unsigned int buf; + int log; + + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf=GET_CACHE(re, gb); + + if(buf&0xAA800000){ /* any of bits 31,29,27,25,23 set: resolve with a single table lookup */ + buf >>= 32 - 8; + LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]); + CLOSE_READER(re, gb); + + return ff_interleaved_se_golomb_vlc_code[buf]; + }else{ + LAST_SKIP_BITS(re, gb, 8); + UPDATE_CACHE(re, gb); + buf |= 1 | (GET_CACHE(re, gb) >> 8); /* merge the refilled cache below the 8 bits already skipped and force bit 0 on */ + + if((buf & 0xAAAAAAAA) == 0) + return INVALID_VLC; /* NOTE(review): this exit does not call CLOSE_READER(), unlike the other return paths */ + + for(log=31; (buf & 0x80000000) == 0; log--){ /* loops until bit 31 of buf is set, decrementing log each pass */ + buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30); + } + + LAST_SKIP_BITS(re, gb, 63 - 2*log - 8); /* the - 8 matches the 8 bits skipped at the top of this branch */ + CLOSE_READER(re, gb); + + return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1; + } +} + +static inline int dirac_get_se_golomb(GetBitContext *gb){ /* reads a value with svq3_get_ue_golomb(); when it is non-zero one further bit is read and applied as (ret ^ buf) - buf */ + uint32_t buf; + uint32_t ret; + + ret = svq3_get_ue_golomb(gb); + + if (ret) { /* a zero value is returned as is, with no extra bit read */ + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf = SHOW_SBITS(re, gb, 1); /* presumably 0 or all ones, which would make the expression below a conditional negate - confirm in get_bits.h */ + LAST_SKIP_BITS(re, gb, 1); + ret = (ret ^ buf) - buf; + CLOSE_READER(re, gb); + } + + return ret; +} + +/** + * read unsigned golomb rice code (ffv1). 
+ */ +static inline int get_ur_golomb(GetBitContext *gb, int k, int limit, int esc_len){ + unsigned int buf; + int log; + + OPEN_READER(re, gb); + UPDATE_CACHE(re, gb); + buf=GET_CACHE(re, gb); + + log= av_log2_c(buf); + + if(log > 31-limit){ + buf >>= log - k; + buf += (30-log)<= 32-MIN_CACHE_BITS+(MIN_CACHE_BITS==32) && 32-log < limit){ + buf >>= log - k; + buf += (30-log)<>1; + else return -(v>>1); + +// return (v>>1) ^ -(v&1); +} + +/** + * read signed golomb rice code (flac). + */ +static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit, int esc_len){ + int v= get_ur_golomb_jpegls(gb, k, limit, esc_len); + return (v>>1) ^ -(v&1); +} + +/** + * read unsigned golomb rice code (shorten). + */ +static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k){ + return get_ur_golomb_jpegls(gb, k, INT_MAX, 0); +} + +/** + * read signed golomb rice code (shorten). + */ +static inline int get_sr_golomb_shorten(GetBitContext* gb, int k) +{ + int uvar = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0); + if (uvar & 1) + return ~(uvar >> 1); + else + return uvar >> 1; +} + + + +#ifdef TRACE + +static inline int get_ue(GetBitContext *s, char *file, const char *func, int line){ + int show= show_bits(s, 24); + int pos= get_bits_count(s); + int i= get_ue_golomb(s); + int len= get_bits_count(s) - pos; + int bits= show>>(24-len); + + print_bin(bits, len); + + av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d ue @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line); + + return i; +} + +static inline int get_se(GetBitContext *s, char *file, const char *func, int line){ + int show= show_bits(s, 24); + int pos= get_bits_count(s); + int i= get_se_golomb(s); + int len= get_bits_count(s) - pos; + int bits= show>>(24-len); + + print_bin(bits, len); + + av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d se @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line); + + return i; +} + +static inline int get_te(GetBitContext *s, int r, char *file, const char *func, int line){ + int 
show= show_bits(s, 24); + int pos= get_bits_count(s); + int i= get_te0_golomb(s, r); + int len= get_bits_count(s) - pos; + int bits= show>>(24-len); + + print_bin(bits, len); + + av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d te @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line); + + return i; +} + +#define get_ue_golomb(a) get_ue(a, __FILE__, __PRETTY_FUNCTION__, __LINE__) +#define get_se_golomb(a) get_se(a, __FILE__, __PRETTY_FUNCTION__, __LINE__) +#define get_te_golomb(a, r) get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__) +#define get_te0_golomb(a, r) get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__) + +#endif + + +#endif /* AVCODEC_GOLOMB_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,215 @@ +#include "config.h" +#include "h264.h" +#include "h264_misc.h" +#include + +H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int width, int height, h264_options *opts){ + int i; + const int mb_height = (height + 15) / 16; + const int mb_width = (width + 15) / 16; + const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16 + + ff_init_cabac_states(); + + H264Context *h= av_mallocz(sizeof(H264Context)); + + start_timer(h, TOTAL); + h->file_name = file_name; + h->profile = opts->profile; + for (i=0; itotal_time[i]=0; + + h->ifile=ifile; + h->ofile =ofile; + + h->verbose =opts->verbose; + h->no_mbd =opts->no_mbd; + h->static_3d =opts->static_3d; + h->pipe_bufs = opts->pipe_bufs; + h->slice_bufs = opts->slice_bufs; + + h->ed_ppe_threads =0; + if (opts->ppe_ed){ + h->ed_ppe_threads = (opts->threads >opts->ppe_ed)? 
opts->ppe_ed :opts->threads; + } + + h->threads = opts->threads - h->ed_ppe_threads; + h->smt = opts->smt; + if (h->smt){ + h->threads *= 2; + } + + h->num_frames = opts->numframes; + + h->frame_width = width; + h->frame_height = height; + + while ((width/2) %STRIDE_ALIGN) + width+=STRIDE_ALIGN; + h->width = width; + h->height = mb_height*16; + + h->mb_height = mb_height; + h->mb_width = mb_width; + h->mb_stride = mb_stride; + h->b4_stride = mb_width*4 + 1; + h->b_stride = mb_width*4; + + h->smb_width = opts->smb_size[0]; + h->smb_height = opts->smb_size[1] < h->smb_width ? opts->smb_size[1] : h->smb_width; + h->smbc = getSuperMBContext(h, h->smb_width, h->smb_height); + + h->wave_order = opts->wave_order; + + h->pipe_bufs = opts->pipe_bufs; + + h->max_dpb_cnt = DPB_SIZE + opts->pipe_bufs; + h->free_dpb_cnt = h->max_dpb_cnt; + h->dpb = av_mallocz (h->max_dpb_cnt* sizeof (DecodedPicture)); + + + h->free_sb_cnt = h->threads*opts->slice_bufs + (h->no_mbd != 0) ; //one extra to overlap some latency of signaling/freeing slicebuffers in entropy only mode + h->sb_size = h->free_sb_cnt; + h->sb = av_mallocz(h->sb_size* sizeof(SliceBufferEntry)); + + h->rl_q.size = FFMAX(1, FFMIN( (h->height-3 - 512)/16, h->mb_width/2)) +1; + h->rl_q.free = h->rl_q.size -1; + h->rl_q.ready=0; + h->rl_q.fi = h->rl_q.fo= 0; + h->rl_q.queue = av_malloc(h->rl_q.size* sizeof(RingLineEntry*)); + for (i=0; irl_q.size; i++){ + if( posix_memalign((void**)&h->rl_q.queue[i],64,sizeof(RingLineEntry))) + h->rl_q.queue[i]=NULL; + h->rl_q.queue[i]->top = av_malloc(h->mb_width*sizeof(TopBorder)); + } + + h->rl_q.queue[0]->prev_line = h->rl_q.queue[h->rl_q.size-1]; + for (i=1; irl_q.size; i++){ + h->rl_q.queue[i]->prev_line = h->rl_q.queue[i-1]; + } + + if( HAVE_MMX | HAVE_ALTIVEC| HAVE_NEON ){ + for(i=0; i<16; i++){ + #define T(x) (x>>2) | ((x<<2) & 0xF) + h->zigzag_scan[i] = T(zigzag_scan[i]); + #undef T + } + for(i=0; i<64; i++){ + #define T(x) (x>>3) | ((x&7)<<3) + h->zigzag_scan8x8[i] = 
T(ff_zigzag_direct[i]); + #undef T + } + }else{ + memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t)); + memcpy(h->zigzag_scan8x8, ff_zigzag_direct, 64*sizeof(uint8_t)); + } + + pthread_mutex_init(&h->smb_lock, NULL); + pthread_mutex_init(&h->sdl_lock, NULL); + pthread_cond_init(&h->sdl_cond, NULL); + + ///pthread initialization + pthread_mutex_init(&h->ilock, NULL); + pthread_cond_init(&h->icond, NULL); + pthread_mutex_init(&h->slock, NULL); + pthread_cond_init(&h->scond, NULL); + pthread_mutex_init(&h->tlock, NULL); + pthread_cond_init(&h->tcond, NULL); + pthread_mutex_init(&h->tdlock, NULL); + pthread_cond_init(&h->tdcond, NULL); + h->start =!opts->numamap; //default dont wait for start signal + h->statmbd = opts->statmbd; + h->rl_side_touch= opts->numamap; + h->touch_start=0; + h->setaff =opts->statsched; + h->init_threads=0; + + pthread_mutex_init(&h->task_lock, NULL); + pthread_cond_init(&h->task_cond, NULL); + for (i=0; ilock[i], NULL); + pthread_cond_init (&h->cond[i], NULL); + + pthread_mutex_init (&h->sb_q[i].lock, NULL); + pthread_cond_init (&h->sb_q[i].cond, NULL); + h->sb_q[i].size = h->free_sb_cnt; //change to num threads later + h->sb_q[i].queue = av_malloc(h->free_sb_cnt* sizeof(SliceBufferEntry*)); + h->sb_q[i].cnt = h->sb_q[i].fi = h->sb_q[i].fo =0; + } + +#if HAVE_LIBSDL2 + h->sdlq.size=2; + h->sdlq.ready=2; + h->sdlq.queue = av_malloc(2* sizeof(SDL_Texture*)); + pthread_mutex_init (&h->sdlq.sdl_lock, NULL); + pthread_cond_init (&h->sdlq.sdl_cond, NULL); +#endif + + h->display=opts->display; + h->fullscreen=opts->fullscreen; + + return h; +} + + +void free_h264dec_context(H264Context *h) { + int i; + + for(i=0; imax_dpb_cnt; i++) + free_dp(&h->dpb[i]); + av_free (h->dpb); + + for(i=0; isb_size; i++){ + if (h->sb[i].initialized){ + free_sb_entry(&h->sb[i]); + } + } + av_freep(&h->sb); + + for (i=0; irl_q.size; i++){ + av_freep(&h->rl_q.queue[i]->top); + av_freep(&h->rl_q.queue[i]); + } + av_freep(&h->rl_q.queue); + + ///pthread cleanup + 
pthread_mutex_destroy (&h->task_lock); + pthread_cond_destroy (&h->task_cond); + for (i=0; ilock[i]); + pthread_cond_destroy (&h->cond[i]); + + pthread_mutex_destroy (&h->sb_q[i].lock); + pthread_cond_destroy (&h->sb_q[i].cond); + av_freep( &h->sb_q[i].queue); + } + pthread_mutex_destroy (&h->slock); + pthread_cond_destroy (&h->scond); + pthread_mutex_destroy (&h->ilock); + pthread_cond_destroy (&h->icond); + + pthread_mutex_destroy(&h->smb_lock); + pthread_mutex_destroy (&h->sdl_lock); + pthread_cond_destroy (&h->sdl_cond); +#if HAVE_LIBSDL2 + av_free(h->sdlq.queue); + pthread_mutex_destroy (&h->sdlq.sdl_lock); + pthread_cond_destroy (&h->sdlq.sdl_cond); +#endif + + stop_timer(h, TOTAL); + if (h->threads==0){ + for (i=0; itotal_time[i] /= h->num_frames; + double others = h->total_time[TOTAL]; + for (i=1; itotal_time[i]; + if (h->profile == 1){ + printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [MBREC %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED], h->total_time[REC], others); + }else if (h->profile ==2){ + printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [PRED %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED],h->total_time[REC], others); + } + } + + av_free(h); +} \ No newline at end of file diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,76 @@ +/* +* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder +* Copyright (c) 2003 Michael Niedermayer +* +* This file is part of FFmpeg. +* +* FFmpeg is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License as published by the Free Software Foundation; either +* version 2.1 of the License, or (at your option) any later version. 
+* +* FFmpeg is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* Lesser General Public License for more details. +* +* You should have received a copy of the GNU Lesser General Public +* License along with FFmpeg; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +/** +* @file +* H.264 / AVC / MPEG4 part10 codec. +* @author Michael Niedermayer +*/ + +#ifndef H264_H +#define H264_H + +#include "h264_entropy.h" +#include "h264_data.h" +#include "h264_mc.h" +#include "h264_misc.h" +#include "h264_dsp.h" +#include "h264_pred.h" +#include "h264_parser.h" +#include "h264_nal.h" +#include "h264_rec.h" +#include "h264_deblock.h" +#include "h264_types.h" + +typedef struct h264_options{ + int statsched; + int statmbd; + int numamap; + int no_mbd; + int numframes; + int display; + int fullscreen; + int verbose; + int ppe_ed; // only useful for Cell + int profile; + int threads; + int smb_size[2]; // only useful for OmpSs + int wave_order; + int static_3d; + int pipe_bufs; + int slice_bufs; + int smt; +}h264_options; + +int h264_decode_cell(H264Context *h); +int h264_decode_cell_seq(H264Context *h); + +int h264_decode_ompss(H264Context *h); + +int h264_decode_pthread(H264Context *h); +int h264_decode_seq(H264Context *h); + + +H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int frame_width, int frame_height, h264_options *opts); +void free_h264dec_context(H264Context *h); + + +#endif /* AVCODEC_H264_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_cell.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_cell.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,1242 @@ + +#include "h264_types.h" +#include "h264_parser.h" +#include "h264_nal.h" +#include "h264_entropy.h" +#include "h264_rec.h" +#include "h264_misc.h" +#include 
"cell/h264_types_spu.h" +#include "h264_pthread.h" + +#include +#include +#include + +#include +#include +#include +#include + +// spe global variables +unsigned rl_cnt_var, rl_mutex_var, rl_cond_var; +atomic_ea_t rl_cnt; +cond_ea_t rl_cond; +mutex_ea_t rl_lock; + +H264spe * spe_params; +unsigned mutex_var[16]; +unsigned cond_var[16]; +unsigned atomic_var[16]; + +pthread_t * spe_tid; +spe_context_ptr_t *spe_context; +void** spe_control_area; +void** spe_ls_area; +H264slice **spe_slice_buf; + +H264spe * spe_ed_params; +unsigned mutex_ed_var[16]; +unsigned cond_ed_var[16]; +unsigned atomic_ed_var[16]; + +pthread_t * spe_ed_tid; +spe_context_ptr_t *spe_ed_context; +void** spe_ed_control_area; +void** spe_ed_ls_area; +EDSlice_spu **spe_ed_slice_buf; + +//structs to propagate stop signal +MBSlice last_slice; +EDSlice last_ed_slice; +DecodedPicture last_pic; +RawFrame last_frm; + +static int direct_B_resolved(EDSlice *s, int *poc_list, int *poc_cnt){ + int i; + int cnt = *poc_cnt; + for(i=0; iref_list[1][0]->poc){ + *poc_cnt=i+1; + while(++i poc) { i++;} + if ( i< cnt) + memmove(&poc_list[i+1], &poc_list[i], (cnt-i)*sizeof(int)); + + poc_list[i]=poc; + (*poc_cnt)++; +} + +static void *spe_ed_thread(void *arg){ + H264spe *params = (H264spe *)arg; + unsigned int idx = params->idx; + unsigned int runflags = 0; + unsigned int entry = SPE_DEFAULT_ENTRY; + // run SPE context + spe_context_run(spe_ed_context[idx], &entry, runflags, (void*) params, NULL, NULL); + // done - now exit thread + pthread_exit(NULL); +} + +static void create_spe_ED_threads(H264Context *h, int ip_threads, int b_threads) { + int i; + int num_threads = ip_threads+b_threads; + spe_program_handle_t * spe_program = spe_image_open("spe_ed"); + // reserve memory for spe thread id, context and argument addresses + spe_ed_tid = av_malloc(num_threads * sizeof (pthread_t)); + spe_ed_context = av_malloc(num_threads * sizeof (spe_context_ptr_t)); + spe_ed_params = av_malloc(num_threads * sizeof (H264spe)); + 
spe_ed_control_area = av_malloc(num_threads * sizeof (void*)); + spe_ed_ls_area = av_malloc(num_threads * sizeof (void*)); + spe_ed_slice_buf = av_malloc(num_threads * sizeof (void*)); + + if (spe_program == NULL) + av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno)); + + for (i = 0; i < num_threads; i++) { + // create context for spe program + spe_ed_context[i] = spe_context_create(SPE_MAP_PS, NULL); + if (spe_ed_context[i] == NULL) + av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno)); + // load SPE program into main memory + if ((spe_program_load(spe_ed_context[i], spe_program)) == -1) + av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno)); + //get the control_area for fast mailboxing + if ((spe_ed_control_area[i] = spe_ps_area_get(spe_ed_context[i], SPE_CONTROL_AREA)) == NULL) + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno)); + //get ls area for inter spe communication + if ((spe_ed_ls_area[i] = spe_ls_area_get(spe_ed_context[i])) == NULL) + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. 
error=%s \n", errno, strerror(errno)); + } + + for (i = 0; i < ip_threads; i++) { + spe_ed_params[i].mb_width = h->mb_width; + spe_ed_params[i].mb_stride = h->mb_stride; + spe_ed_params[i].mb_height = h->mb_height; + spe_ed_params[i].type = EDIP; + spe_ed_params[i].spe_id = i; + spe_ed_params[i].idx = i; + //spe_ed_params[i].spe_total = ip_threads; //not used + //spe_params[i].slice_params= &slice_params; + spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads]; + spe_ed_params[i].tgt_spe = spe_ed_ls_area[(i+1)%num_threads]; + + spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i]; + spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i]; + spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0); + + mutex_init(spe_ed_params[i].lock); + cond_init(spe_ed_params[i].cond); + if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i])) + av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); + + //slicebufaddr + spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]); + av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); + } + for (int j = 0; j < b_threads; j++) { + i = j+ip_threads; + spe_ed_params[i].mb_width = h->mb_width; + spe_ed_params[i].mb_stride = h->mb_stride; + spe_ed_params[i].mb_height = h->mb_height; + spe_ed_params[i].type = EDB; + spe_ed_params[i].idx = i; + spe_ed_params[i].spe_id = j; + spe_ed_params[i].spe_total = b_threads; + //spe_params[i].slice_params= &slice_params; + //spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads]; + spe_ed_params[i].tgt_spe = spe_ed_ls_area[((j+1)%b_threads) + ip_threads]; + + spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i]; + spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i]; + spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0); + + 
mutex_init(spe_ed_params[i].lock); + cond_init(spe_ed_params[i].cond); + if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i])) + av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); + + //slicebufaddr + spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]); + av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); + } + spe_image_close(spe_program); + +} + +static void fill_EDSlice_spu(EDSlice_spu *dst, EDSlice *src){ + dst->pps = src->pps; + dst->mbs = src->mbs; + dst->state = src->state; + dst->qp_thresh = src->qp_thresh; + dst->pic = *src->current_picture; + + dst->ref_count[0] = src->ref_count[0]; + dst->ref_count[1] = src->ref_count[1]; + dst->slice_type = src->slice_type; + dst->slice_type_nos = src->slice_type_nos; + dst->direct_8x8_inference_flag = src->direct_8x8_inference_flag; + dst->list_count = src->list_count; + dst->coded_pic_num = src->coded_pic_num; + + GetBitContext *gb = &src->gb; + align_get_bits( gb); + dst->bytestream_start = gb->buffer + get_bits_count(gb)/8; + dst->byte_bufsize = (get_bits_left(gb) + 7)/8; + + dst->transform_bypass = src->transform_bypass; + dst->direct_spatial_mv_pred = src->direct_spatial_mv_pred; + memcpy(dst->map_col_to_list0, src->map_col_to_list0, 2*16*sizeof(int)); + memcpy(dst->dist_scale_factor, src->dist_scale_factor, 16*sizeof(int)); + dst->cabac_init_idc = src->cabac_init_idc; + memcpy(dst->ref2frm, src->ref2frm, 2*64*sizeof(int)); + dst->chroma_qp[0]= src->chroma_qp[0]; + dst->chroma_qp[1]= src->chroma_qp[1]; + dst->qscale = src->qscale; + dst->last_qscale_diff = src->last_qscale_diff; + + if (src->slice_type_nos == FF_B_TYPE) dst->list1 = *src->ref_list[1][0]; +} + +static void send_slice_to_spe_and_wait(EDSlice_spu *s, int id){ + unsigned status; + + spe_mfcio_get(spe_ed_context[id], (unsigned) spe_ed_slice_buf[id], s, sizeof(EDSlice_spu), 14, 0, 0); + spe_mfcio_tag_status_read(spe_ed_context[id], 1<<14, 
SPE_TAG_ALL, &status); + + + _spe_in_mbox_write(spe_ed_control_area[id], 0); + + while (!spe_out_mbox_status(spe_ed_context[id])){ + //pthread_yield(); + usleep(1000); + } + _spe_out_mbox_read(spe_ed_control_area[id]); +} + +static int decode_slice_entropy_cell(EntropyContext *ec, EDSlice *s, int id){ + int i,j; + + if( !s->pps.cabac ){ + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); + return -1; + } + DECLARE_ALIGNED(16, EDSlice_spu, slice); + fill_EDSlice_spu(&slice, s); + + send_slice_to_spe_and_wait(&slice, id); + + return 0; +} + +static int decode_slice_entropy_cell_seq(H264Context *h, EntropyContext *ec, EDSlice *s){ + int i,j; + + if( !s->pps.cabac ){ + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); + return -1; + } + DECLARE_ALIGNED(16, EDSlice_spu, slice); + fill_EDSlice_spu(&slice, s); + + send_slice_to_spe_and_wait(&slice, 0); + + if (s->release_cnt>0) { + for (int i=0; irelease_cnt; i++){ + release_pib_entry(h, s->release_ref[i], 2); + } + s->release_cnt=0; + } + + release_pib_entry(h, s->current_picture, 1); + av_freep(&s->gb.raw); + if (s->gb.rbsp) + av_freep(&s->gb.rbsp); + + return 0; +} + +static void *entr_IP_spe_thread(void *arg){ + EDThreadContext *eip = (EDThreadContext *) arg; + H264Context *h = eip->h; +// printf("eip %d, pid %d\n", eip->thread_num, syscall(SYS_gettid)); + for (int i=0; imbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb)); + } + + EntropyContext *ec = get_entropy_context(h); + EDSlice *s; + + for(;;){ + { + pthread_mutex_lock(&eip->ed_lock); + while (eip->ed_cnt <= 0) + pthread_cond_wait(&eip->ed_cond, &eip->ed_lock); + s = &eip->ed_q[eip->ed_fo]; + eip->ed_fo++; eip->ed_fo %= MAX_SLICE_COUNT; + pthread_mutex_unlock(&eip->ed_lock); + } + + if (s->state<0) + break; + { + pthread_mutex_lock(&eip->mbs_lock); + while (eip->mbs_cnt <= 0) + pthread_cond_wait(&eip->mbs_cond, &eip->mbs_lock); + + s->mbs = eip->mbs[eip->mbs_fo]; + s->ed = eip; + eip->mbs_cnt--; + eip->mbs_fo++; 
eip->mbs_fo%=SLICE_BUFS; + pthread_mutex_unlock(&eip->mbs_lock); + } + if (eip->cell){ + decode_slice_entropy_cell(ec, s, eip->thread_num); + }else{ + decode_slice_entropy(ec, s); + } + +// { +// pthread_mutex_lock(&h->lock[ENTROPY2]); +// h->ed_poc[h->ed_poc_fi++ % MAX_SLICE_COUNT] = s->current_picture->poc; +// while (h->ed_poc_fi > h->ed_poc_fo + MAX_SLICE_COUNT) +// h->ed_poc_fo++; +// +// pthread_cond_signal(&h->cond[ENTROPY2]); +// pthread_mutex_unlock(&h->lock[ENTROPY2]); +// } + + { + pthread_mutex_lock(&h->lock[ENTROPY4]); + while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); + h->ed_reorder_q[h->ed_reorder_fi] = *s; + h->ed_reorder_cnt++; + h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->cond[ENTROPY4]); + pthread_mutex_unlock(&h->lock[ENTROPY4]); + } + + { + pthread_mutex_lock(&eip->ed_lock); + eip->ed_cnt--; + pthread_cond_signal(&eip->ed_cond); + pthread_mutex_unlock(&eip->ed_lock); + } + } + + free_entropy_context(ec); + + pthread_exit(NULL); + return NULL; +} + +static void *entr_B_spe_thread(void *arg){ + EDThreadContext *eb = (EDThreadContext *) arg; + H264Context *h = eb->h; +// printf("eb %d, pid %d\n", eb->thread_num, syscall(SYS_gettid)); + for (int i=0; imbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb)); + } + + EntropyContext *ec = get_entropy_context(h); + EDSlice *s; + + for(;;){ + { + pthread_mutex_lock(&eb->ed_lock); + while (eb->ed_cnt <= 0) + pthread_cond_wait(&eb->ed_cond, &eb->ed_lock); + s = &eb->ed_q[eb->ed_fo]; + eb->ed_fo++; eb->ed_fo %= MAX_SLICE_COUNT; + pthread_mutex_unlock(&eb->ed_lock); + } + + if (s->state<0) + break; + { + pthread_mutex_lock(&eb->mbs_lock); + while (eb->mbs_cnt <= 0) + pthread_cond_wait(&eb->mbs_cond, &eb->mbs_lock); + s->mbs = eb->mbs[eb->mbs_fo]; + s->ed = eb; + eb->mbs_cnt--; + eb->mbs_fo++; eb->mbs_fo%=SLICE_BUFS; + pthread_mutex_unlock(&eb->mbs_lock); + } + //decode_B_slice_entropy(&hcabac, &cabac, s, eb, 
eb->prev_ed); + decode_slice_entropy_cell(ec, s, eb->thread_num + h->edip_threads); + + { + pthread_mutex_lock(&h->lock[ENTROPY4]); + while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); + h->ed_reorder_q[h->ed_reorder_fi] = *s; + h->ed_reorder_cnt++; + h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->cond[ENTROPY4]); + pthread_mutex_unlock(&h->lock[ENTROPY4]); + + } + + { + pthread_mutex_lock(&eb->ed_lock); + eb->ed_cnt--; + pthread_cond_signal(&eb->ed_cond); + pthread_mutex_unlock(&eb->ed_lock); + } + } + eb->lines_cnt++; + + free_entropy_context(ec); + + pthread_exit(NULL); + return NULL; +} + +static void *entr_B_distribute(void *arg){ + H264Context *h = (H264Context *) arg; + EDSlice *s; + + int i, n=0, poc; + +// printf("eb dist, pid %d\n", syscall(SYS_gettid)); + + for(i=0; iedb_threads; i++){ + h->b[i].h =h; + h->b[i].thread_num =i; + h->b[i].thread_total =h->edb_threads; + pthread_mutex_init(&h->b[i].mbs_lock, NULL); + pthread_cond_init(&h->b[i].mbs_cond, NULL); + h->b[i].mbs_fo = 0; + h->b[i].mbs_cnt = SLICE_BUFS; + h->b[i].ed_fi =0; + h->b[i].ed_fo =0; + h->b[i].ed_cnt =0; + h->b[i].lines_cnt =0; + h->b[i].prev_ed = &h->b[(i-1 +h->edb_threads) % h->edb_threads]; + pthread_mutex_init(&h->b[i].ed_lock, NULL); + pthread_cond_init(&h->b[i].ed_cond, NULL); + pthread_create(&h->ed_B_thr[i], NULL, entr_B_spe_thread, &h->b[i]); + } + + for(;;){ + { + pthread_mutex_lock(&h->lock[ENTROPY3B]); + while (h->ed_B_cnt<=0) + pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); + s= &h->ed_B_q[h->ed_B_fo]; + h->ed_B_fo++; h->ed_B_fo %= MAX_SLICE_COUNT; + pthread_mutex_unlock(&h->lock[ENTROPY3B]); + + } + if (s->state<0) + break; + + if (s->ref_list[1][0]->slice_type_nos != FF_B_TYPE){ + while (poc < s->ref_list[1][0]->poc){ + pthread_mutex_lock(&h->lock[ENTROPY2]); + while (poc == h->ed_poc) + pthread_cond_wait(&h->cond[ENTROPY2], &h->lock[ENTROPY2]); + poc = h->ed_poc; + 
pthread_mutex_unlock(&h->lock[ENTROPY2]); + } + } + { + pthread_mutex_lock(&h->b[n].ed_lock); + while (h->b[n].ed_cnt >= MAX_SLICE_COUNT) + pthread_cond_wait(&h->b[n].ed_cond, &h->b[n].ed_lock); + h->b[n].ed_q[ h->b[n].ed_fi] = *s; + h->b[n].ed_cnt++; + h->b[n].ed_fi++; h->b[n].ed_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->b[n].ed_cond); + pthread_mutex_unlock(&h->b[n].ed_lock); + + n++; n%=h->edb_threads; + } + { + pthread_mutex_lock(&h->lock[ENTROPY3B]); + h->ed_B_cnt--; + pthread_cond_signal(&h->cond[ENTROPY3B]); + pthread_mutex_unlock(&h->lock[ENTROPY3B]); + + } + + } + + for (i=0; iedb_threads; i++){ + pthread_mutex_lock(&h->b[i].ed_lock); + while (h->b[i].ed_cnt >= MAX_SLICE_COUNT) + pthread_cond_wait(&h->b[i].ed_cond, &h->b[i].ed_lock); + h->b[i].ed_q[ h->b[i].ed_fi] = *s; + h->b[i].ed_cnt++; + h->b[i].ed_fi++; h->b[i].ed_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->b[i].ed_cond); + pthread_mutex_unlock(&h->b[i].ed_lock); + + } + for(int i=0; iedb_threads; i++){ + pthread_join(h->ed_B_thr[i], NULL); + } + pthread_exit(NULL); + return NULL; +} + + +static void *entr_IPB_distribute(void *arg){ + H264Context *h = (H264Context *) arg; + EDSlice *s; + int i,n=0; + + create_spe_ED_threads(h, h->edip_threads, h->edb_threads); + pthread_create(&h->ed_B_dist, NULL, entr_B_distribute, h); + for(i=0; iedip_threads + h->edip_ppe_threads; i++){ + h->ip[i].h =h; + h->ip[i].cell = (i >= h->edip_ppe_threads); + pthread_mutex_init(&h->ip[i].mbs_lock, NULL); + pthread_cond_init(&h->ip[i].mbs_cond, NULL); + h->ip[i].thread_num = i - h->edip_ppe_threads; + h->ip[i].thread_total=h->edip_threads+ h->edip_ppe_threads; + h->ip[i].mbs_fo = 0; + h->ip[i].mbs_cnt = SLICE_BUFS; + h->ip[i].ed_fi =0; + h->ip[i].ed_fo =0; + pthread_mutex_init(&h->ip[i].ed_lock, NULL); + pthread_cond_init(&h->ip[i].ed_cond, NULL); + pthread_create(&h->ed_IP_thr[i], NULL, entr_IP_spe_thread, &h->ip[i]); + } + + for(;;){ + { + pthread_mutex_lock(&h->lock[ENTROPY]); + while (h->ed_cnt<=0) + 
pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]); + s= &h->ed_q[h->ed_fo]; + + pthread_mutex_unlock(&h->lock[ENTROPY]); + h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT; + } + if (s->state<0) + break; + + assert(s->current_picture); + if (s->slice_type_nos == FF_B_TYPE ) + { + pthread_mutex_lock(&h->lock[ENTROPY3B]); + while (h->ed_B_cnt>=MAX_SLICE_COUNT) + pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); + h->ed_B_q[h->ed_B_fi] = *s; + h->ed_B_cnt++; + h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->cond[ENTROPY3B]); + pthread_mutex_unlock(&h->lock[ENTROPY3B]); + }else + { + ///round robin now, change to based on rawframes size. + pthread_mutex_lock(&h->ip[n].ed_lock); + while (h->ip[n].ed_cnt >= MAX_SLICE_COUNT) + pthread_cond_wait(&h->ip[n].ed_cond, &h->ip[n].ed_lock); + h->ip[n].ed_q[ h->ip[n].ed_fi] = *s; + h->ip[n].ed_cnt++; + h->ip[n].ed_fi++; h->ip[n].ed_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->ip[n].ed_cond); + pthread_mutex_unlock(&h->ip[n].ed_lock); + + n++; n %=(h->edip_threads+h->edip_ppe_threads); + } + { + pthread_mutex_lock(&h->lock[ENTROPY]); + h->ed_cnt--; + pthread_cond_signal(&h->cond[ENTROPY]); + pthread_mutex_unlock(&h->lock[ENTROPY]); + + } + } + + { + pthread_mutex_lock(&h->lock[ENTROPY3B]); + while (h->ed_B_cnt>=MAX_SLICE_COUNT) + pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); + h->ed_B_q[h->ed_B_fi] = *s; + h->ed_B_cnt++; + h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->cond[ENTROPY3B]); + pthread_mutex_unlock(&h->lock[ENTROPY3B]); + } + { + for (i=0; iedip_threads + h->edip_ppe_threads; i++){ + pthread_mutex_lock(&h->ip[i].ed_lock); + while (h->ip[i].ed_cnt >= MAX_SLICE_COUNT) + pthread_cond_wait(&h->ip[i].ed_cond, &h->ip[i].ed_lock); + h->ip[i].ed_q[ h->ip[i].ed_fi] = *s; + h->ip[i].ed_cnt++; + h->ip[i].ed_fi++; h->ip[i].ed_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->ip[i].ed_cond); + pthread_mutex_unlock(&h->ip[i].ed_lock); + } + } + { + 
pthread_mutex_lock(&h->lock[ENTROPY4]); + while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); + h->ed_reorder_q[h->ed_reorder_fi] = *s; + h->ed_reorder_cnt++; + h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->cond[ENTROPY4]); + pthread_mutex_unlock(&h->lock[ENTROPY4]); + + } + pthread_join(h->ed_B_dist, NULL); + for(i=0; iedip_threads; i++){ + pthread_join(h->ed_IP_thr[i], NULL); + } + pthread_exit(NULL); + return NULL; +} + +static pthread_t ed_IPB_dist; +static void *entropy_IPB_cell_thread(void *arg){ + H264Context *h = (H264Context *) arg; + int i; + EDSlice reorder[MAX_SLICE_COUNT]; + int ip_poc[MAX_SLICE_COUNT][2]={0,}; + int next_ip_id=0; + int ip_poc_cnt=0; + EDSlice *s; + int reorder_cnt=0; + unsigned next_pic_num=0; + + pthread_create(&ed_IPB_dist, NULL, entr_IPB_distribute, h); + int count =0; + for(;;){ + //signals received from the entropy decoders + { + pthread_mutex_lock(&h->lock[ENTROPY4]); + while (h->ed_reorder_cnt<=0) + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); + s= &h->ed_reorder_q[h->ed_reorder_fo]; + h->ed_reorder_fo++; h->ed_reorder_fo %=MAX_SLICE_COUNT; + pthread_mutex_unlock(&h->lock[ENTROPY4]); + } + + if (s->state >=0 && s->slice_type_nos != FF_B_TYPE){ + for (i=0; iip_id < ip_poc[i][0]){ + memmove(ip_poc[i+1], ip_poc[i], 2*(ip_poc_cnt-i)*sizeof(int)); + break; + } + } + ip_poc[i][0]= s->ip_id; + ip_poc[i][1]= s->current_picture->poc; + ip_poc_cnt++; + + while (next_ip_id == ip_poc[0][0]){ + pthread_mutex_lock(&h->lock[ENTROPY2]); + h->ed_poc = ip_poc[0][1]; + + pthread_cond_signal(&h->cond[ENTROPY2]); + pthread_mutex_unlock(&h->lock[ENTROPY2]); + memmove(ip_poc[0], ip_poc[1], 2*(ip_poc_cnt-1)*sizeof(int)); + ip_poc_cnt--; + next_ip_id++; + } + } + + for(i=reorder_cnt; i>0; i--){ + if (s->coded_pic_num < reorder[i-1].coded_pic_num) + break; + reorder[i]=reorder[i-1]; + } + reorder[i]=*s; + + while(reorder_cnt>=0){ + if 
(next_pic_num!=reorder[reorder_cnt].coded_pic_num){ + break; + } + EDSlice *es = &reorder[reorder_cnt]; + + { + pthread_mutex_lock(&h->lock[MBDEC]); + while (h->mbdec_cnt >= MAX_SLICE_COUNT) + pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); + copyEDtoMBSlice(&h->mbdec_q[h->mbdec_fi], es); + + h->mbdec_cnt++; + h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->cond[MBDEC]); + pthread_mutex_unlock(&h->lock[MBDEC]); + + } + + if (es->state<0) + goto end; + + assert(es->current_picture); + for (int i=0; irelease_cnt; i++){ + release_pib_entry(h, es->release_ref[i], 2); + } + release_pib_entry(h, es->current_picture, 1); + av_freep(&es->gb.raw); + if (es->gb.rbsp) + av_freep(&es->gb.rbsp); + + next_pic_num++; + reorder_cnt--; + } + reorder_cnt++; + + { + pthread_mutex_lock(&h->lock[ENTROPY4]); + h->ed_reorder_cnt--; + pthread_cond_signal(&h->cond[ENTROPY4]); + pthread_mutex_unlock(&h->lock[ENTROPY4]); + } + } + +end: + pthread_join(ed_IPB_dist, NULL); + pthread_exit(NULL); + return NULL; +} + + +static void fill_spe_slice(H264slice *dst, const MBSlice *src, H264Context *h){ + dst->deblocking_filter =1; + dst->linesize = src->current_picture->linesize[0]; + dst->uvlinesize = src->current_picture->linesize[1]; + dst->mb_width = h->mb_width; + dst->mb_height = h->mb_height; + dst->use_weight = src->use_weight; + dst->use_weight_chroma = src->use_weight_chroma; + dst->luma_log2_weight_denom = src->luma_log2_weight_denom; + dst->chroma_log2_weight_denom = src->chroma_log2_weight_denom; + + //weights later + memcpy(dst->luma_weight, src->luma_weight, 16*2*2*sizeof(int16_t)); + memcpy(dst->chroma_weight, src->chroma_weight, 16*2*2*2*sizeof(int16_t)); + memcpy(dst->implicit_weight, src->implicit_weight, 16*16*2*sizeof(int16_t)); + + for(int list=0; list<2; list++){ + for (int i=0; iref_count[list]; i++){ + Picture_spu *p_dst = &dst->ref_list[list][i]; + DecodedPicture *p_src = src->ref_list[list][i]; + if (p_src){ + p_dst->data[0] = p_src->data[0]; + 
p_dst->data[1] = p_src->data[1]; + p_dst->data[2] = p_src->data[2]; + } + } + } + dst->state = src->state; + + dst->emu_edge_width =32; + dst->emu_edge_height =32; + dst->slice_type = src->slice_type; + dst->slice_type_nos = src->slice_type_nos; + dst->slice_alpha_c0_offset = src->slice_alpha_c0_offset; + dst->slice_beta_offset = src->slice_beta_offset; + + memcpy(dst->chroma_qp_table, src->pps.chroma_qp_table, 2*64); + + dst->blocks = src->mbs; + dst->dst_y = src->current_picture->data[0]; + dst->dst_cb = src->current_picture->data[1]; + dst->dst_cr = src->current_picture->data[2]; +} + +static void decode_slice_mb_seq_cell(H264Context *h, MBRecContext *d, MBSlice *s, DecodedPicture *tmp){ + static int rl_fi=0; + + DECLARE_ALIGNED(16, H264slice, spe_slice); + H264spe *p=&spe_params[0]; + unsigned status; + uint8_t *dst_y, *dst_cb, *dst_cr; + + DecodedPicture *dp; + + for (int i=0; i<2; i++){ + for(int j=0; j< s->ref_count[i]; j++){ + if (s->ref_list_cpn[i][j] ==-1) + continue; + int k; + for (k=0; kdpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ + s->ref_list[i][j] = &h->dpb[k]; + break; + } + } + } + } + + dp = get_dpb_entry(h); + init_dpb_entry(dp, s, d->width, d->height); + + if (h->no_mbd) + return; + + + fill_spe_slice(&spe_slice, s, h); + spe_mfcio_get(spe_context[0], (unsigned) (spe_slice_buf[0] + rl_fi), &spe_slice, sizeof(H264slice), 15, 0, 0); + spe_mfcio_tag_status_read(spe_context[0], 1<<15, SPE_TAG_ALL, &status); + rl_fi++; rl_fi %= 2; + + _spe_in_mbox_write(spe_control_area[0], 0); + while (atomic_read(rl_cnt)<=0){ + //pthread_yield(); + usleep(1000); + } + atomic_dec(rl_cnt); + + +/** This is error free, no visual artifacts, however, md5sum fails.... 
(WTF) **/ +// memcpy(tmp->data[0], s->current_picture->data[0], tmp->linesize[0]*h->mb_height*16); +// memcpy(tmp->data[1], s->current_picture->data[1], tmp->linesize[1]*h->mb_height*8); +// memcpy(tmp->data[2], s->current_picture->data[2], tmp->linesize[1]*h->mb_height*8); +// +// memset(s->current_picture->data[0], 0, tmp->linesize[0]*h->mb_height*16); +// memset(s->current_picture->data[1], 0, tmp->linesize[1]*h->mb_height*8); +// memset(s->current_picture->data[2], 0, tmp->linesize[1]*h->mb_height*8); +// +// decode_slice_mb_seq(d, s); +// +// for (int i=0; imb_height*16; i++){ +// for (int j=0; jwidth; j++){ +// if (tmp->data[0][j + i*tmp->linesize[0]] != s->current_picture->data[0][j + i*tmp->linesize[0]]){ +// printf("%d, %d, %d, %d\n", j, i, tmp->data[0][j + i*tmp->linesize[0]], s->current_picture->data[0][j + i*tmp->linesize[0]]); +// return; +// } +// } +// } +// +// for (int i=0; imb_height*8; i++){ +// for (int j=0; jwidth/2; j++){ +// if (tmp->data[1][j + i*tmp->linesize[1]] != s->current_picture->data[1][j + i*tmp->linesize[1]]){ +// printf("%d, %d, %d, %d\n", j, i, tmp->data[1][j + i*tmp->linesize[1]], s->current_picture->data[1][j + i*tmp->linesize[1]]); +// return; +// } +// } +// } +// +// for (int i=0; imb_height*8; i++){ +// for (int j=0; jwidth/2; j++){ +// if (tmp->data[2][j + i*tmp->linesize[1]] != s->current_picture->data[2][j + i*tmp->linesize[1]]){ +// printf("%d, %d, %d, %d\n", j, i, tmp->data[2][j + i*tmp->linesize[1]], s->current_picture->data[2][j + i*tmp->linesize[1]]); +// return; +// } +// } +// } + + + //printf("dst_y %p\n", dst_y); + + + for (int i=0; irelease_cnt; i++){ + for(int j=0; jdpb[j].cpn== s->release_ref_cpn[i]){ + release_dpb_entry(h, &h->dpb[j], 2); + break; + } + } + } + s->release_cnt=0; + +} + +static void *h264_spe_thread(void * thread_args ) { + H264spe *params = (H264spe *)thread_args; + unsigned int spe_id = params->spe_id; + unsigned int runflags = 0; + unsigned int entry = SPE_DEFAULT_ENTRY; + // run SPE 
context + spe_context_run(spe_context[spe_id], &entry, runflags, (void*) params, NULL, NULL); + // done - now exit thread + pthread_exit(NULL); +} + +static int create_spe_MBR_threads(H264Context *h, int num_threads) { + int i; + + // reserve memory for spe thread id, context and argument addresses + spe_tid = av_malloc(num_threads * sizeof (pthread_t)); + spe_context = av_malloc(num_threads * sizeof (spe_context_ptr_t)); + spe_params = av_malloc(num_threads * sizeof (H264spe)); + spe_control_area = av_malloc(num_threads * sizeof (void*)); + spe_ls_area = av_malloc(num_threads * sizeof (void*)); + spe_slice_buf = av_malloc(num_threads * sizeof (void*)); + + spe_program_handle_t *spe_program = spe_image_open("spe_mbd"); + + if (spe_program == NULL) + av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno)); + + for (i = 0; i < num_threads; i++) { + // create context for spe program + spe_context[i] = spe_context_create(SPE_MAP_PS, NULL); + if (spe_context[i] == NULL) + av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno)); + // load SPE program into main memory + if ((spe_program_load(spe_context[i], spe_program)) == -1) + av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno)); + //get the control_area for fast mailboxing + if ((spe_control_area[i] = spe_ps_area_get(spe_context[i], SPE_CONTROL_AREA)) == NULL) + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno)); + //get ls area for inter spe communication + if ((spe_ls_area[i] = spe_ls_area_get(spe_context[i])) == NULL) + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. 
error=%s \n", errno, strerror(errno)); + } + + for (i = 0; i < num_threads; i++) { + spe_params[i].mb_width = h->mb_width; + spe_params[i].mb_height = h->mb_height; + spe_params[i].mb_stride = h->mb_stride; + spe_params[i].spe_id = i; + spe_params[i].spe_total = num_threads; + //spe_params[i].slice_params= &slice_params; + spe_params[i].src_spe = spe_ls_area[(i-1+num_threads)%num_threads]; + spe_params[i].tgt_spe = spe_ls_area[(i+1)%num_threads]; + + spe_params[i].rl_lock = rl_lock; + spe_params[i].rl_cond = rl_cond; + spe_params[i].rl_cnt = rl_cnt; + spe_params[i].lock = (mutex_ea_t) (unsigned) &mutex_var[i]; + spe_params[i].cond = (cond_ea_t) (unsigned) &cond_var[i]; + spe_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_var[i]; atomic_set(spe_params[i].cnt, 0); + + mutex_init(spe_params[i].lock); + cond_init(spe_params[i].cond); + if (pthread_create(&spe_tid[i], NULL, h264_spe_thread, (void *) &spe_params[i])) + av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); + + //slicebufaddr + spe_slice_buf[i] = (H264slice *) _spe_out_mbox_read(spe_control_area[i]); + + av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); + } + spe_image_close(spe_program); + return 0; +} + +//_spe_out_mbox_read(spe_control_area[i]); +/** +* joins all the spe worker threads. +*/ +static void join_spe_worker_threads(H264slice *s, int num_threads, int *rl_fi) { + int i; + ///just to keep coding consistency. 
+ { + for (i=0; icnt)>=2) {//double buffered + usleep(1000);//cond_wait(p->cond, p->lock); + } + + spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), s, sizeof(H264slice), 15, 0, 0); + spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status); + //mutex_unlock(p->lock); + _spe_in_mbox_write(spe_control_area[i], 0); + } + } + + for (i=0; irl_threads); + for(;;){ + { + pthread_mutex_lock(&h->lock[MBDEC]); + while (h->mbdec_cnt<=0) + pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); + s= &h->mbdec_q[h->mbdec_fo]; + h->mbdec_fo++; h->mbdec_fo %= MAX_SLICE_COUNT; + pthread_mutex_unlock(&h->lock[MBDEC]); + } + + if (s->state<0){ + break; + } + for (int i=0; i<2; i++){ + for(int j=0; j< s->ref_count[i]; j++){ + if (s->ref_list_cpn[i][j] ==-1) + continue; + int k; + for (k=0; kdpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ + s->ref_list[i][j] = &h->dpb[k]; + break; + } + } + + } + } + dp = get_dpb_entry(h); + init_dpb_entry(dp, s, h->width, h->height); + assert(s->current_picture); + { + while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){ + usleep(1000); + } + h->mbrel_q[h->mbrel_fi] = *s; + + h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT; + } + { + if(h->no_mbd){ + atomic_inc(rl_cnt); + }else { + fill_spe_slice(&spe_slice, s, h); + for (i=0; irl_threads; i++){ + H264spe *p=&spe_params[i]; + unsigned status; + while (atomic_read(p->cnt)>=2){ //double buffered + usleep(1000); + //cond_wait(p->cond, p->lock); + } + spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), &spe_slice, sizeof(H264slice), 15, 0, 0); + spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status); + rl_fi[i]++; rl_fi[i] %= 2; + atomic_inc(p->cnt); + + _spe_in_mbox_write(spe_control_area[i], 0); + } + } + } + + { + pthread_mutex_lock(&h->lock[MBDEC]); + h->mbdec_cnt--; + pthread_cond_signal(&h->cond[MBDEC]); + pthread_mutex_unlock(&h->lock[MBDEC]); + } + + } + + { + while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){ + 
usleep(1000); + } + h->mbrel_q[h->mbrel_fi] = *s; + + h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT; + } + spe_slice.state=-1; + join_spe_worker_threads(&spe_slice, h->rl_threads, rl_fi); + pthread_exit(NULL); + return NULL; +} + +static void *mbdec_cell_thread(void *arg){ + H264Context *h = (H264Context *) arg; + + rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var; + rl_cond = (cond_ea_t) (unsigned) &rl_cond_var; + rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var; + atomic_set(rl_cnt, 0); + mutex_init(rl_lock); + cond_init(rl_cond); +// printf("mbdec, pid %d\n", syscall(SYS_gettid)); + pthread_create(&h->rl_dist_thr, NULL, rl_dist_thread, h); + + for(;;){ + MBSlice *s=NULL; + { + while (atomic_read(rl_cnt)<=0){ + usleep(1000); + } + s= &h->mbrel_q[h->mbrel_fo]; + h->mbrel_fo++; h->mbrel_fo %= MAX_SLICE_COUNT; + } + + if (s->state<0) + break; + + for (int i=0; irelease_cnt; i++){ + for(int j=0; jdpb[j].cpn== s->release_ref_cpn[i]){ + release_dpb_entry(h, &h->dpb[j], 2); + break; + } + } + } + + { + EDThreadContext *ed = s->ed; + pthread_mutex_lock(&ed->mbs_lock); + ed->mbs_cnt++; + pthread_cond_signal(&ed->mbs_cond); + pthread_mutex_unlock(&ed->mbs_lock); + } + + { + pthread_mutex_lock(&h->lock[WRITE]); + while (h->write_cnt>= DPB_SIZE) + pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]); + assert(s); + assert(s->current_picture); + h->write_q[h->write_fi]= s->current_picture; + h->write_cnt++; + h->write_fi++; h->write_fi %= DPB_SIZE; + pthread_cond_signal(&h->cond[WRITE]); + pthread_mutex_unlock(&h->lock[WRITE]); + + } + { + atomic_dec(rl_cnt); + } + + } + + {//propagate exit + pthread_mutex_lock(&h->lock[WRITE]); + while (h->write_cnt>= DPB_SIZE) + pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]); + last_pic.reference = -1; + h->write_q[h->write_fi] = &last_pic; + h->write_cnt++; + h->write_fi++; h->write_fi %= DPB_SIZE; + pthread_cond_signal(&h->cond[WRITE]); + pthread_mutex_unlock(&h->lock[WRITE]); + + } + pthread_join(h->rl_dist_thr, NULL); + 
pthread_exit(NULL); + return NULL; +} + +/* +* The following code is the main loop of the file converter +*/ +int h264_decode_cell(H264Context *h) { + + pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr; + + start_timer(); + + pthread_create(&read_thr, NULL, read_thread, h); + pthread_create(&parsenal_thr, NULL, parsenal_thread, h); + pthread_create(&entropy_thr, NULL, entropy_IPB_cell_thread, h); + pthread_create(&mbdec_thr, NULL, mbdec_cell_thread, h); + pthread_create(&write_thr, NULL, write_thread, h); + + pthread_join(read_thr, NULL); + pthread_join(parsenal_thr, NULL); + pthread_join(entropy_thr, NULL); + pthread_join(mbdec_thr, NULL); + pthread_join(write_thr, NULL); + + return 0; +} + +/* +* The following code is the main loop of the file converter +*/ +int h264_decode_cell_seq(H264Context *h) { +ParserContext *pc; + NalContext *nc; + EntropyContext *ec; + MBRecContext *rc; + OutputContext *oc; + + RawFrame frm; + EDSlice slice, *s=&slice; + MBSlice mbslice, *s2=&mbslice; + PictureInfo *pic=NULL; + DecodedPicture *out; + int size; + int frames=0; + + pc = get_parse_context(h->ifile); + nc = get_nal_context(h->width, h->height); + ec = get_entropy_context( h ); + rc = get_mbrec_context(h); + oc = get_output_context( h ); + + rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var; + rl_cond = (cond_ea_t) (unsigned) &rl_cond_var; + rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var; + atomic_set(rl_cnt, 0); + mutex_init(rl_lock); + cond_init(rl_cond); + + memset(s, 0, sizeof(EDSlice)); + ff_init_slice(nc, s); + s->mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb)); + + DecodedPicture tmp; + tmp.base[0]=0; + ///fix this when want to debug the Cell errors + //init_dpb_entry(&tmp, h->width, h->height); + + create_spe_ED_threads(h, 1, 0); + create_spe_MBR_threads(h, 1); + + start_timer(); + + while(!pc->final_frame && frames++ < h->num_frames){ + + av_read_frame_internal(pc, &frm); + + PictureInfo *pic=get_pib_entry(h); + ff_alloc_picture_info(nc, 
s, pic); + decode_nal_units(nc, s, &frm); + + copyEDtoMBSlice(s2, s); + decode_slice_entropy_cell_seq(h, ec, s); + + decode_slice_mb_seq_cell(h, rc, s2, &tmp); + + out =output_frame(h, oc, s2->current_picture, h->ofile, h->frame_width, h->frame_height); + + if (out){ + release_dpb_entry(h, out, 1); + } + print_report(oc->frame_number, oc->video_size, 0, h->verbose); + } + while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; + + print_report(oc->frame_number, oc->video_size, 1, h->verbose); + + /* finished ! */ + av_freep(&s->mbs); + + free_parse_context(pc); + free_nal_context (nc); + free_entropy_context(ec); + free_mbrec_context(rc); + free_output_context(oc); + return 0; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_data.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_data.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,243 @@ +/* + * H26L/H264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * @brief + * H264 / AVC / MPEG4 part10 codec data table + * @author Michael Niedermayer + */ + +#ifndef AVCODEC_H264DATA_H +#define AVCODEC_H264DATA_H + +#include +#include "avcodec.h" +//#include "h264.h" + +/* +o-o o-o + / / / +o-o o-o + ,---' +o-o o-o + / / / +o-o o-o +*/ +//This table must be here because scan8[constant] must be known at compiletime +static const uint8_t scan8[16 + 2*4]={ + 4+1*8, 5+1*8, 4+2*8, 5+2*8, + 6+1*8, 7+1*8, 6+2*8, 7+2*8, + 4+3*8, 5+3*8, 4+4*8, 5+4*8, + 6+3*8, 7+3*8, 6+4*8, 7+4*8, + 1+1*8, 2+1*8, + 1+2*8, 2+2*8, + 1+4*8, 2+4*8, + 1+5*8, 2+5*8, +}; + +static const uint8_t golomb_to_pict_type[5]= +{FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE}; + +static const uint8_t golomb_to_intra4x4_cbp[48]={ + 47, 31, 15, 0, 23, 27, 29, 30, 7, 11, 13, 14, 39, 43, 45, 46, + 16, 3, 5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44, 1, 2, 4, + 8, 17, 18, 20, 24, 6, 9, 22, 25, 32, 33, 34, 36, 40, 38, 41 +}; + +static const uint8_t golomb_to_inter_cbp[48]={ + 0, 16, 1, 2, 4, 8, 32, 3, 5, 10, 12, 15, 47, 7, 11, 13, + 14, 6, 9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46, + 17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41 +}; + +static const uint8_t zigzag_scan[16]={ + 0+0*4, 1+0*4, 0+1*4, 0+2*4, + 1+1*4, 2+0*4, 3+0*4, 2+1*4, + 1+2*4, 0+3*4, 1+3*4, 2+2*4, + 3+1*4, 3+2*4, 2+3*4, 3+3*4, +}; + +static const uint8_t field_scan[16]={ + 0+0*4, 0+1*4, 1+0*4, 0+2*4, + 0+3*4, 1+1*4, 1+2*4, 1+3*4, + 2+0*4, 2+1*4, 2+2*4, 2+3*4, + 3+0*4, 3+1*4, 3+2*4, 3+3*4, +}; + +static const uint8_t luma_dc_zigzag_scan[16]={ + 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64, + 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64, + 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64, + 3*16 + 1*64, 1*16 + 3*64, 2*16 + 
3*64, 3*16 + 3*64, +}; + +static const uint8_t luma_dc_field_scan[16]={ + 0*16 + 0*64, 2*16 + 0*64, 1*16 + 0*64, 0*16 + 2*64, + 2*16 + 2*64, 3*16 + 0*64, 1*16 + 2*64, 3*16 + 2*64, + 0*16 + 1*64, 2*16 + 1*64, 0*16 + 3*64, 2*16 + 3*64, + 1*16 + 1*64, 3*16 + 1*64, 1*16 + 3*64, 3*16 + 3*64, +}; + +static const uint8_t chroma_dc_scan[4]={ + (0+0*2)*16, (1+0*2)*16, + (0+1*2)*16, (1+1*2)*16, //FIXME +}; + + +static const uint8_t field_scan8x8[64]={ + 0+0*8, 0+1*8, 0+2*8, 1+0*8, + 1+1*8, 0+3*8, 0+4*8, 1+2*8, + 2+0*8, 1+3*8, 0+5*8, 0+6*8, + 0+7*8, 1+4*8, 2+1*8, 3+0*8, + 2+2*8, 1+5*8, 1+6*8, 1+7*8, + 2+3*8, 3+1*8, 4+0*8, 3+2*8, + 2+4*8, 2+5*8, 2+6*8, 2+7*8, + 3+3*8, 4+1*8, 5+0*8, 4+2*8, + 3+4*8, 3+5*8, 3+6*8, 3+7*8, + 4+3*8, 5+1*8, 6+0*8, 5+2*8, + 4+4*8, 4+5*8, 4+6*8, 4+7*8, + 5+3*8, 6+1*8, 6+2*8, 5+4*8, + 5+5*8, 5+6*8, 5+7*8, 6+3*8, + 7+0*8, 7+1*8, 6+4*8, 6+5*8, + 6+6*8, 6+7*8, 7+2*8, 7+3*8, + 7+4*8, 7+5*8, 7+6*8, 7+7*8, +}; + +typedef struct IMbInfo{ + uint16_t type; + uint8_t pred_mode; + uint8_t cbp; +} IMbInfo; + +static const IMbInfo i_mb_type_info[26]={ +{MB_TYPE_INTRA4x4 , -1, -1}, +{MB_TYPE_INTRA16x16, 2, 0}, +{MB_TYPE_INTRA16x16, 1, 0}, +{MB_TYPE_INTRA16x16, 0, 0}, +{MB_TYPE_INTRA16x16, 3, 0}, +{MB_TYPE_INTRA16x16, 2, 16}, +{MB_TYPE_INTRA16x16, 1, 16}, +{MB_TYPE_INTRA16x16, 0, 16}, +{MB_TYPE_INTRA16x16, 3, 16}, +{MB_TYPE_INTRA16x16, 2, 32}, +{MB_TYPE_INTRA16x16, 1, 32}, +{MB_TYPE_INTRA16x16, 0, 32}, +{MB_TYPE_INTRA16x16, 3, 32}, +{MB_TYPE_INTRA16x16, 2, 15+0}, +{MB_TYPE_INTRA16x16, 1, 15+0}, +{MB_TYPE_INTRA16x16, 0, 15+0}, +{MB_TYPE_INTRA16x16, 3, 15+0}, +{MB_TYPE_INTRA16x16, 2, 15+16}, +{MB_TYPE_INTRA16x16, 1, 15+16}, +{MB_TYPE_INTRA16x16, 0, 15+16}, +{MB_TYPE_INTRA16x16, 3, 15+16}, +{MB_TYPE_INTRA16x16, 2, 15+32}, +{MB_TYPE_INTRA16x16, 1, 15+32}, +{MB_TYPE_INTRA16x16, 0, 15+32}, +{MB_TYPE_INTRA16x16, 3, 15+32}, +{MB_TYPE_INTRA_PCM , -1, -1}, +}; + +typedef struct PMbInfo{ + uint16_t type; + uint8_t partition_count; +} PMbInfo; + +static const PMbInfo 
p_mb_type_info[5]={ +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 4}, +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4}, +}; + +static const PMbInfo p_sub_mb_type_info[4]={ +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, +{MB_TYPE_16x8 |MB_TYPE_P0L0 , 2}, +{MB_TYPE_8x16 |MB_TYPE_P0L0 , 2}, +{MB_TYPE_8x8 |MB_TYPE_P0L0 , 4}, +}; + +static const PMbInfo b_mb_type_info[23]={ +{MB_TYPE_DIRECT2|MB_TYPE_L0L1 , 1, }, +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, +{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, +{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, +}; + +static const PMbInfo b_sub_mb_type_info[13]={ +{MB_TYPE_DIRECT2 , 1, }, +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, +{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, +{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, 
+{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, +{MB_TYPE_8x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 4, }, +{MB_TYPE_8x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 4, }, +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, +}; + +static const uint8_t dequant4_coeff_init[6][3]={ + {10,13,16}, + {11,14,18}, + {13,16,20}, + {14,18,23}, + {16,20,25}, + {18,23,29}, +}; + +static const uint8_t dequant8_coeff_init_scan[16] = { + 0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1 +}; +static const uint8_t dequant8_coeff_init[6][6]={ + {20,18,32,19,25,24}, + {22,19,35,21,28,26}, + {26,23,42,24,33,31}, + {28,25,45,26,35,33}, + {32,28,51,30,40,38}, + {36,32,58,34,46,43}, +}; + +#endif /* AVCODEC_H264DATA_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_deblock.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_deblock.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,507 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... loop filter + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 loop filter. + * @author Michael Niedermayer + */ + +#include "dsputil.h" +#include "mathops.h" +#include "rectangle.h" +#include "h264_types.h" +#include "h264_misc.h" +#include "h264_data.h" +//#undef NDEBUG +#include + +/* Deblocking filter (p153) */ +static const uint8_t alpha_table[52*3] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 4, 5, 6, + 7, 8, 9, 10, 12, 13, 15, 17, 20, 22, + 25, 28, 32, 36, 40, 45, 50, 56, 63, 71, + 80, 90,101,113,127,144,162,182,203,226, + 255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255, + 255,255,255,255,255,255,255,255,255,255,255,255,255, +}; +static const uint8_t beta_table[52*3] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, + 3, 3, 3, 4, 4, 4, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, + 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, + 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, +}; +static const uint8_t tc0_table[52*3][4] = { + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 
0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 }, + {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 }, + {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, + {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 }, + {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 }, + {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 }, + {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 
{-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, +}; + +av_always_inline static void filter_mb_edgev( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s) { + const unsigned int index_a = qp + s->slice_alpha_c0_offset; + const int alpha = alpha_table[index_a]; + const int beta = beta_table[qp + s->slice_beta_offset]; + if (alpha ==0 || beta == 0) return; + + if( bS[0] < 4 ) { + int8_t tc[4]; + tc[0] = tc0_table[index_a][bS[0]]; + tc[1] = tc0_table[index_a][bS[1]]; + tc[2] = tc0_table[index_a][bS[2]]; + tc[3] = tc0_table[index_a][bS[3]]; + mrc->hdsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc); + } else { + mrc->hdsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta); + } +} + +av_always_inline static void filter_mb_edgecv( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) { + const unsigned int index_a = qp + s->slice_alpha_c0_offset; + const int alpha = alpha_table[index_a]; + const int beta = beta_table[qp + s->slice_beta_offset]; + if (alpha ==0 || beta == 0) return; + + if( bS[0] < 4 ) { + int8_t tc[4]; + tc[0] = tc0_table[index_a][bS[0]]+1; + tc[1] = tc0_table[index_a][bS[1]]+1; + tc[2] = tc0_table[index_a][bS[2]]+1; + tc[3] = tc0_table[index_a][bS[3]]+1; + mrc->hdsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc); + } else { + mrc->hdsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta); + } +} + + +av_always_inline static void filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) { + const unsigned int index_a = qp + s->slice_alpha_c0_offset; + const int alpha = 
alpha_table[index_a]; + const int beta = beta_table[qp + s->slice_beta_offset]; + if (alpha ==0 || beta == 0) return; + + if( bS[0] < 4 ) { + int8_t tc[4]; + tc[0] = tc0_table[index_a][bS[0]]; + tc[1] = tc0_table[index_a][bS[1]]; + tc[2] = tc0_table[index_a][bS[2]]; + tc[3] = tc0_table[index_a][bS[3]]; + mrc->hdsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc); + } else { + mrc->hdsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta); + } +} + +av_always_inline static void filter_mb_edgech( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) { + const unsigned int index_a = qp + s->slice_alpha_c0_offset; + const int alpha = alpha_table[index_a]; + const int beta = beta_table[qp + s->slice_beta_offset]; + if (alpha ==0 || beta == 0) return; + + if( bS[0] < 4 ) { + int8_t tc[4]; + tc[0] = tc0_table[index_a][bS[0]]+1; + tc[1] = tc0_table[index_a][bS[1]]+1; + tc[2] = tc0_table[index_a][bS[2]]+1; + tc[3] = tc0_table[index_a][bS[3]]+1; + mrc->hdsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc); + } else { + mrc->hdsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta); + } +} + +static av_always_inline void filter_mb_dir(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, int dir) { + const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type; + const int qp_xy= m->qscale_mb_xy; + const int qp_dir = dir == 0 ? m->qscale_left_mb_xy : m->qscale_top_mb_xy; + const int linesize = mrc->linesize; + const int uvlinesize = mrc->uvlinesize; + const int mb_type = m->mb_type; + int edge; + const int edges = mrs->edges[dir]; + + if(mbm_type){ + int16_t* bS=mrs->bS[dir][0]; + /* Filter edge */ + // Do not use s->qscale as luma quantizer because it has not the same + // value in IPCM macroblocks. 
+ if(bS[0]+bS[1]+bS[2]+bS[3]){ + int qp = ( qp_xy + qp_dir + 1 ) >> 1; + if( dir == 0 ) { + filter_mb_edgev( &img_y[0], linesize, bS, qp, mrc, s ); + { + int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; + filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp, mrc, s); + filter_mb_edgecv( &img_cr[0], uvlinesize, bS, qp, mrc, s); + } + } else { + filter_mb_edgeh( &img_y[0], linesize, bS, qp, mrc, s ); + { + int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; + filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp, mrc, s); + filter_mb_edgech( &img_cr[0], uvlinesize, bS, qp, mrc, s); + } + } + } + } + + for( edge = 1; edge < edges; edge++ ) { + int16_t* bS=mrs->bS[dir][edge]; + int qp = qp_xy; + + if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) + continue; + + if(bS[0]+bS[1]+bS[2]+bS[3] == 0) + continue; + + /* Filter edge */ + // Do not use s->qscale as luma quantizer because it has not the same + // value in IPCM macroblocks. + + if( dir == 0 ) { + filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, mrc, s); + if( (edge&1) == 0 ) { + filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s); + filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s); + } + } else { + filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, mrc, s ); + if( (edge&1) == 0 ) { + filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s); + filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s); + } + } + } +} + +static int check_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, long b_idx, long bn_idx, int mvy_limit){ + int v; + v= mrs->ref_cache[0][b_idx] != mrs->ref_cache[0][bn_idx]; + if(!v && mrs->ref_cache[0][b_idx]!=-1) + // absolute value >= 7 | ... 
+ v= ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) | + ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit); + + if(s->list_count==2){ + if(!v) + v = (mrs->ref_cache[1][b_idx] != mrs->ref_cache[1][bn_idx]) | + ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) | + ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit); + + if(v){ + if((mrs->ref_cache[0][b_idx] != mrs->ref_cache[1][bn_idx]) | + (mrs->ref_cache[1][b_idx] != mrs->ref_cache[0][bn_idx])) + return 1; + return + ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) | + ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit) | + ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) | + ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit); + } + } + + return v; +} + +static void calc_bS_values(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mvy_limit, int dir) { + int mb_type = m->mb_type; + int edge; + const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type; + + // how often to recheck mv-based bS when iterating between edges + static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1}, + {0,3,1,1,3,3,3,3}}; + const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7]; + const int edges = mask_edge== 3 && !(m->cbp&15) ? 1 : 4; + // how often to recheck mv-based bS when iterating along each edge + const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); + + mrs->edges[dir]= edges; + + if(mbm_type){ + int16_t* bS=mrs->bS[dir][0]; + if( IS_INTRA(mb_type|mbm_type)) { + AV_WN64A(bS, 0x0004000400040004ULL); + } else { + int i; + int mv_done; + if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { + int b_idx= 8 + 4; + int bn_idx= b_idx - (dir ? 
8:1); + + bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, 8 + 4, bn_idx, mvy_limit); + mv_done = 1; + } + else + mv_done = 0; + + for( i = 0; i < 4; i++ ) { + int x = dir == 0 ? 0 : i; + int y = dir == 0 ? i : 0; + int b_idx= 8 + 4 + x + 8*y; + int bn_idx= b_idx - (dir ? 8:1); + + if( mrs->non_zero_count_cache[b_idx] | + mrs->non_zero_count_cache[bn_idx] ) { + bS[i] = 2; + } + else if(!mv_done) + { + bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit); + } + } + } + } + + /* Calculate bS */ + for( edge = 1; edge < edges; edge++ ) { + int16_t* bS=mrs->bS[dir][edge]; + + if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) + continue; + + if( IS_INTRA(mb_type)) { + AV_WN64A(bS, 0x0003000300030003ULL); + } else { + int i; + int mv_done; + + if( edge & mask_edge ) { + AV_ZERO64(bS); + mv_done = 1; + } + else if( mask_par0 ) { + int b_idx= 8 + 4 + edge * (dir ? 8:1); + int bn_idx= b_idx - (dir ? 8:1); + + bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit); + mv_done = 1; + } + else + mv_done = 0; + + for( i = 0; i < 4; i++ ) { + int x = dir == 0 ? edge : i; + int y = dir == 0 ? i : edge; + int b_idx= 8 + 4 + x + 8*y; + int bn_idx= b_idx - (dir ? 
8:1); + + if( mrs->non_zero_count_cache[b_idx] | + mrs->non_zero_count_cache[bn_idx] ) { + bS[i] = 2; + } + else if(!mv_done) + { + bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit); + } + } + + if(bS[0]+bS[1]+bS[2]+bS[3] == 0) + continue; + } + + } +} + + +/** +* +* @return zero if the loop filter can be skiped +*/ +static int fill_filter_caches(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){ + H264Mb *m_top = m - mrc->mb_width; + H264Mb *m_left = m - 1; + const int mb_x = m->mb_x; + const int mb_y = m->mb_y; + int top_type, left_type; + int qp, top_qp, left_qp; + int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice + + qp = m->qscale_mb_xy ; + left_qp = m->qscale_left_mb_xy ; + top_qp = m->qscale_top_mb_xy ; + + //for sufficiently low qp, filtering wouldn't do anything + //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp + if(qp <= qp_thresh + && (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh) + && ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){ + return 0; + } + + if(IS_INTRA(mb_type)){ + return 1; + } + + { + int list; + for(list=0; listlist_count; list++){ + int8_t *ref; + + if(!USES_LIST(mb_type, list)){ + fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); + fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); + AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + AV_WN32A(&mrs->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u); + continue; + } + + ref = &mrs->ref_index[list][4*mb_x]; + { + int (*ref2frm)[64] =(void *) (s->ref2frm[0] + 2); + AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + 
AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + ref += 2; + + AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + AV_WN32A(&mrs->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); + } + } + } + + /* + 0 . T T. T T T T + 1 L . .L . . . . + 2 L . .L . . . . + 3 . T TL . . . . + 4 L . .L . . . . + 5 L . .. . . . . + */ + + if (IS_SKIP(mb_type)){ + memset(mrs->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui + } + + //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) + top_type = mrs->top_type; + left_type = mrs->left_type; + if(top_type){ + AV_COPY32(&mrs->non_zero_count_cache[4+8*0], &m_top->non_zero_count[3*4]); + } + + if(left_type){ + mrs->non_zero_count_cache[3+8*1]= m_left->non_zero_count[3+0*4]; + mrs->non_zero_count_cache[3+8*2]= m_left->non_zero_count[3+1*4]; + mrs->non_zero_count_cache[3+8*3]= m_left->non_zero_count[3+2*4]; + mrs->non_zero_count_cache[3+8*4]= m_left->non_zero_count[3+3*4]; + } + + if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){ + int list; + for(list=0; listlist_count; list++){ + if(USES_LIST(top_type, list)){ + const int b_xy= 4*mb_x + 3*mrc->b_stride; + const int b8_x= 4*mb_x + 2; + int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); + AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]); + + mrs->ref_cache[list][scan8[0] + 0 - 1*8]= + mrs->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 0]]; + mrs->ref_cache[list][scan8[0] + 2 - 1*8]= + mrs->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 1]]; + }else{ + AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]); + AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u); + } + + if(USES_LIST(left_type, list)){ 
+ const int b_x = 4*(mb_x-1) + 3; + const int b8_x= 4*(mb_x-1) + 1; + int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); + AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 0 ], mrs->motion_val[list][b_x + mrc->b_stride*0]); + AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 8 ], mrs->motion_val[list][b_x + mrc->b_stride*1]); + AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +16 ], mrs->motion_val[list][b_x + mrc->b_stride*2]); + AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +24 ], mrs->motion_val[list][b_x + mrc->b_stride*3]); + + mrs->ref_cache[list][scan8[0] - 1 + 0 ]= + mrs->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*0]]; + mrs->ref_cache[list][scan8[0] - 1 +16 ]= + mrs->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*1]]; + + }else{ + AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 0 ]); + AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 8 ]); + AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +16 ]); + AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +24 ]); + + mrs->ref_cache[list][scan8[0] - 1 + 0 ]= + mrs->ref_cache[list][scan8[0] - 1 + 8 ]= + mrs->ref_cache[list][scan8[0] - 1 + 16 ]= + mrs->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED; + } + } + } + return 1; +} + +void ff_h264_filter_mb(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr) { + if (fill_filter_caches(mrc, mrs, s, m, m->mb_type)){ + calc_bS_values(mrc, mrs, s, m, 4, 0); + calc_bS_values(mrc, mrs, s, m, 4, 1); + filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 0); + filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 1); + } +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_deblock.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_deblock.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,8 @@ +#ifndef H264_LOOPFILTER_H +#define H264_LOOPFILTER_H + +#include "h264_types.h" + +void ff_h264_filter_mb(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t 
*img_y, uint8_t *img_cb, uint8_t *img_cr); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_dsp.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_dsp.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,320 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003-2010 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 DSP functions. 
+ * @author Michael Niedermayer + */ + +#include +#include "avcodec.h" +#include "h264_dsp.h" + +#define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom ) +#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) +#define H264_WEIGHT(W,H) \ +static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ + int y; \ + offset <<= log2_denom; \ + if(log2_denom) offset += 1<<(log2_denom-1); \ + for(y=0; y> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); + tc++; + } + if( FFABS( q2 - q0 ) < beta ) { + if(tc0[i]) + pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); + tc++; + } + + i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */ + pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ + } + pix += ystride; + } + } +} +static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0); +} +static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0); +} + +static av_always_inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) +{ + int d; + for( d = 0; d < 16; d++ ) { + const int p2 = pix[-3*xstride]; + const int p1 = pix[-2*xstride]; + const int p0 = pix[-1*xstride]; + + const int q0 = pix[ 0*xstride]; + const int q1 = pix[ 1*xstride]; + const int q2 = pix[ 2*xstride]; + + if( FFABS( p0 - q0 ) < alpha && + FFABS( p1 - p0 ) < beta && + FFABS( q1 - q0 ) < beta ) { + + if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ + if( FFABS( p2 - p0 ) < beta) + { + const int p3 = pix[-4*xstride]; + /* p0', p1', p2' */ + pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; + pix[-2*xstride] = ( p2 
+ p1 + p0 + q0 + 2 ) >> 2; + pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; + } else { + /* p0' */ + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; + } + if( FFABS( q2 - q0 ) < beta) + { + const int q3 = pix[3*xstride]; + /* q0', q1', q2' */ + pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; + pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; + pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; + } else { + /* q0' */ + pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; + } + }else{ + /* p0', q0' */ + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; + pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; + } + } + pix += ystride; + } +} +static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +{ + h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta); +} +static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +{ + h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta); +} + +static av_always_inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) +{ + int i, d; + for( i = 0; i < 4; i++ ) { + const int tc = tc0[i]; + if( tc <= 0 ) { + pix += 2*ystride; + continue; + } + for( d = 0; d < 2; d++ ) { + const int p0 = pix[-1*xstride]; + const int p1 = pix[-2*xstride]; + const int q0 = pix[0]; + const int q1 = pix[1*xstride]; + + if( FFABS( p0 - q0 ) < alpha && + FFABS( p1 - p0 ) < beta && + FFABS( q1 - q0 ) < beta ) { + + int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + + pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */ + pix[0] = av_clip_uint8( q0 - delta ); /* q0' */ + } + pix += ystride; + } + } +} +static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0); +} +static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + 
h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0); +} + +static av_always_inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) +{ + int d; + for( d = 0; d < 8; d++ ) { + const int p0 = pix[-1*xstride]; + const int p1 = pix[-2*xstride]; + const int q0 = pix[0]; + const int q1 = pix[1*xstride]; + + if( FFABS( p0 - q0 ) < alpha && + FFABS( p1 - p0 ) < beta && + FFABS( q1 - q0 ) < beta ) { + + pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ + pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ + } + pix += ystride; + } +} +static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +{ + h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta); +} +static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) +{ + h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta); +} + +void ff_h264dsp_init(H264DSPContext *c) +{ + c->h264_idct_add= ff_h264_idct_add_c; + c->h264_idct8_add= ff_h264_idct8_add_c; + c->h264_idct_dc_add= ff_h264_idct_dc_add_c; + c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c; + c->h264_idct_add16 = ff_h264_idct_add16_c; + c->h264_idct8_add4 = ff_h264_idct8_add4_c; + c->h264_idct_add8 = ff_h264_idct_add8_c; + c->h264_idct_add16intra= ff_h264_idct_add16intra_c; + + c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; + c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; + c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c; + c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c; + c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c; + c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c; + c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c; + c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c; + c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c; + c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c; + c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; + c->biweight_h264_pixels_tab[1]= 
biweight_h264_pixels16x8_c; + c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c; + c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c; + c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c; + c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c; + c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c; + c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c; + c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c; + c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c; + + c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; + c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c; + c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c; + c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c; + c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c; + c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; + c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; + c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; + c->h264_loop_filter_strength= NULL; + + if (ARCH_ARM) ff_h264dsp_init_arm(c); + if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c); + if (HAVE_MMX) ff_h264dsp_init_x86(c); +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_dsp.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_dsp.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2003-2010 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 DSP functions. + * @author Michael Niedermayer + */ + +#ifndef AVCODEC_H264DSP_H +#define AVCODEC_H264DSP_H + +#include +#include "dsputil.h" + +//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); +typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); +typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); + +/** + * Context for storing H.264 DSP functions + */ +typedef struct H264DSPContext{ + /* weighted MC */ + h264_weight_func weight_h264_pixels_tab[10]; + h264_biweight_func biweight_h264_pixels_tab[10]; + + /* loop filter */ + void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); + void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0); + /* v/h_loop_filter_luma_intra: align 16 */ + void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); + void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); + void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0); + void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0); + void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); + void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); + // h264_loop_filter_strength: simd only. 
the C version is inlined in h264.c + void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], + int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field); + + /* IDCT */ + /* NOTE!!! if you implement any of h264_idct8_add, h264_idct8_add4 then you must implement all of them + NOTE!!! if you implement any of h264_idct_add, h264_idct_add16, h264_idct_add16intra, h264_idct_add8 then you must implement all of them + The reason for above, is that no 2 out of one list may use a different permutation. + */ + void (*h264_idct_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride); + void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); + void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride); + void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); + void (*h264_dct)(DCTELEM block[4][4]); + void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); + void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); + void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); + void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); + + qpel_mc_func (*qpel_put)[16]; + qpel_mc_func (*qpel_avg)[16]; +}H264DSPContext; + +void ff_h264dsp_init(H264DSPContext *c); +void ff_h264dsp_init_arm(H264DSPContext *c); +void ff_h264dsp_init_ppc(H264DSPContext *c); +void ff_h264dsp_init_x86(H264DSPContext *c); + +#endif /* AVCODEC_H264DSP_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_entropy.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_entropy.c Tue 
Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,2065 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 cabac decoding. + * @author Michael Niedermayer + */ + +#include "avcodec.h" +#include "h264_types.h" +#include "h264_data.h" +#include "cabac.h" +#include "rectangle.h" +#include "h264_misc.h" + +// #undef NDEBUG +#include + +/* Cabac pre state table */ + +static const int8_t cabac_context_init_I[460][2] = +{ + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28,127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 }, { 7, 51 }, + + /* 11 - 23 unsused for I */ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, + + /* 24- 39 */ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + + /* 40 - 53 */ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, + + /* 54 - 59 */ + { 0, 0 }, { 0, 0 }, { 
0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 }, + + /* 60 - 69 */ + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 -> 87 */ + { 0, 11 }, { 1, 55 }, { 0, 69 }, { -17, 127 }, + { -13, 102 },{ 0, 82 }, { -7, 74 }, { -21, 107 }, + { -27, 127 },{ -31, 127 },{ -24, 127 }, { -18, 95 }, + { -27, 127 },{ -21, 114 },{ -30, 127 }, { -17, 123 }, + { -12, 115 },{ -16, 122 }, + + /* 88 -> 104 */ + { -11, 115 },{ -12, 63 }, { -2, 68 }, { -15, 84 }, + { -13, 104 },{ -3, 70 }, { -8, 93 }, { -10, 90 }, + { -30, 127 },{ -1, 74 }, { -6, 97 }, { -7, 91 }, + { -20, 127 },{ -4, 56 }, { -5, 82 }, { -7, 76 }, + { -22, 125 }, + + /* 105 -> 135 */ + { -7, 93 }, { -11, 87 }, { -3, 77 }, { -5, 71 }, + { -4, 63 }, { -4, 68 }, { -12, 84 }, { -7, 62 }, + { -7, 65 }, { 8, 61 }, { 5, 56 }, { -2, 66 }, + { 1, 64 }, { 0, 61 }, { -2, 78 }, { 1, 50 }, + { 7, 52 }, { 10, 35 }, { 0, 44 }, { 11, 38 }, + { 1, 45 }, { 0, 46 }, { 5, 44 }, { 31, 17 }, + { 1, 51 }, { 7, 50 }, { 28, 19 }, { 16, 33 }, + { 14, 62 }, { -13, 108 },{ -15, 100 }, + + /* 136 -> 165 */ + { -13, 101 },{ -13, 91 }, { -12, 94 }, { -10, 88 }, + { -16, 84 }, { -10, 86 }, { -7, 83 }, { -13, 87 }, + { -19, 94 }, { 1, 70 }, { 0, 72 }, { -5, 74 }, + { 18, 59 }, { -8, 102 }, { -15, 100 }, { 0, 95 }, + { -4, 75 }, { 2, 72 }, { -11, 75 }, { -3, 71 }, + { 15, 46 }, { -13, 69 }, { 0, 62 }, { 0, 65 }, + { 21, 37 }, { -15, 72 }, { 9, 57 }, { 16, 54 }, + { 0, 62 }, { 12, 72 }, + + /* 166 -> 196 */ + { 24, 0 }, { 15, 9 }, { 8, 25 }, { 13, 18 }, + { 15, 9 }, { 13, 19 }, { 10, 37 }, { 12, 18 }, + { 6, 29 }, { 20, 33 }, { 15, 30 }, { 4, 45 }, + { 1, 58 }, { 0, 62 }, { 7, 61 }, { 12, 38 }, + { 11, 45 }, { 15, 39 }, { 11, 42 }, { 13, 44 }, + { 16, 45 }, { 12, 41 }, { 10, 49 }, { 30, 34 }, + { 18, 42 }, { 10, 55 }, { 17, 51 }, { 17, 46 }, + { 0, 89 }, { 26, -19 }, { 22, -17 }, + + /* 197 -> 226 */ + { 26, -17 }, { 30, -25 }, { 28, -20 }, { 33, -23 }, + { 37, -27 }, { 33, -23 }, { 40, -28 
}, { 38, -17 }, + { 33, -11 }, { 40, -15 }, { 41, -6 }, { 38, 1 }, + { 41, 17 }, { 30, -6 }, { 27, 3 }, { 26, 22 }, + { 37, -16 }, { 35, -4 }, { 38, -8 }, { 38, -3 }, + { 37, 3 }, { 38, 5 }, { 42, 0 }, { 35, 16 }, + { 39, 22 }, { 14, 48 }, { 27, 37 }, { 21, 60 }, + { 12, 68 }, { 2, 97 }, + + /* 227 -> 251 */ + { -3, 71 }, { -6, 42 }, { -5, 50 }, { -3, 54 }, + { -2, 62 }, { 0, 58 }, { 1, 63 }, { -2, 72 }, + { -1, 74 }, { -9, 91 }, { -5, 67 }, { -5, 27 }, + { -3, 39 }, { -2, 44 }, { 0, 46 }, { -16, 64 }, + { -8, 68 }, { -10, 78 }, { -6, 77 }, { -10, 86 }, + { -12, 92 }, { -15, 55 }, { -10, 60 }, { -6, 62 }, + { -4, 65 }, + + /* 252 -> 275 */ + { -12, 73 }, { -8, 76 }, { -7, 80 }, { -9, 88 }, + { -17, 110 },{ -11, 97 }, { -20, 84 }, { -11, 79 }, + { -6, 73 }, { -4, 74 }, { -13, 86 }, { -13, 96 }, + { -11, 97 }, { -19, 117 },{ -8, 78 }, { -5, 33 }, + { -4, 48 }, { -2, 53 }, { -3, 62 }, { -13, 71 }, + { -10, 79 }, { -12, 86 }, { -13, 90 }, { -14, 97 }, + + /* 276 a bit special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 -> 307 */ + { -6, 93 }, { -6, 84 }, { -8, 79 }, { 0, 66 }, + { -1, 71 }, { 0, 62 }, { -2, 60 }, { -2, 59 }, + { -5, 75 }, { -3, 62 }, { -4, 58 }, { -9, 66 }, + { -1, 79 }, { 0, 71 }, { 3, 68 }, { 10, 44 }, + { -7, 62 }, { 15, 36 }, { 14, 40 }, { 16, 27 }, + { 12, 29 }, { 1, 44 }, { 20, 36 }, { 18, 32 }, + { 5, 42 }, { 1, 48 }, { 10, 62 }, { 17, 46 }, + { 9, 64 }, { -12, 104 },{ -11, 97 }, + + /* 308 -> 337 */ + { -16, 96 }, { -7, 88 }, { -8, 85 }, { -7, 85 }, + { -9, 85 }, { -13, 88 }, { 4, 66 }, { -3, 77 }, + { -3, 76 }, { -6, 76 }, { 10, 58 }, { -1, 76 }, + { -1, 83 }, { -7, 99 }, { -14, 95 }, { 2, 95 }, + { 0, 76 }, { -5, 74 }, { 0, 70 }, { -11, 75 }, + { 1, 68 }, { 0, 65 }, { -14, 73 }, { 3, 62 }, + { 4, 62 }, { -1, 68 }, { -13, 75 }, { 11, 55 }, + { 5, 64 }, { 12, 70 }, + + /* 338 -> 368 */ + { 15, 6 }, { 6, 19 }, { 7, 16 }, { 12, 14 }, + { 18, 13 }, { 13, 11 }, { 13, 15 }, { 15, 16 }, + { 12, 23 }, { 13, 23 }, { 15, 20 }, { 14, 26 
}, + { 14, 44 }, { 17, 40 }, { 17, 47 }, { 24, 17 }, + { 21, 21 }, { 25, 22 }, { 31, 27 }, { 22, 29 }, + { 19, 35 }, { 14, 50 }, { 10, 57 }, { 7, 63 }, + { -2, 77 }, { -4, 82 }, { -3, 94 }, { 9, 69 }, + { -12, 109 },{ 36, -35 }, { 36, -34 }, + + /* 369 -> 398 */ + { 32, -26 }, { 37, -30 }, { 44, -32 }, { 34, -18 }, + { 34, -15 }, { 40, -15 }, { 33, -7 }, { 35, -5 }, + { 33, 0 }, { 38, 2 }, { 33, 13 }, { 23, 35 }, + { 13, 58 }, { 29, -3 }, { 26, 0 }, { 22, 30 }, + { 31, -7 }, { 35, -15 }, { 34, -3 }, { 34, 3 }, + { 36, -1 }, { 34, 5 }, { 32, 11 }, { 35, 5 }, + { 34, 12 }, { 39, 11 }, { 30, 29 }, { 34, 26 }, + { 29, 39 }, { 19, 66 }, + + /* 399 -> 435 */ + { 31, 21 }, { 31, 31 }, { 25, 50 }, + { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11, 85 }, + { -15, 92 }, { -14, 89 }, { -26, 71 }, { -15, 81 }, + { -14, 80 }, { 0, 68 }, { -14, 70 }, { -24, 56 }, + { -23, 68 }, { -24, 50 }, { -11, 74 }, { 23, -13 }, + { 26, -13 }, { 40, -15 }, { 49, -14 }, { 44, 3 }, + { 45, 6 }, { 44, 34 }, { 33, 54 }, { 19, 82 }, + { -3, 75 }, { -1, 23 }, { 1, 34 }, { 1, 43 }, + { 0, 54 }, { -2, 55 }, { 0, 61 }, { 1, 64 }, + { 0, 68 }, { -9, 92 }, + + /* 436 -> 459 */ + { -14, 106 }, { -13, 97 }, { -15, 90 }, { -12, 90 }, + { -18, 88 }, { -10, 73 }, { -9, 79 }, { -14, 86 }, + { -10, 73 }, { -10, 70 }, { -10, 69 }, { -5, 66 }, + { -9, 64 }, { -5, 58 }, { 2, 59 }, { 21, -10 }, + { 24, -11 }, { 28, -8 }, { 28, -1 }, { 29, 3 }, + { 29, 9 }, { 35, 20 }, { 29, 36 }, { 14, 67 } +}; + +static const int8_t cabac_context_init_PB[3][460][2] = +{ + /* i_cabac_init_idc == 0 */ + { + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 }, { 7, 51 }, + + /* 11 - 23 */ + { 23, 33 }, { 23, 2 }, { 21, 0 }, { 1, 9 }, + { 0, 49 }, { -37, 118 }, { 5, 57 }, { -13, 78 }, + { -11, 65 }, { 1, 62 }, { 12, 49 }, { -4, 73 }, + { 17, 50 }, + + /* 24 - 39 */ + { 18, 64 }, { 9, 43 }, { 29, 0 }, { 26, 67 }, + { 16, 90 }, { 9, 104 }, { -46, 
127 }, { -20, 104 }, + { 1, 67 }, { -13, 78 }, { -11, 65 }, { 1, 62 }, + { -6, 86 }, { -17, 95 }, { -6, 61 }, { 9, 45 }, + + /* 40 - 53 */ + { -3, 69 }, { -6, 81 }, { -11, 96 }, { 6, 55 }, + { 7, 67 }, { -5, 86 }, { 2, 88 }, { 0, 58 }, + { -3, 76 }, { -10, 94 }, { 5, 54 }, { 4, 69 }, + { -3, 81 }, { 0, 88 }, + + /* 54 - 59 */ + { -7, 67 }, { -5, 74 }, { -4, 74 }, { -5, 80 }, + { -7, 72 }, { 1, 58 }, + + /* 60 - 69 */ + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 - 87 */ + { 0, 45 }, { -4, 78 }, { -3, 96 }, { -27, 126 }, + { -28, 98 }, { -25, 101 }, { -23, 67 }, { -28, 82 }, + { -20, 94 }, { -16, 83 }, { -22, 110 }, { -21, 91 }, + { -18, 102 }, { -13, 93 }, { -29, 127 }, { -7, 92 }, + { -5, 89 }, { -7, 96 }, { -13, 108 }, { -3, 46 }, + { -1, 65 }, { -1, 57 }, { -9, 93 }, { -3, 74 }, + { -9, 92 }, { -8, 87 }, { -23, 126 }, { 5, 54 }, + { 6, 60 }, { 6, 59 }, { 6, 69 }, { -1, 48 }, + { 0, 68 }, { -4, 69 }, { -8, 88 }, + + /* 105 -> 165 */ + { -2, 85 }, { -6, 78 }, { -1, 75 }, { -7, 77 }, + { 2, 54 }, { 5, 50 }, { -3, 68 }, { 1, 50 }, + { 6, 42 }, { -4, 81 }, { 1, 63 }, { -4, 70 }, + { 0, 67 }, { 2, 57 }, { -2, 76 }, { 11, 35 }, + { 4, 64 }, { 1, 61 }, { 11, 35 }, { 18, 25 }, + { 12, 24 }, { 13, 29 }, { 13, 36 }, { -10, 93 }, + { -7, 73 }, { -2, 73 }, { 13, 46 }, { 9, 49 }, + { -7, 100 }, { 9, 53 }, { 2, 53 }, { 5, 53 }, + { -2, 61 }, { 0, 56 }, { 0, 56 }, { -13, 63 }, + { -5, 60 }, { -1, 62 }, { 4, 57 }, { -6, 69 }, + { 4, 57 }, { 14, 39 }, { 4, 51 }, { 13, 68 }, + { 3, 64 }, { 1, 61 }, { 9, 63 }, { 7, 50 }, + { 16, 39 }, { 5, 44 }, { 4, 52 }, { 11, 48 }, + { -5, 60 }, { -1, 59 }, { 0, 59 }, { 22, 33 }, + { 5, 44 }, { 14, 43 }, { -1, 78 }, { 0, 60 }, + { 9, 69 }, + + /* 166 - 226 */ + { 11, 28 }, { 2, 40 }, { 3, 44 }, { 0, 49 }, + { 0, 46 }, { 2, 44 }, { 2, 51 }, { 0, 47 }, + { 4, 39 }, { 2, 62 }, { 6, 46 }, { 0, 54 }, + { 3, 54 }, { 2, 58 }, { 4, 63 }, { 6, 51 }, + { 6, 57 }, { 7, 53 }, 
{ 6, 52 }, { 6, 55 }, + { 11, 45 }, { 14, 36 }, { 8, 53 }, { -1, 82 }, + { 7, 55 }, { -3, 78 }, { 15, 46 }, { 22, 31 }, + { -1, 84 }, { 25, 7 }, { 30, -7 }, { 28, 3 }, + { 28, 4 }, { 32, 0 }, { 34, -1 }, { 30, 6 }, + { 30, 6 }, { 32, 9 }, { 31, 19 }, { 26, 27 }, + { 26, 30 }, { 37, 20 }, { 28, 34 }, { 17, 70 }, + { 1, 67 }, { 5, 59 }, { 9, 67 }, { 16, 30 }, + { 18, 32 }, { 18, 35 }, { 22, 29 }, { 24, 31 }, + { 23, 38 }, { 18, 43 }, { 20, 41 }, { 11, 63 }, + { 9, 59 }, { 9, 64 }, { -1, 94 }, { -2, 89 }, + { -9, 108 }, + + /* 227 - 275 */ + { -6, 76 }, { -2, 44 }, { 0, 45 }, { 0, 52 }, + { -3, 64 }, { -2, 59 }, { -4, 70 }, { -4, 75 }, + { -8, 82 }, { -17, 102 }, { -9, 77 }, { 3, 24 }, + { 0, 42 }, { 0, 48 }, { 0, 55 }, { -6, 59 }, + { -7, 71 }, { -12, 83 }, { -11, 87 }, { -30, 119 }, + { 1, 58 }, { -3, 29 }, { -1, 36 }, { 1, 38 }, + { 2, 43 }, { -6, 55 }, { 0, 58 }, { 0, 64 }, + { -3, 74 }, { -10, 90 }, { 0, 70 }, { -4, 29 }, + { 5, 31 }, { 7, 42 }, { 1, 59 }, { -2, 58 }, + { -3, 72 }, { -3, 81 }, { -11, 97 }, { 0, 58 }, + { 8, 5 }, { 10, 14 }, { 14, 18 }, { 13, 27 }, + { 2, 40 }, { 0, 58 }, { -3, 70 }, { -6, 79 }, + { -8, 85 }, + + /* 276 a bit special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 - 337 */ + { -13, 106 }, { -16, 106 }, { -10, 87 }, { -21, 114 }, + { -18, 110 }, { -14, 98 }, { -22, 110 }, { -21, 106 }, + { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 }, + { -10, 96 }, { -12, 95 }, { -5, 91 }, { -9, 93 }, + { -22, 94 }, { -5, 86 }, { 9, 67 }, { -4, 80 }, + { -10, 85 }, { -1, 70 }, { 7, 60 }, { 9, 58 }, + { 5, 61 }, { 12, 50 }, { 15, 50 }, { 18, 49 }, + { 17, 54 }, { 10, 41 }, { 7, 46 }, { -1, 51 }, + { 7, 49 }, { 8, 52 }, { 9, 41 }, { 6, 47 }, + { 2, 55 }, { 13, 41 }, { 10, 44 }, { 6, 50 }, + { 5, 53 }, { 13, 49 }, { 4, 63 }, { 6, 64 }, + { -2, 69 }, { -2, 59 }, { 6, 70 }, { 10, 44 }, + { 9, 31 }, { 12, 43 }, { 3, 53 }, { 14, 34 }, + { 10, 38 }, { -3, 52 }, { 13, 40 }, { 17, 32 }, + { 7, 44 }, { 7, 38 }, { 13, 50 }, { 10, 57 }, + 
{ 26, 43 }, + + /* 338 - 398 */ + { 14, 11 }, { 11, 14 }, { 9, 11 }, { 18, 11 }, + { 21, 9 }, { 23, -2 }, { 32, -15 }, { 32, -15 }, + { 34, -21 }, { 39, -23 }, { 42, -33 }, { 41, -31 }, + { 46, -28 }, { 38, -12 }, { 21, 29 }, { 45, -24 }, + { 53, -45 }, { 48, -26 }, { 65, -43 }, { 43, -19 }, + { 39, -10 }, { 30, 9 }, { 18, 26 }, { 20, 27 }, + { 0, 57 }, { -14, 82 }, { -5, 75 }, { -19, 97 }, + { -35, 125 }, { 27, 0 }, { 28, 0 }, { 31, -4 }, + { 27, 6 }, { 34, 8 }, { 30, 10 }, { 24, 22 }, + { 33, 19 }, { 22, 32 }, { 26, 31 }, { 21, 41 }, + { 26, 44 }, { 23, 47 }, { 16, 65 }, { 14, 71 }, + { 8, 60 }, { 6, 63 }, { 17, 65 }, { 21, 24 }, + { 23, 20 }, { 26, 23 }, { 27, 32 }, { 28, 23 }, + { 28, 24 }, { 23, 40 }, { 24, 32 }, { 28, 29 }, + { 23, 42 }, { 19, 57 }, { 22, 53 }, { 22, 61 }, + { 11, 86 }, + + /* 399 - 435 */ + { 12, 40 }, { 11, 51 }, { 14, 59 }, + { -4, 79 }, { -7, 71 }, { -5, 69 }, { -9, 70 }, + { -8, 66 }, { -10, 68 }, { -19, 73 }, { -12, 69 }, + { -16, 70 }, { -15, 67 }, { -20, 62 }, { -19, 70 }, + { -16, 66 }, { -22, 65 }, { -20, 63 }, { 9, -2 }, + { 26, -9 }, { 33, -9 }, { 39, -7 }, { 41, -2 }, + { 45, 3 }, { 49, 9 }, { 45, 27 }, { 36, 59 }, + { -6, 66 }, { -7, 35 }, { -7, 42 }, { -8, 45 }, + { -5, 48 }, { -12, 56 }, { -6, 60 }, { -5, 62 }, + { -8, 66 }, { -8, 76 }, + + /* 436 - 459 */ + { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, + { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, + { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, + { -14, 66 }, { 0, 59 }, { 2, 59 }, { 21, -13 }, + { 33, -14 }, { 39, -7 }, { 46, -2 }, { 51, 2 }, + { 60, 6 }, { 61, 17 }, { 55, 34 }, { 42, 62 }, + }, + + /* i_cabac_init_idc == 1 */ + { + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 }, { 7, 51 }, + + /* 11 - 23 */ + { 22, 25 }, { 34, 0 }, { 16, 0 }, { -2, 9 }, + { 4, 41 }, { -29, 118 }, { 2, 65 }, { -6, 71 }, + { -13, 79 }, { 5, 52 }, { 9, 50 }, { -3, 70 }, + { 10, 54 }, + + /* 
24 - 39 */ + { 26, 34 }, { 19, 22 }, { 40, 0 }, { 57, 2 }, + { 41, 36 }, { 26, 69 }, { -45, 127 }, { -15, 101 }, + { -4, 76 }, { -6, 71 }, { -13, 79 }, { 5, 52 }, + { 6, 69 }, { -13, 90 }, { 0, 52 }, { 8, 43 }, + + /* 40 - 53 */ + { -2, 69 },{ -5, 82 },{ -10, 96 },{ 2, 59 }, + { 2, 75 },{ -3, 87 },{ -3, 100 },{ 1, 56 }, + { -3, 74 },{ -6, 85 },{ 0, 59 },{ -3, 81 }, + { -7, 86 },{ -5, 95 }, + + /* 54 - 59 */ + { -1, 66 },{ -1, 77 },{ 1, 70 },{ -2, 86 }, + { -5, 72 },{ 0, 61 }, + + /* 60 - 69 */ + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 - 104 */ + { 13, 15 }, { 7, 51 }, { 2, 80 }, { -39, 127 }, + { -18, 91 }, { -17, 96 }, { -26, 81 }, { -35, 98 }, + { -24, 102 }, { -23, 97 }, { -27, 119 }, { -24, 99 }, + { -21, 110 }, { -18, 102 }, { -36, 127 }, { 0, 80 }, + { -5, 89 }, { -7, 94 }, { -4, 92 }, { 0, 39 }, + { 0, 65 }, { -15, 84 }, { -35, 127 }, { -2, 73 }, + { -12, 104 }, { -9, 91 }, { -31, 127 }, { 3, 55 }, + { 7, 56 }, { 7, 55 }, { 8, 61 }, { -3, 53 }, + { 0, 68 }, { -7, 74 }, { -9, 88 }, + + /* 105 -> 165 */ + { -13, 103 }, { -13, 91 }, { -9, 89 }, { -14, 92 }, + { -8, 76 }, { -12, 87 }, { -23, 110 }, { -24, 105 }, + { -10, 78 }, { -20, 112 }, { -17, 99 }, { -78, 127 }, + { -70, 127 }, { -50, 127 }, { -46, 127 }, { -4, 66 }, + { -5, 78 }, { -4, 71 }, { -8, 72 }, { 2, 59 }, + { -1, 55 }, { -7, 70 }, { -6, 75 }, { -8, 89 }, + { -34, 119 }, { -3, 75 }, { 32, 20 }, { 30, 22 }, + { -44, 127 }, { 0, 54 }, { -5, 61 }, { 0, 58 }, + { -1, 60 }, { -3, 61 }, { -8, 67 }, { -25, 84 }, + { -14, 74 }, { -5, 65 }, { 5, 52 }, { 2, 57 }, + { 0, 61 }, { -9, 69 }, { -11, 70 }, { 18, 55 }, + { -4, 71 }, { 0, 58 }, { 7, 61 }, { 9, 41 }, + { 18, 25 }, { 9, 32 }, { 5, 43 }, { 9, 47 }, + { 0, 44 }, { 0, 51 }, { 2, 46 }, { 19, 38 }, + { -4, 66 }, { 15, 38 }, { 12, 42 }, { 9, 34 }, + { 0, 89 }, + + /* 166 - 226 */ + { 4, 45 }, { 10, 28 }, { 10, 31 }, { 33, -11 }, + { 52, -43 }, { 18, 15 }, { 28, 0 }, { 35, 
-22 }, + { 38, -25 }, { 34, 0 }, { 39, -18 }, { 32, -12 }, + { 102, -94 }, { 0, 0 }, { 56, -15 }, { 33, -4 }, + { 29, 10 }, { 37, -5 }, { 51, -29 }, { 39, -9 }, + { 52, -34 }, { 69, -58 }, { 67, -63 }, { 44, -5 }, + { 32, 7 }, { 55, -29 }, { 32, 1 }, { 0, 0 }, + { 27, 36 }, { 33, -25 }, { 34, -30 }, { 36, -28 }, + { 38, -28 }, { 38, -27 }, { 34, -18 }, { 35, -16 }, + { 34, -14 }, { 32, -8 }, { 37, -6 }, { 35, 0 }, + { 30, 10 }, { 28, 18 }, { 26, 25 }, { 29, 41 }, + { 0, 75 }, { 2, 72 }, { 8, 77 }, { 14, 35 }, + { 18, 31 }, { 17, 35 }, { 21, 30 }, { 17, 45 }, + { 20, 42 }, { 18, 45 }, { 27, 26 }, { 16, 54 }, + { 7, 66 }, { 16, 56 }, { 11, 73 }, { 10, 67 }, + { -10, 116 }, + + /* 227 - 275 */ + { -23, 112 }, { -15, 71 }, { -7, 61 }, { 0, 53 }, + { -5, 66 }, { -11, 77 }, { -9, 80 }, { -9, 84 }, + { -10, 87 }, { -34, 127 }, { -21, 101 }, { -3, 39 }, + { -5, 53 }, { -7, 61 }, { -11, 75 }, { -15, 77 }, + { -17, 91 }, { -25, 107 }, { -25, 111 }, { -28, 122 }, + { -11, 76 }, { -10, 44 }, { -10, 52 }, { -10, 57 }, + { -9, 58 }, { -16, 72 }, { -7, 69 }, { -4, 69 }, + { -5, 74 }, { -9, 86 }, { 2, 66 }, { -9, 34 }, + { 1, 32 }, { 11, 31 }, { 5, 52 }, { -2, 55 }, + { -2, 67 }, { 0, 73 }, { -8, 89 }, { 3, 52 }, + { 7, 4 }, { 10, 8 }, { 17, 8 }, { 16, 19 }, + { 3, 37 }, { -1, 61 }, { -5, 73 }, { -1, 70 }, + { -4, 78 }, + + /* 276 a bit special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 - 337 */ + { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 }, + { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 }, + { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 }, + { -10, 95 }, { -14, 100 }, { -8, 95 }, { -17, 111 }, + { -28, 114 }, { -6, 89 }, { -2, 80 }, { -4, 82 }, + { -9, 85 }, { -8, 81 }, { -1, 72 }, { 5, 64 }, + { 1, 67 }, { 9, 56 }, { 0, 69 }, { 1, 69 }, + { 7, 69 }, { -7, 69 }, { -6, 67 }, { -16, 77 }, + { -2, 64 }, { 2, 61 }, { -6, 67 }, { -3, 64 }, + { 2, 57 }, { -3, 65 }, { -3, 66 }, { 0, 62 }, + { 9, 51 }, { -1, 66 }, { -2, 71 }, { -2, 75 }, + { 
-1, 70 }, { -9, 72 }, { 14, 60 }, { 16, 37 }, + { 0, 47 }, { 18, 35 }, { 11, 37 }, { 12, 41 }, + { 10, 41 }, { 2, 48 }, { 12, 41 }, { 13, 41 }, + { 0, 59 }, { 3, 50 }, { 19, 40 }, { 3, 66 }, + { 18, 50 }, + + /* 338 - 398 */ + { 19, -6 }, { 18, -6 }, { 14, 0 }, { 26, -12 }, + { 31, -16 }, { 33, -25 }, { 33, -22 }, { 37, -28 }, + { 39, -30 }, { 42, -30 }, { 47, -42 }, { 45, -36 }, + { 49, -34 }, { 41, -17 }, { 32, 9 }, { 69, -71 }, + { 63, -63 }, { 66, -64 }, { 77, -74 }, { 54, -39 }, + { 52, -35 }, { 41, -10 }, { 36, 0 }, { 40, -1 }, + { 30, 14 }, { 28, 26 }, { 23, 37 }, { 12, 55 }, + { 11, 65 }, { 37, -33 }, { 39, -36 }, { 40, -37 }, + { 38, -30 }, { 46, -33 }, { 42, -30 }, { 40, -24 }, + { 49, -29 }, { 38, -12 }, { 40, -10 }, { 38, -3 }, + { 46, -5 }, { 31, 20 }, { 29, 30 }, { 25, 44 }, + { 12, 48 }, { 11, 49 }, { 26, 45 }, { 22, 22 }, + { 23, 22 }, { 27, 21 }, { 33, 20 }, { 26, 28 }, + { 30, 24 }, { 27, 34 }, { 18, 42 }, { 25, 39 }, + { 18, 50 }, { 12, 70 }, { 21, 54 }, { 14, 71 }, + { 11, 83 }, + + /* 399 - 435 */ + { 25, 32 }, { 21, 49 }, { 21, 54 }, + { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, + { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, + { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, + { -14, 66 }, { 0, 59 }, { 2, 59 }, { 17, -10 }, + { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, + { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, + { -5, 71 }, { 0, 24 }, { -1, 36 }, { -2, 42 }, + { -2, 52 }, { -9, 57 }, { -6, 63 }, { -4, 65 }, + { -4, 67 }, { -7, 82 }, + + /* 436 - 459 */ + { -3, 81 }, { -3, 76 }, { -7, 72 }, { -6, 78 }, + { -12, 72 }, { -14, 68 }, { -3, 70 }, { -6, 76 }, + { -5, 66 }, { -5, 62 }, { 0, 57 }, { -4, 61 }, + { -9, 60 }, { 1, 54 }, { 2, 58 }, { 17, -10 }, + { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, + { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, + }, + + /* i_cabac_init_idc == 2 */ + { + /* 0 - 10 */ + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, + { -6, 53 }, { -1, 54 
}, { 7, 51 }, + + /* 11 - 23 */ + { 29, 16 }, { 25, 0 }, { 14, 0 }, { -10, 51 }, + { -3, 62 }, { -27, 99 }, { 26, 16 }, { -4, 85 }, + { -24, 102 }, { 5, 57 }, { 6, 57 }, { -17, 73 }, + { 14, 57 }, + + /* 24 - 39 */ + { 20, 40 }, { 20, 10 }, { 29, 0 }, { 54, 0 }, + { 37, 42 }, { 12, 97 }, { -32, 127 }, { -22, 117 }, + { -2, 74 }, { -4, 85 }, { -24, 102 }, { 5, 57 }, + { -6, 93 }, { -14, 88 }, { -6, 44 }, { 4, 55 }, + + /* 40 - 53 */ + { -11, 89 },{ -15, 103 },{ -21, 116 },{ 19, 57 }, + { 20, 58 },{ 4, 84 },{ 6, 96 },{ 1, 63 }, + { -5, 85 },{ -13, 106 },{ 5, 63 },{ 6, 75 }, + { -3, 90 },{ -1, 101 }, + + /* 54 - 59 */ + { 3, 55 },{ -4, 79 },{ -2, 75 },{ -12, 97 }, + { -7, 50 },{ 1, 60 }, + + /* 60 - 69 */ + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, + { 13, 41 }, { 3, 62 }, + + /* 70 - 104 */ + { 7, 34 }, { -9, 88 }, { -20, 127 }, { -36, 127 }, + { -17, 91 }, { -14, 95 }, { -25, 84 }, { -25, 86 }, + { -12, 89 }, { -17, 91 }, { -31, 127 }, { -14, 76 }, + { -18, 103 }, { -13, 90 }, { -37, 127 }, { 11, 80 }, + { 5, 76 }, { 2, 84 }, { 5, 78 }, { -6, 55 }, + { 4, 61 }, { -14, 83 }, { -37, 127 }, { -5, 79 }, + { -11, 104 }, { -11, 91 }, { -30, 127 }, { 0, 65 }, + { -2, 79 }, { 0, 72 }, { -4, 92 }, { -6, 56 }, + { 3, 68 }, { -8, 71 }, { -13, 98 }, + + /* 105 -> 165 */ + { -4, 86 }, { -12, 88 }, { -5, 82 }, { -3, 72 }, + { -4, 67 }, { -8, 72 }, { -16, 89 }, { -9, 69 }, + { -1, 59 }, { 5, 66 }, { 4, 57 }, { -4, 71 }, + { -2, 71 }, { 2, 58 }, { -1, 74 }, { -4, 44 }, + { -1, 69 }, { 0, 62 }, { -7, 51 }, { -4, 47 }, + { -6, 42 }, { -3, 41 }, { -6, 53 }, { 8, 76 }, + { -9, 78 }, { -11, 83 }, { 9, 52 }, { 0, 67 }, + { -5, 90 }, { 1, 67 }, { -15, 72 }, { -5, 75 }, + { -8, 80 }, { -21, 83 }, { -21, 64 }, { -13, 31 }, + { -25, 64 }, { -29, 94 }, { 9, 75 }, { 17, 63 }, + { -8, 74 }, { -5, 35 }, { -2, 27 }, { 13, 91 }, + { 3, 65 }, { -7, 69 }, { 8, 77 }, { -10, 66 }, + { 3, 62 }, { -3, 68 }, { -20, 81 }, { 0, 30 }, + { 1, 7 }, { -3, 23 }, 
{ -21, 74 }, { 16, 66 }, + { -23, 124 }, { 17, 37 }, { 44, -18 }, { 50, -34 }, + { -22, 127 }, + + /* 166 - 226 */ + { 4, 39 }, { 0, 42 }, { 7, 34 }, { 11, 29 }, + { 8, 31 }, { 6, 37 }, { 7, 42 }, { 3, 40 }, + { 8, 33 }, { 13, 43 }, { 13, 36 }, { 4, 47 }, + { 3, 55 }, { 2, 58 }, { 6, 60 }, { 8, 44 }, + { 11, 44 }, { 14, 42 }, { 7, 48 }, { 4, 56 }, + { 4, 52 }, { 13, 37 }, { 9, 49 }, { 19, 58 }, + { 10, 48 }, { 12, 45 }, { 0, 69 }, { 20, 33 }, + { 8, 63 }, { 35, -18 }, { 33, -25 }, { 28, -3 }, + { 24, 10 }, { 27, 0 }, { 34, -14 }, { 52, -44 }, + { 39, -24 }, { 19, 17 }, { 31, 25 }, { 36, 29 }, + { 24, 33 }, { 34, 15 }, { 30, 20 }, { 22, 73 }, + { 20, 34 }, { 19, 31 }, { 27, 44 }, { 19, 16 }, + { 15, 36 }, { 15, 36 }, { 21, 28 }, { 25, 21 }, + { 30, 20 }, { 31, 12 }, { 27, 16 }, { 24, 42 }, + { 0, 93 }, { 14, 56 }, { 15, 57 }, { 26, 38 }, + { -24, 127 }, + + /* 227 - 275 */ + { -24, 115 }, { -22, 82 }, { -9, 62 }, { 0, 53 }, + { 0, 59 }, { -14, 85 }, { -13, 89 }, { -13, 94 }, + { -11, 92 }, { -29, 127 }, { -21, 100 }, { -14, 57 }, + { -12, 67 }, { -11, 71 }, { -10, 77 }, { -21, 85 }, + { -16, 88 }, { -23, 104 }, { -15, 98 }, { -37, 127 }, + { -10, 82 }, { -8, 48 }, { -8, 61 }, { -8, 66 }, + { -7, 70 }, { -14, 75 }, { -10, 79 }, { -9, 83 }, + { -12, 92 }, { -18, 108 }, { -4, 79 }, { -22, 69 }, + { -16, 75 }, { -2, 58 }, { 1, 58 }, { -13, 78 }, + { -9, 83 }, { -4, 81 }, { -13, 99 }, { -13, 81 }, + { -6, 38 }, { -13, 62 }, { -6, 58 }, { -2, 59 }, + { -16, 73 }, { -10, 76 }, { -13, 86 }, { -9, 83 }, + { -10, 87 }, + + /* 276 a bit special (not used, bypass is used instead) */ + { 0, 0 }, + + /* 277 - 337 */ + { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 }, + { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 }, + { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 }, + { -10, 94 }, { -15, 102 }, { -10, 99 }, { -13, 106 }, + { -50, 127 }, { -5, 92 }, { 17, 57 }, { -5, 86 }, + { -13, 94 }, { -12, 91 }, { -2, 77 }, { 0, 71 }, + { -1, 73 }, { 4, 64 }, { -7, 81 }, 
{ 5, 64 }, + { 15, 57 }, { 1, 67 }, { 0, 68 }, { -10, 67 }, + { 1, 68 }, { 0, 77 }, { 2, 64 }, { 0, 68 }, + { -5, 78 }, { 7, 55 }, { 5, 59 }, { 2, 65 }, + { 14, 54 }, { 15, 44 }, { 5, 60 }, { 2, 70 }, + { -2, 76 }, { -18, 86 }, { 12, 70 }, { 5, 64 }, + { -12, 70 }, { 11, 55 }, { 5, 56 }, { 0, 69 }, + { 2, 65 }, { -6, 74 }, { 5, 54 }, { 7, 54 }, + { -6, 76 }, { -11, 82 }, { -2, 77 }, { -2, 77 }, + { 25, 42 }, + + /* 338 - 398 */ + { 17, -13 }, { 16, -9 }, { 17, -12 }, { 27, -21 }, + { 37, -30 }, { 41, -40 }, { 42, -41 }, { 48, -47 }, + { 39, -32 }, { 46, -40 }, { 52, -51 }, { 46, -41 }, + { 52, -39 }, { 43, -19 }, { 32, 11 }, { 61, -55 }, + { 56, -46 }, { 62, -50 }, { 81, -67 }, { 45, -20 }, + { 35, -2 }, { 28, 15 }, { 34, 1 }, { 39, 1 }, + { 30, 17 }, { 20, 38 }, { 18, 45 }, { 15, 54 }, + { 0, 79 }, { 36, -16 }, { 37, -14 }, { 37, -17 }, + { 32, 1 }, { 34, 15 }, { 29, 15 }, { 24, 25 }, + { 34, 22 }, { 31, 16 }, { 35, 18 }, { 31, 28 }, + { 33, 41 }, { 36, 28 }, { 27, 47 }, { 21, 62 }, + { 18, 31 }, { 19, 26 }, { 36, 24 }, { 24, 23 }, + { 27, 16 }, { 24, 30 }, { 31, 29 }, { 22, 41 }, + { 22, 42 }, { 16, 60 }, { 15, 52 }, { 14, 60 }, + { 3, 78 }, { -16, 123 }, { 21, 53 }, { 22, 56 }, + { 25, 61 }, + + /* 399 - 435 */ + { 21, 33 }, { 19, 50 }, { 17, 61 }, + { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, + { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, + { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, + { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, + { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, + { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, + { -9, 71 }, { -7, 37 }, { -8, 44 }, { -11, 49 }, + { -10, 56 }, { -12, 59 }, { -8, 63 }, { -9, 67 }, + { -6, 68 }, { -10, 79 }, + + /* 436 - 459 */ + { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, + { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, + { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, + { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, + { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, + { 31, 12 
}, { 37, 23 }, { 31, 38 }, { 20, 64 }, + } +}; + +static const uint8_t left_block_options[4][16]={ + {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, + {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, + {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}, + {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8} +}; + +static const uint8_t rem6[52]={ +0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, +}; + +static const uint8_t div6[52]={ +0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, +}; + +static void init_dequant8_coeff_table(H264Slice *s, EntropyContext *ec){ + int i,q,x; + const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; + ec->dequant8_coeff[0] = ec->dequant8_buffer[0]; + ec->dequant8_coeff[1] = ec->dequant8_buffer[1]; + + for(i=0; i<2; i++){ + if(i && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){ + ec->dequant8_coeff[1] = ec->dequant8_buffer[0]; + break; + } + + for(q=0; q<52; q++){ + int shift = div6[q]; + int idx = rem6[q]; + for(x=0; x<64; x++) + ec->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] = + ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] * + s->pps.scaling_matrix8[i][x]) << shift; + } + } +} + +static void init_dequant4_coeff_table(H264Slice *s, EntropyContext *ec){ + int i,j,q,x; + const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; + for(i=0; i<6; i++ ){ + ec->dequant4_coeff[i] = ec->dequant4_buffer[i]; + for(j=0; jpps.scaling_matrix4[j], s->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){ + ec->dequant4_coeff[i] = ec->dequant4_buffer[j]; + break; + } + } + if(jdequant4_coeff[i][q][transpose ? 
(x>>2)|((x<<2)&0xF) : x] = + ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] * + s->pps.scaling_matrix4[i][x]) << shift; + } + } +} + +void init_dequant_tables(H264Slice *s, EntropyContext *ec){ + int i,x; + + init_dequant4_coeff_table(s, ec); + if(s->pps.transform_8x8_mode) + init_dequant8_coeff_table(s, ec); + if(s->transform_bypass){ + for(i=0; i<6; i++) + for(x=0; x<16; x++) + ec->dequant4_coeff[i][0][x] = 1<<6; + if(s->pps.transform_8x8_mode) + for(i=0; i<2; i++) + for(x=0; x<64; x++) + ec->dequant8_coeff[i][0][x] = 1<<6; + } +} + +void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c) { + int i; + const int8_t (*tab)[2]; + + if( s->slice_type_nos == FF_I_TYPE ) tab = cabac_context_init_I; + else tab = cabac_context_init_PB[s->cabac_init_idc]; + + /* calculate pre-state */ + for( i= 0; i < 460; i++ ) { + int pre = 2*(((tab[i][0] * ec->curr_qscale) >>4 ) + tab[i][1]) - 127; + + pre^= pre>>31; + if(pre > 124) + pre= 124 + (pre&1); + + c->cabac_state[i] = pre; + } +} + +static void fill_decode_neighbors(EntropyContext *ec, H264Slice *s){ + H264Mb *m = ec->m; + const int mb_x = m->mb_x; + + if (m->mb_y){ + ec->top_type = ec->mb_type_top[mb_x]; + ec->topright_type= ec->mb_type_top[mb_x+1]; + ec->topleft_type = ec->mb_type_top[mb_x-1]; + m->qscale_top_mb_xy = ec->qscale_top[mb_x]; + } else { + ec->top_type = 0; + ec->topright_type= 0; + ec->topleft_type = 0; + m->qscale_top_mb_xy = 0; + } + + ec->left_type = ec->mb_type[mb_x-1] ; + m->qscale_left_mb_xy = ec->qscale[mb_x-1]; + +} + +static void fill_decode_caches(EntropyContext *ec, H264Slice *s, int mb_type){ + H264Mb *m = ec->m; + int topleft_type, top_type, topright_type, left_type; + const uint8_t * left_block= left_block_options[0]; + const int mb_x = m->mb_x; + int i; + + topleft_type = ec->topleft_type; + top_type = ec->top_type; + topright_type= ec->topright_type; + left_type = ec->left_type; + + if(!IS_SKIP(mb_type)){ + if(top_type){ + 
AV_COPY32(&ec->non_zero_count_cache[4+8*0], &ec->non_zero_count_top[mb_x][0]); + ec->non_zero_count_cache[1+8*0]= ec->non_zero_count_top[mb_x][4]; + ec->non_zero_count_cache[2+8*0]= ec->non_zero_count_top[mb_x][5]; + ec->non_zero_count_cache[1+8*3]= ec->non_zero_count_top[mb_x][6]; + ec->non_zero_count_cache[2+8*3]= ec->non_zero_count_top[mb_x][7]; + + }else { + ec->non_zero_count_cache[1+8*0]= + ec->non_zero_count_cache[2+8*0]= + ec->non_zero_count_cache[1+8*3]= + ec->non_zero_count_cache[2+8*3]= + AV_WN32A(&ec->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 0 : 0x40404040); + } + + if(left_type){ + for (i=0; i<2; i++) { + ec->non_zero_count_cache[3+8*1 + 2*8*i]= ec->non_zero_count_left[i*2+0]; + ec->non_zero_count_cache[3+8*2 + 2*8*i]= ec->non_zero_count_left[i*2+1]; + ec->non_zero_count_cache[0+8*1 + 3*8*i]= ec->non_zero_count_left[4+i*2+0]; + ec->non_zero_count_cache[0+8*2 + 3*8*i]= ec->non_zero_count_left[4+i*2+1]; + } + } + else{ + for (i=0; i<2; i++) { + ec->non_zero_count_cache[3+8*1 + 2*8*i]= + ec->non_zero_count_cache[3+8*2 + 2*8*i]= + ec->non_zero_count_cache[0+8*1 + 3*8*i]= + ec->non_zero_count_cache[0+8*2 + 3*8*i]= !IS_INTRA(mb_type) ? 0 : 64; + } + } + + // top_cbp + if(top_type) { + ec->top_cbp = ec->cbp_top[mb_x]; + } else { + ec->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F; + } + // left_cbp + if (left_type) { + ec->left_cbp = (ec->cbp[mb_x-1] & 0x1f0) + | ((ec->cbp[mb_x-1]>>(left_block[0]&(~1)))&2) + | (((ec->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2); + } else { + ec->left_cbp = IS_INTRA(mb_type) ? 
0x1CF : 0x00F; + } + } + + if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){ + int list; + + ec->ref_cache[0][scan8[5 ]+1] = ec->ref_cache[0][scan8[7 ]+1] = ec->ref_cache[0][scan8[13]+1] = + ec->ref_cache[1][scan8[5 ]+1] = ec->ref_cache[1][scan8[7 ]+1] = ec->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE; + + for(list=0; listlist_count; list++){ + if(!USES_LIST(mb_type, list)){ + continue; + } + assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred)); + + if(USES_LIST(top_type, list)){ + ec->ref_cache[list][scan8[0] + 0 - 1*8]= + ec->ref_cache[list][scan8[0] + 1 - 1*8]= ec->ref_index_top[list][4*mb_x + 2]; + ec->ref_cache[list][scan8[0] + 2 - 1*8]= + ec->ref_cache[list][scan8[0] + 3 - 1*8]= ec->ref_index_top[list][4*mb_x + 3]; + }else{ + AV_WN32A(&ec->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101); + } + + if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){ + for(i=0; i<2; i++){ + int cache_idx = scan8[0] - 1 + i*2*8; + if(USES_LIST(left_type, list)){ + const int b8_x= 4*(mb_x-1) + 1; + ec->ref_cache[list][cache_idx ]= ec->ref_index[list][b8_x + (left_block[0+i*2]&~1)]; + ec->ref_cache[list][cache_idx+8]= ec->ref_index[list][b8_x + (left_block[1+i*2]&~1)]; + }else{ + ec->ref_cache[list][cache_idx ]= + ec->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE); + } + } + }else{ + if(USES_LIST(left_type, list)){ + const int b8_x= 4*(mb_x-1) + 1; + ec->ref_cache[list][scan8[0] - 1]= ec->ref_index[list][b8_x + (left_block[0]&~1)]; + }else{ + ec->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; + } + } + + if(USES_LIST(topright_type, list)){ + ec->ref_cache[list][scan8[0] + 4 - 1*8]= ec->ref_index_top[list][4*(mb_x+1) + 2]; + }else{ + ec->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE; + } + if(ec->ref_cache[list][scan8[0] + 4 - 1*8] < 0){ + int topleft_partition= -1; + if(USES_LIST(topleft_type, list)){ + const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2); + ec->ref_cache[list][scan8[0] - 1 - 1*8]= ec->ref_index_top[list][b8_x]; + }else{ + ec->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; + } + } + + if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) + continue; + + if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) { + ec->ref_cache[list][scan8[4 ]] = + ec->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE; + + /* XXX beurk, Load mvd */ + if(USES_LIST(top_type, list)){ + AV_COPY64(ec->mvd_cache[list][scan8[0] + 0 - 1*8], ec->mvd_top[list][8*mb_x + 0]); + }else{ + AV_ZERO64(ec->mvd_cache[list][scan8[0] + 0 - 1*8]); + } + if(USES_LIST(left_type, list)){ + AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 0*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[0]]); + AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 1*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[1]]); + }else{ + AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 0*8]); + AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 1*8]); + } + if(USES_LIST(left_type, list)){ + AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 2*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[2]]); + AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 3*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[3]]); + }else{ + AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 2*8]); + AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 3*8]); + } + AV_ZERO16(ec->mvd_cache [list][scan8[4 ]]); + AV_ZERO16(ec->mvd_cache [list][scan8[12]]); + if(s->slice_type_nos == FF_B_TYPE){ + fill_rectangle(&ec->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1); + + if(IS_DIRECT(top_type)){ + AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1)); + }else if(IS_8X8(top_type)){ + int b8_x = 4*mb_x; + ec->direct_cache[scan8[0] + 0 - 1*8]= ec->direct_top[b8_x + 2]; + ec->direct_cache[scan8[0] 
+ 2 - 1*8]= ec->direct_top[b8_x + 3]; + }else{ + AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1)); + } + + if(IS_DIRECT(left_type)) + ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1; + else if(IS_8X8(left_type)) + ec->direct_cache[scan8[0] - 1 + 0*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)]; + else + ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1; + + if(IS_DIRECT(left_type)) + ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1; + else if(IS_8X8(left_type)) + ec->direct_cache[scan8[0] - 1 + 2*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)]; + else + ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1; + } + } + } + } + ec->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type); +} + +static inline void write_back_non_zero_count(EntropyContext *ec, H264Slice *s){ + H264Mb *m = ec->m; + const int mb_x= m->mb_x; + + //bottom nnz + AV_COPY32(&ec->non_zero_count[mb_x][0], &ec->non_zero_count_cache[4+8*4] ); + ec->non_zero_count[mb_x][4] = ec->non_zero_count_cache[1+8*2]; + ec->non_zero_count[mb_x][5] = ec->non_zero_count_cache[2+8*2]; + ec->non_zero_count[mb_x][6] = ec->non_zero_count_cache[1+8*5]; + ec->non_zero_count[mb_x][7] = ec->non_zero_count_cache[2+8*5]; + + for (int i=0; i<2; i++) { + ec->non_zero_count_left[i*2+0] = ec->non_zero_count_cache[7+8*1 + 2*8*i]; + ec->non_zero_count_left[i*2+1] = ec->non_zero_count_cache[7+8*2 + 2*8*i]; + ec->non_zero_count_left[4+i*2+0] = ec->non_zero_count_cache[2+8*1 + 3*8*i]; + ec->non_zero_count_left[4+i*2+1] = ec->non_zero_count_cache[2+8*2 + 3*8*i]; + } + + AV_COPY32(&m->non_zero_count[ 0], &ec->non_zero_count_cache[4+8*1]); + AV_COPY32(&m->non_zero_count[ 4], &ec->non_zero_count_cache[4+8*2]); + AV_COPY32(&m->non_zero_count[ 8], &ec->non_zero_count_cache[4+8*3]); + AV_COPY32(&m->non_zero_count[12], &ec->non_zero_count_cache[4+8*4]); + + for (int i=0; i<2; i++) { + m->non_zero_count[16 + i*2 ] = ec->non_zero_count_cache[8*1 + 8*i + 1]; + 
m->non_zero_count[16 + i*2 +1] = ec->non_zero_count_cache[8*1 + 8*i + 2]; + m->non_zero_count[20 + i*2 ] = ec->non_zero_count_cache[8*4 + 8*i + 1]; + m->non_zero_count[20 + i*2 +1] = ec->non_zero_count_cache[8*4 + 8*i + 2]; + } +} + +static inline void write_back_motion(EntropyContext *ec, H264Slice *s, int mb_type){ + H264Mb *m = ec->m; + const int mb_x = m->mb_x; + const int b_x = 4*m->mb_x; //try mb2b(8)_xy + int list; + + for(list=0; listlist_count; list++){ + if(!USES_LIST(mb_type, list)) + continue; + + { + uint8_t (*mvd_dst)[2] = (void *) ec->mvd[list][8*mb_x]; + uint8_t (*mvd_src)[2] = &ec->mvd_cache[list][scan8[0]]; + if(IS_SKIP(mb_type)) + AV_ZERO128(mvd_dst); + else{ + AV_COPY64(mvd_dst, mvd_src + 8*3); + AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0); + AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1); + AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2); + } + } + int8_t *ref_index = &ec->ref_index[list][b_x]; + { + ref_index[0+0*2]= ec->ref_cache[list][scan8[0]]; + ref_index[1+0*2]= ec->ref_cache[list][scan8[4]]; + ref_index[0+1*2]= ec->ref_cache[list][scan8[8]]; + ref_index[1+1*2]= ec->ref_cache[list][scan8[12]]; + } + } + + if(s->slice_type_nos == FF_B_TYPE){ + if(IS_8X8(mb_type)){ + uint8_t *direct = &ec->direct[4*mb_x]; + direct[1] = m->sub_mb_type[1]>>1; + direct[2] = m->sub_mb_type[2]>>1; + direct[3] = m->sub_mb_type[3]>>1; + } + } +} + +static inline int get_dct8x8_allowed(EntropyContext *ec, H264Slice *s){ + H264Mb *m = ec->m; + if(s->direct_8x8_inference_flag) + return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL)); + else + return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL)); +} + +/** + * decodes a P_SKIP or B_SKIP macroblock + */ +static void decode_mb_skip(EntropyContext *ec, H264Slice *s){ + H264Mb *m = ec->m; + const int mb_x = m->mb_x; + int mb_type; + + if( s->slice_type_nos == FF_B_TYPE ) + mb_type= 
MB_TYPE_16x16|MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP; + else + mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; + + fill_rectangle(&ec->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); + write_back_motion(ec, s, mb_type); + m->mb_type = ec->mb_type[mb_x] = mb_type; + m->qscale_mb_xy = ec->qscale[mb_x]= ec->curr_qscale; + + AV_ZERO64(ec->non_zero_count[mb_x]); + AV_ZERO64(ec->non_zero_count_left); + memset(m->non_zero_count, 0, 24); +} + +static int decode_cabac_intra_mb_type(EntropyContext *ec, H264Slice *s, CABACContext *c, int ctx_base, int intra_slice) { + uint8_t *state= &c->cabac_state[ctx_base]; + int mb_type; + + if(intra_slice){ + int ctx=0; + if( ec->left_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) + ctx++; + if( ec->top_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) + ctx++; + if( get_cabac_noinline( c, &state[ctx] ) == 0 ) + return 0; /* I4x4 */ + state += 2; + }else{ + if( get_cabac_noinline( c, state ) == 0 ) + return 0; /* I4x4 */ + } + + if( get_cabac_terminate( c ) ) + return 25; /* PCM */ + + mb_type = 1; /* I16x16 */ + mb_type += 12 * get_cabac_noinline( c, &state[1] ); /* cbp_luma != 0 */ + if( get_cabac_noinline(c, &state[2] ) ) /* cbp_chroma */ + mb_type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice] ); + mb_type += 2 * get_cabac_noinline(c, &state[3+intra_slice] ); + mb_type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice] ); + return mb_type; +} + +static int decode_cabac_mb_skip(EntropyContext *ec, H264Slice *s, H264Mb *m, CABACContext *c) { + int ctx = 0; + + if( m->mb_x>0 && !IS_SKIP( ec->left_type )) + ctx++; + if( m->mb_y>0 && !IS_SKIP( ec->top_type )) + ctx++; + + if( s->slice_type_nos == FF_B_TYPE ) + ctx += 13; + return get_cabac_noinline(c, &c->cabac_state[11+ctx] ); +} + +static int decode_cabac_mb_intra4x4_pred_mode_delta( CABACContext *c) { + int mode = 0; + + if( get_cabac(c, &c->cabac_state[68] ) ) + return -1; + + mode += 1 * get_cabac(c, &c->cabac_state[69] ); + mode += 2 * get_cabac(c, 
&c->cabac_state[69] ); + mode += 4 * get_cabac(c, &c->cabac_state[69] ); + + return mode; +} + +static int decode_cabac_mb_chroma_pre_mode(EntropyContext *ec, H264Slice *s, CABACContext *c) { + H264Mb *m = ec->m; + const int mb_x = m->mb_x; + + int ctx = 0; + + /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */ + if( ec->left_type && ec->chroma_pred_mode[mb_x-1] != 0 ) + ctx++; + + if( ec->top_type && ec->chroma_pred_mode_top[mb_x] != 0 ) + ctx++; + + if( get_cabac_noinline(c, &c->cabac_state[64+ctx] ) == 0 ) + return 0; + + if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) + return 1; + if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) + return 2; + else + return 3; +} + +static int decode_cabac_mb_cbp_luma(EntropyContext *ec, CABACContext *c) { + int cbp_b, cbp_a, ctx, cbp = 0; + + cbp_a = ec->left_cbp; + cbp_b = ec->top_cbp; + + ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04); + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]); + ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08); + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 1; + ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01); + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 2; + ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02); + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 3; + return cbp; +} +static int decode_cabac_mb_cbp_chroma(EntropyContext *ec, CABACContext *c) { + int ctx; + int cbp_a, cbp_b; + + cbp_a = (ec->left_cbp>>4)&0x03; + cbp_b = (ec-> top_cbp>>4)&0x03; + + ctx = 0; + if( cbp_a > 0 ) ctx++; + if( cbp_b > 0 ) ctx += 2; + if( get_cabac_noinline(c, &c->cabac_state[77 + ctx] ) == 0 ) + return 0; + + ctx = 4; + if( cbp_a == 2 ) ctx++; + if( cbp_b == 2 ) ctx += 2; + return 1 + get_cabac_noinline(c, &c->cabac_state[77 + ctx] ); +} + +static int decode_cabac_p_mb_sub_type( CABACContext *c) { + if( get_cabac(c, &c->cabac_state[21] ) ) + return 0; /* 8x8 */ + if( !get_cabac(c, &c->cabac_state[22] ) ) + return 1; /* 8x4 */ + if( 
get_cabac(c, &c->cabac_state[23] ) ) + return 2; /* 4x8 */ + return 3; /* 4x4 */ +} +static int decode_cabac_b_mb_sub_type(CABACContext *c) { + int type; + if( !get_cabac(c, &c->cabac_state[36] ) ) + return 0; /* B_Direct_8x8 */ + if( !get_cabac(c, &c->cabac_state[37] ) ) + return 1 + get_cabac(c, &c->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */ + type = 3; + if( get_cabac(c, &c->cabac_state[38] ) ) { + if( get_cabac(c, &c->cabac_state[39] ) ) + return 11 + get_cabac(c, &c->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */ + type += 4; + } + type += 2*get_cabac(c, &c->cabac_state[39] ); + type += get_cabac(c, &c->cabac_state[39] ); + return type; +} + +static int decode_cabac_mb_ref(EntropyContext *ec, H264Slice *s, CABACContext *c, int list, int n ) { + int refa = ec->ref_cache[list][scan8[n] - 1]; + int refb = ec->ref_cache[list][scan8[n] - 8]; + int ref = 0; + int ctx = 0; + + if( s->slice_type_nos == FF_B_TYPE) { + if( refa > 0 && !(ec->direct_cache[scan8[n] - 1]&(MB_TYPE_DIRECT2>>1)) ) + ctx++; + if( refb > 0 && !(ec->direct_cache[scan8[n] - 8]&(MB_TYPE_DIRECT2>>1)) ) + ctx += 2; + } else { + if( refa > 0 ) + ctx++; + if( refb > 0 ) + ctx += 2; + } + + while( get_cabac(c, &c->cabac_state[54+ctx] ) ) { + ref++; + ctx = (ctx>>2)+4; + if(ref >= 32 /*h->ref_list[list]*/){ + return -1; + } + } + return ref; +} + +static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) { + int mvd; + + if(!get_cabac(c, &c->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){ + *mvda= 0; + return 0; + } + + mvd= 1; + ctxbase+= 3; + while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) { + if( mvd < 4 ) + ctxbase++; + mvd++; + } + + if( mvd >= 9 ) { + int k = 3; + while( get_cabac_bypass(c ) ) { + mvd += 1 << k; + k++; + if(k>24){ + av_log(AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n"); + return INT_MIN; + } + } + while( k-- ) { + mvd += get_cabac_bypass(c )<mvd_cache[list][scan8[n] - 1][0] +\ + ec->mvd_cache[list][scan8[n] - 8][0];\ 
+ int amvd1 = ec->mvd_cache[list][scan8[n] - 1][1] +\ + ec->mvd_cache[list][scan8[n] - 8][1];\ +\ + m->mvd[list][mp][0] = decode_cabac_mb_mvd( c, 40, amvd0, &mpx ); \ + m->mvd[list][mp][1] = decode_cabac_mb_mvd( c, 47, amvd1, &mpy ); \ + mp++; \ +} + +static av_always_inline int get_cabac_cbf_ctx(EntropyContext *ec, H264Slice *s, int cat, int idx, int is_dc ) { + int nza, nzb; + int ctx = 0; + + if( is_dc ) { + if( cat == 0 ) { + nza = ec->left_cbp&0x100; + nzb = ec-> top_cbp&0x100; + } else { + nza = (ec->left_cbp>>(6+idx))&0x01; + nzb = (ec-> top_cbp>>(6+idx))&0x01; + } + } else { + assert(cat == 1 || cat == 2 || cat == 4); + nza = ec->non_zero_count_cache[scan8[idx] - 1]; + nzb = ec->non_zero_count_cache[scan8[idx] - 8]; + } + + if( nza > 0 ) + ctx++; + + if( nzb > 0 ) + ctx += 2; + + return ctx + 4 * cat; +} + +DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 +}; + +static const int significant_coeff_flag_offset[2][6] = { + { 105+0, 105+15, 105+29, 105+44, 105+47, 402 }, + { 277+0, 277+15, 277+29, 277+44, 277+47, 436 } +}; +static const int last_coeff_flag_offset[2][6] = { + { 166+0, 166+15, 166+29, 166+44, 166+47, 417 }, + { 338+0, 338+15, 338+29, 338+44, 338+47, 451 } +}; +static const int coeff_abs_level_m1_offset[6] = { + 227+0, 227+10, 227+20, 227+30, 227+39, 426 +}; +static const uint8_t significant_coeff_flag_offset_8x8[2][63] = { + { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, + 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, + 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11, + 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 }, + { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5, + 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11, + 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9, + 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 } +}; +/* node ctx: 
0..3: abslevel1 (with abslevelgt1 == 0). +* 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). +* map node ctx => cabac ctx for level=1 */ +static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; +/* map node ctx => cabac ctx for level>1 */ +static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; +static const uint8_t coeff_abs_level_transition[2][8] = { + /* update node ctx after decoding a level=1 */ + { 1, 2, 3, 3, 4, 5, 6, 7 }, + /* update node ctx after decoding a level>1 */ + { 4, 4, 4, 4, 5, 6, 7, 7 } +}; + +static av_always_inline void decode_cabac_residual_internal(EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) { + H264Mb *m = ec->m; + const int mb_x = m->mb_x; + int index[64]; + + int av_unused last; + int coeff_count = 0; + int node_ctx = 0; + + uint8_t *significant_coeff_ctx_base; + uint8_t *last_coeff_ctx_base; + uint8_t *abs_level_m1_ctx_base; + + /* read coded block flag */ + if( is_dc || cat != 5 ) { + if( get_cabac( c, &c->cabac_state[85 + get_cabac_cbf_ctx( ec, s, cat, n, is_dc ) ] ) == 0 ) { + if( !is_dc ) + ec->non_zero_count_cache[scan8[n]] = 0; + return; + } + } + + significant_coeff_ctx_base = c->cabac_state + + significant_coeff_flag_offset[0][cat]; + last_coeff_ctx_base = c->cabac_state + + last_coeff_flag_offset[0][cat]; + abs_level_m1_ctx_base = c->cabac_state + + coeff_abs_level_m1_offset[cat]; + + if( !is_dc && cat == 5 ) { +#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \ + for(last= 0; last < coefs; last++) { \ + uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \ + if( get_cabac( c, sig_ctx )) { \ + uint8_t *last_ctx = last_coeff_ctx_base + last_off; \ + index[coeff_count++] = last; \ + if( get_cabac( c, last_ctx ) ) { \ + last= max_coeff; \ + break; \ + } \ + } \ + }\ + if( last == max_coeff -1 ) {\ + index[coeff_count++] = last;\ + } + + const uint8_t *sig_off = 
significant_coeff_flag_offset_8x8[0]; + DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] ); + } else { + DECODE_SIGNIFICANCE( max_coeff - 1, last, last ); + } + assert(coeff_count > 0); + + if( is_dc ) { + if( cat == 0 ) + ec->cbp[mb_x] |= 0x100; + else + ec->cbp[mb_x] |= 0x40 << n; + } else { + if( cat == 5 ) + fill_rectangle(&ec->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1); + else { + assert( cat == 1 || cat == 2 || cat == 4 ); + ec->non_zero_count_cache[scan8[n]] = coeff_count; + } + } + + do { + uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base; + + int j= scantable[index[--coeff_count]]; + + if( get_cabac( c, ctx ) == 0 ) { + node_ctx = coeff_abs_level_transition[0][node_ctx]; + if( is_dc ) { + block[j] = get_cabac_bypass_sign( c, -1); + }else{ + block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6; + } + } else { + int coeff_abs = 2; + ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; + node_ctx = coeff_abs_level_transition[1][node_ctx]; + + while( coeff_abs < 15 && get_cabac( c, ctx ) ) { + coeff_abs++; + } + + if( coeff_abs >= 15 ) { + int j = 0; + while( get_cabac_bypass( c ) ) { + j++; + } + + coeff_abs=1; + while( j-- ) { + coeff_abs += coeff_abs + get_cabac_bypass( c ); + } + coeff_abs+= 14; + } + + if( is_dc ) { + block[j] = get_cabac_bypass_sign( c, -coeff_abs ); + }else{ + block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6; + } + } + } while( coeff_count ); + +} + +static void decode_cabac_residual_dc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) { + decode_cabac_residual_internal( ec, s, c, block, cat, n, scantable, NULL, max_coeff, 1); +} + +static void decode_cabac_residual_nondc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { + decode_cabac_residual_internal( ec, s, c, 
block, cat, n, scantable, qmul, max_coeff, 0); +} + +/** + * decodes a macroblock + * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed + */ +int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c) { + H264Mb *m = ec->m; + int mb_x = m->mb_x; + int mb_type, partition_count, cbp = 0; + int dct8x8_allowed= s->pps.transform_8x8_mode; + + fill_decode_neighbors(ec, s); + + if( s->slice_type_nos != FF_I_TYPE ) { + int skip; + /* a skipped mb needs the aff flag from the following mb */ + skip = decode_cabac_mb_skip( ec, s, m, c); + + /* read skip flags */ + if( skip ) { + decode_mb_skip(ec, s); + m->cbp = ec->cbp[mb_x] = 0; + ec->chroma_pred_mode[mb_x] = 0; + ec->last_qscale_diff = 0; + return 0; + } + } + + if( s->slice_type_nos == FF_B_TYPE ) { + int ctx = 0; + + if( !IS_DIRECT( ec->left_type-1 ) ) + ctx++; + if( !IS_DIRECT( ec->top_type-1 ) ) + ctx++; + + if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){ + mb_type= 0; /* B_Direct_16x16 */ + }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) { + mb_type= 1 + get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */ + }else{ + int bits; + bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3; + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2; + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1; + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ); + if( bits < 8 ){ + mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */ + }else if( bits == 13 ){ + mb_type= decode_cabac_intra_mb_type(ec, s, c, 32, 0); + goto decode_intra_mb; + }else if( bits == 14 ){ + mb_type= 11; /* B_L1_L0_8x16 */ + }else if( bits == 15 ){ + mb_type= 22; /* B_8x8 */ + }else{ + bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] ); + mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */ + } + } + partition_count= b_mb_type_info[mb_type].partition_count; + mb_type= b_mb_type_info[mb_type].type; + } else if( s->slice_type_nos == FF_P_TYPE ) { + if( 
get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) { + /* P-type */ + if( get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) { + /* P_L0_D16x16, P_8x8 */ + mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] ); + } else { + /* P_L0_D8x16, P_L0_D16x8 */ + mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] ); + } + partition_count= p_mb_type_info[mb_type].partition_count; + mb_type= p_mb_type_info[mb_type].type; + } else { + mb_type= decode_cabac_intra_mb_type(ec, s, c, 17, 0); + goto decode_intra_mb; + } + } else { + mb_type= decode_cabac_intra_mb_type(ec, s ,c, 3, 1); + if(s->slice_type == FF_SI_TYPE && mb_type) + mb_type--; + assert(s->slice_type_nos == FF_I_TYPE); +decode_intra_mb: + partition_count = 0; + cbp= i_mb_type_info[mb_type].cbp; + m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode; + mb_type= i_mb_type_info[mb_type].type; + } + + if(IS_INTRA_PCM(mb_type)) { + const uint8_t *ptr; + // We assume these blocks are very rare so we do not optimize it. + // FIXME The two following lines get the bitstream position in the cabac + // decode, I think it should be done by a function in cabac.h (or cabac.c). + ptr=c->bytestream; + if(c->low&0x1) ptr--; + if(CABAC_BITS==16){ + if(c->low&0x1FF) ptr--; + } + //printf("pcm\n"); + // The pixels are stored in the same order as levels in h->mb array. 
+ memcpy(m->mb, ptr, 256); ptr+=256; + memcpy(m->mb+128, ptr, 128); ptr+=128; + + ff_init_cabac_decoder(c, ptr, c->bytestream_end - ptr); + + // All blocks are present + m->cbp= ec->cbp[mb_x] = 0x1ef; + ec->chroma_pred_mode[mb_x] = 0; + // In deblocking, the quantizer is 0 + m->qscale_mb_xy = ec->qscale[mb_x]= 0; + // All coeffs are present + memset(ec->non_zero_count[mb_x], 16, 8); + m->mb_type = ec->mb_type[mb_x]= mb_type; + ec->last_qscale_diff = 0; + + return 0; + } + + fill_decode_caches(ec, s, mb_type); + + int mp = 0; + if( IS_INTRA( mb_type ) ) { + int i, pred_mode; + if( IS_INTRA4x4( mb_type ) ) { + if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] ) ) { + mb_type |= MB_TYPE_8x8DCT; + for( i = 0; i < 16; i+=4 ) { + m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c); + } + } else { + for( i = 0; i < 16; i++ ) { + m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c); + } + } + } + + m->chroma_pred_mode= ec->chroma_pred_mode[mb_x] = + pred_mode = decode_cabac_mb_chroma_pre_mode( ec, s, c ); + + } else if( partition_count == 4 ) { + int i, j, sub_partition_count[4], list; + + if( s->slice_type_nos == FF_B_TYPE ) { + for( i = 0; i < 4; i++ ) { + m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c ); + sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count; + m->sub_mb_type[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].type; + } + if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] | + m->sub_mb_type[2] | m->sub_mb_type[3]) ) { + ec->ref_cache[0][scan8[4]] = + ec->ref_cache[1][scan8[4]] = + ec->ref_cache[0][scan8[12]] = + ec->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE; + + for( i = 0; i < 4; i++ ) + fill_rectangle( &ec->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 ); + } + } else { + for( i = 0; i < 4; i++ ) { + m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c ); + sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] 
].partition_count; + m->sub_mb_type[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].type; + } + } + + for( list = 0; list < s->list_count; list++ ) { + for( i = 0; i < 4; i++ ) { + if(IS_DIRECT(m->sub_mb_type[i])) continue; + if(IS_DIR(m->sub_mb_type[i], 0, list)){ + if( s->ref_count[list] > 1 ){ + m->ref_index[list][i] = decode_cabac_mb_ref(ec, s, c, list, 4*i ); + if(m->ref_index[list][i] >= s->ref_count[list]){ + av_log(AV_LOG_ERROR, "Reference %d >= %d\n", m->ref_index[list][i], s->ref_count[list]); + return -1; + } + }else + m->ref_index[list][i] = 0; + } else { + m->ref_index[list][i] = -1; + } + ec->ref_cache[list][ scan8[4*i] ]=ec->ref_cache[list][ scan8[4*i]+1 ]= + ec->ref_cache[list][ scan8[4*i]+8 ]=ec->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i]; + } + } + + if(dct8x8_allowed){ +// assert(0); + dct8x8_allowed = get_dct8x8_allowed(ec, s); + } + + for(list=0; listlist_count; list++){ + for(i=0; i<4; i++){ +// ec->ref_cache[list][ scan8[4*i] ]=ec->ref_cache[list][ scan8[4*i]+1 ]; + if(IS_DIRECT(m->sub_mb_type[i])){ + fill_rectangle(ec->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2); + continue; + } + + if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){ + const int sub_mb_type= m->sub_mb_type[i]; + const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 
2 : 1; + for(j=0; jmvd_cache[list][ scan8[index]]; + + DECODE_CABAC_MB_MVD( ec, c, list, index) + + if(IS_SUB_8X8(sub_mb_type)){ + mvd_cache[ 1 ][0]= + mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx; + mvd_cache[ 1 ][1]= + mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy; + }else if(IS_SUB_8X4(sub_mb_type)){ + mvd_cache[ 1 ][0]= mpx; + mvd_cache[ 1 ][1]= mpy; + }else if(IS_SUB_4X8(sub_mb_type)){ + mvd_cache[ 8 ][0]= mpx; + mvd_cache[ 8 ][1]= mpy; + } + mvd_cache[ 0 ][0]= mpx; + mvd_cache[ 0 ][1]= mpy; + } + }else{ + fill_rectangle(ec->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2); + } + } + } + } else if( IS_DIRECT(mb_type) ) { + mb_type |= MB_TYPE_16x16; + fill_rectangle(ec->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2); + fill_rectangle(ec->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2); + dct8x8_allowed &= s->direct_8x8_inference_flag; + } else { + int list, i; + if(IS_16X16(mb_type)){ + for(list=0; listlist_count; list++){ + if(IS_DIR(mb_type, 0, list)){ + int ref; + if(s->ref_count[list] > 1){ + ref= decode_cabac_mb_ref(ec, s, c, list, 0); + if(ref >= s->ref_count[list]){ + av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]); + return -1; + } + }else + ref=0; + m->ref_index[list][0]= ref; + fill_rectangle(&ec->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1); + } + } + for(list=0; listlist_count; list++){ + if(IS_DIR(mb_type, 0, list)){ + int mpx,mpy; + DECODE_CABAC_MB_MVD( ec, c, list, 0) + + fill_rectangle(ec->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2); + } + + } + } + else if(IS_16X8(mb_type)){ + for(list=0; listlist_count; list++){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ + int ref; + if(s->ref_count[list] > 1){ + ref= decode_cabac_mb_ref(ec, s, c, list, 8*i ); + if(ref >= s->ref_count[list]){ + av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]); + return -1; + } + }else + ref=0; + m->ref_index[list][i]= ref; + fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1); + }else{ + m->ref_index[list][i]= 
LIST_NOT_USED; + fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1); + } + } + } + for(list=0; listlist_count; list++){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ + int mpx,mpy; + DECODE_CABAC_MB_MVD( ec, c, list, 8*i) + + fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2); + }else{ + fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2); + } + } + } + }else{ + assert(IS_8X16(mb_type)); + for(list=0; listlist_count; list++){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ //FIXME optimize + int ref; + if(s->ref_count[list] > 1){ + ref= decode_cabac_mb_ref(ec, s, c, list, 4*i ); + if(ref >= s->ref_count[list]){ + av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]); + return -1; + } + }else + ref=0; + m->ref_index[list][i]= ref; + fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1); + }else{ + m->ref_index[list][i]= LIST_NOT_USED; + fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1); + } + } + } + for(list=0; listlist_count; list++){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ + int mpx,mpy; + DECODE_CABAC_MB_MVD( ec, c, list, 4*i) + + fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2); + }else{ + fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2); + } + } + } + } + } + + if( IS_INTER( mb_type ) ||(IS_DIRECT(mb_type))) { + ec->chroma_pred_mode[mb_x] = 0; + write_back_motion( ec, s, mb_type ); + } + + if( !IS_INTRA16x16( mb_type ) ) { + cbp = decode_cabac_mb_cbp_luma( ec, c); + cbp |= decode_cabac_mb_cbp_chroma( ec, c ) << 4; + } + + ec->cbp[mb_x] = m->cbp = cbp; + + if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) { + int t = get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] ); + mb_type |= MB_TYPE_8x8DCT * t; + } + m->mb_type = ec->mb_type[mb_x] = mb_type; + + if( cbp || IS_INTRA16x16( mb_type 
) ) { + const uint8_t *scan, *scan8x8, *dc_scan; + const uint32_t *qmul; + + + if (s->transform_bypass && ec->curr_qscale){ + scan8x8= ff_zigzag_direct; + scan= zigzag_scan; + }else{ + scan8x8= ec->zigzag_scan8x8; + scan= ec->zigzag_scan; + } + dc_scan= luma_dc_zigzag_scan; + + // decode_cabac_mb_dqp + if(get_cabac_noinline(c, &c->cabac_state[60 + (ec->last_qscale_diff != 0)])){ + int val = 1; + int ctx= 2; + + while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) { + ctx= 3; + val++; + if(val > 102){ //prevent infinite loop + av_log(AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", m->mb_x, m->mb_y); + return -1; + } + } + + if( val&0x01 ) + val= (val + 1)>>1 ; + else + val= -((val + 1)>>1); + ec->last_qscale_diff = val; + ec->curr_qscale += val; + if(((unsigned)ec->curr_qscale) > 51){ + if(ec->curr_qscale<0) ec->curr_qscale+= 52; + else ec->curr_qscale-= 52; + } + ec->chroma_qp[0] = get_chroma_qp( s, 0, ec->curr_qscale); + ec->chroma_qp[1] = get_chroma_qp( s, 1, ec->curr_qscale); + }else + ec->last_qscale_diff=0; + + memset(m->mb, 0, 16*16 * sizeof(DCTELEM)); + if( IS_INTRA16x16( mb_type ) ) { + int i; + + //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" ); + decode_cabac_residual_dc( ec, s, c, m->mb, 0, 0, dc_scan, 16); + qmul = ec->dequant4_coeff[0][ec->curr_qscale]; + if( cbp&15 ) { + for( i = 0; i < 16; i++ ) { + //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i ); + decode_cabac_residual_nondc( ec, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15); + } + } else { + fill_rectangle(&ec->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1); + } + h264_luma_dc_dequant_idct_c(m->mb, qmul[0]); + } else { + + int i8x8, i4x4; + for( i8x8 = 0; i8x8 < 4; i8x8++ ) { + if( cbp & (1<mb + 64*i8x8, 5, 4*i8x8, + scan8x8, ec->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][ec->curr_qscale], 64); + } else { + qmul = ec->dequant4_coeff[IS_INTRA( mb_type ) ? 
0:3][ec->curr_qscale]; + for( i4x4 = 0; i4x4 < 4; i4x4++ ) { + const int index = 4*i8x8 + i4x4; + //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index ); +//START_TIMER + decode_cabac_residual_nondc(ec, s, c, m->mb + 16*index, 2, index, scan, qmul, 16); +//STOP_TIMER("decode_residual") + } + } + } else { + uint8_t * const nnz= &ec->non_zero_count_cache[ scan8[4*i8x8] ]; + nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0; + } + } + } + + if( cbp&0x30 ){ + memset(m->mb + 256, 0, 2*64 * sizeof(DCTELEM)); + for( int i = 0; i < 2; i++ ) { + const uint32_t dequant4_coeff = ec->dequant4_coeff[IS_INTRA(mb_type) ? 1+i:4+i][ec->chroma_qp[i]][0]; + + //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); + decode_cabac_residual_dc(ec, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4); + chroma_dc_dequant_idct_c(m->mb + 256 + 16*4*i, dequant4_coeff); + } + } + + if( cbp&0x20 ) { + int i, j; + for( i = 0; i < 2; i++ ) { + qmul = ec->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 0:3)][ec->chroma_qp[i]]; + for( j = 0; j < 4; j++ ) { + const int index = 16 + 4 * i + j; + //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 ); + decode_cabac_residual_nondc( ec, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15); + } + } + } else { + uint8_t * const nnz= &ec->non_zero_count_cache[0]; + nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = + nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; + } + + } else { + uint8_t * const nnz= &ec->non_zero_count_cache[0]; + fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1); + nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = + nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; + ec->last_qscale_diff = 0; + } + + m->qscale_mb_xy = ec->qscale[mb_x]= ec->curr_qscale; + write_back_non_zero_count(ec, s); + + + return 0; +}
+
+/**
+ * Frees every row table owned by an entropy context, then the context itself.
+ *
+ * Safe to call with NULL and with a context whose tables are only partly
+ * allocated (the failure path of get_entropy_context() relies on this).
+ *
+ * @param ec context to destroy; must not be used afterwards
+ */
+void free_entropy_context(EntropyContext *ec){
+    if (!ec)
+        return;
+
+    av_freep(&ec->non_zero_count_row[0]);
+    av_freep(&ec->non_zero_count_row[1]);
+    av_freep(&ec->mvd_table[0][0]);
+    av_freep(&ec->mvd_table[0][1]);
+    av_freep(&ec->mvd_table[1][0]);
+    av_freep(&ec->mvd_table[1][1]);
+
+    av_freep(&ec->direct_table[0]);
+    av_freep(&ec->direct_table[1]);
+    av_freep(&ec->chroma_pred_mode_table[0]);
+    av_freep(&ec->chroma_pred_mode_table[1]);
+    av_freep(&ec->cbp_table[0]);
+    av_freep(&ec->cbp_table[1]);
+    av_freep(&ec->qscale_table[0]);
+    av_freep(&ec->qscale_table[1]);
+
+    av_freep(&ec->mb_type_table[0]);
+    av_freep(&ec->mb_type_table[1]);
+    av_freep(&ec->ref_index_table[0][0]);
+    av_freep(&ec->ref_index_table[0][1]);
+    av_freep(&ec->ref_index_table[1][0]);
+    av_freep(&ec->ref_index_table[1][1]);
+
+    av_free(ec);
+}
+
+/**
+ * Allocates an entropy context together with its two-row tables.
+ *
+ * Table sizes are derived from h->mb_stride; the picture dimensions and the
+ * zigzag scan tables are copied from h.
+ *
+ * @param h decoder context supplying mb_width, mb_height, mb_stride and the
+ *          zigzag scan tables
+ * @return the new zero-initialised context, or NULL if any allocation fails
+ *         (nothing is leaked in that case)
+ */
+EntropyContext *get_entropy_context(H264Context *h){
+    const int mb_height = h->mb_height;
+    const int mb_width = h->mb_width;
+    const int mb_stride = h->mb_stride;
+
+    EntropyContext *ec = av_mallocz(sizeof(EntropyContext));
+
+    /* the fields below must not be written through a failed allocation */
+    if (!ec)
+        return NULL;
+
+    ec->mb_width = mb_width;
+    ec->mb_height = mb_height;
+    ec->b_stride = mb_width*4;
+    ec->mb_stride = mb_stride;
+
+    FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[0], mb_stride * 8 * sizeof(uint8_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[1], mb_stride * 8 * sizeof(uint8_t), fail);
+
+    FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][0], 16*mb_stride * sizeof(uint8_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][1], 16*mb_stride * sizeof(uint8_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][0], 16*mb_stride * sizeof(uint8_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][1], 16*mb_stride * sizeof(uint8_t), fail);
+
+    FF_ALLOCZ_OR_GOTO(ec->direct_table[0], 4*mb_stride * sizeof(uint8_t) , fail);
+    FF_ALLOCZ_OR_GOTO(ec->direct_table[1], 4*mb_stride * sizeof(uint8_t) , fail);
+
+    FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[0], mb_stride * sizeof(uint8_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[1], mb_stride * sizeof(uint8_t), fail);
+
+    FF_ALLOCZ_OR_GOTO(ec->cbp_table[0], mb_stride * sizeof(uint16_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->cbp_table[1], mb_stride * sizeof(uint16_t), fail);
+
+    /* init_entropy_buf() hands these two tables out shifted by one element
+     * (as it does for mb_type_table), so reserve that extra element here too */
+    FF_ALLOCZ_OR_GOTO(ec->qscale_table[0], (mb_stride+1) * sizeof(uint8_t) , fail);
+    FF_ALLOCZ_OR_GOTO(ec->qscale_table[1], (mb_stride+1) * sizeof(uint8_t) , fail);
+
+    FF_ALLOCZ_OR_GOTO(ec->mb_type_table[0] , (mb_stride+1) * sizeof(uint32_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->mb_type_table[1] , (mb_stride+1) * sizeof(uint32_t), fail);
+
+    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][0], 4*mb_stride * sizeof(int8_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][0], 4*mb_stride * sizeof(int8_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][1], 4*mb_stride * sizeof(int8_t), fail);
+    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][1], 4*mb_stride * sizeof(int8_t), fail);
+
+    ec->zigzag_scan = h->zigzag_scan;
+    ec->zigzag_scan8x8 = h->zigzag_scan8x8;
+
+    return ec;
+fail:
+    free_entropy_context(ec);
+    return NULL;
+}
+
+/**
+ * Selects which of the two row buffers is "current" and which is "top" for
+ * macroblock row line: the buffer with the parity of line becomes current,
+ * the other one becomes top.
+ *
+ * @param ec   entropy context whose row pointers are updated
+ * @param s    unused here
+ * @param line macroblock row index; only its lowest bit is used
+ */
+void init_entropy_buf(EntropyContext *ec, H264Slice *s, int line){
+    /* bit operations instead of %: same result for line >= 0, and never a
+     * negative index or a signed overflow on line+1 */
+    const int cur = line & 1;
+    const int top = cur ^ 1;
+
+    ec->non_zero_count_top = ec->non_zero_count_row[top];
+    ec->non_zero_count = ec->non_zero_count_row[cur];
+    ec->mvd_top[0] = ec->mvd_table[0][top];
+    ec->mvd[0] = ec->mvd_table[0][cur];
+    ec->mvd_top[1] = ec->mvd_table[1][top];
+    ec->mvd[1] = ec->mvd_table[1][cur];
+    ec->direct_top = ec->direct_table[top];
+    ec->direct = ec->direct_table[cur];
+    ec->chroma_pred_mode_top = ec->chroma_pred_mode_table[top];
+    ec->chroma_pred_mode = ec->chroma_pred_mode_table[cur];
+    ec->cbp_top = ec->cbp_table[top];
+    ec->cbp = ec->cbp_table[cur];
+    /* qscale and mb_type are shifted by one element, so index -1 is valid */
+    ec->qscale_top = ec->qscale_table[top] +1;
+    ec->qscale = ec->qscale_table[cur] +1;
+    ec->mb_type_top = ec->mb_type_table[top]+1;
+    ec->mb_type = ec->mb_type_table[cur]+1;
+    ec->ref_index_top[0] = ec->ref_index_table[0][top];
+    ec->ref_index_top[1] = ec->ref_index_table[1][top];
+    ec->ref_index[0] = ec->ref_index_table[0][cur];
+    ec->ref_index[1] = ec->ref_index_table[1][cur];
+
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_entropy.h
--- /dev/null Thu
Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_entropy.h Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,33 @@
+#ifndef H264_CABAC_H
+#define H264_CABAC_H
+
+#include "h264_types.h"
+#include "cabac.h"
+
+/**
+ * decodes a CABAC coded macroblock
+ * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
+ */
+
+int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c);
+void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c);
+
+/**
+ * Points the "current" and "top" row pointers of ec at its two row buffers,
+ * chosen by the parity of line. Returns nothing, matching the void definition.
+ */
+void init_entropy_buf(EntropyContext *ec, H264Slice *s, int line);
+
+/**
+ * Allocates an entropy context and its row tables, sized from h.
+ * @return the new context, or NULL if a table allocation fails
+ */
+EntropyContext * get_entropy_context(H264Context *h);
+void init_dequant_tables(H264Slice *s, EntropyContext *ec);
+
+/**
+ * Frees the row tables of ec and ec itself.
+ */
+void free_entropy_context(EntropyContext *ec);
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_idct.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_idct.c Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,270 @@
+/*
+ * H.264 IDCT
+ * Copyright (c) 2004 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 IDCT.
+ * @author Michael Niedermayer + */ + +#include "dsputil.h" +#include "h264_data.h" + +static av_always_inline void idct_internal(uint8_t *dst, DCTELEM *block, int stride, int block_stride, int shift, int add){ + int i; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + + block[0] += 1<<(shift-1); + + for(i=0; i<4; i++){ + const int z0= block[0 + block_stride*i] + block[2 + block_stride*i]; + const int z1= block[0 + block_stride*i] - block[2 + block_stride*i]; + const int z2= (block[1 + block_stride*i]>>1) - block[3 + block_stride*i]; + const int z3= block[1 + block_stride*i] + (block[3 + block_stride*i]>>1); + + block[0 + block_stride*i]= z0 + z3; + block[1 + block_stride*i]= z1 + z2; + block[2 + block_stride*i]= z1 - z2; + block[3 + block_stride*i]= z0 - z3; + } + + for(i=0; i<4; i++){ + const int z0= block[i + block_stride*0] + block[i + block_stride*2]; + const int z1= block[i + block_stride*0] - block[i + block_stride*2]; + const int z2= (block[i + block_stride*1]>>1) - block[i + block_stride*3]; + const int z3= block[i + block_stride*1] + (block[i + block_stride*3]>>1); + + dst[i + 0*stride]= cm[ add*dst[i + 0*stride] + ((z0 + z3) >> shift) ]; + dst[i + 1*stride]= cm[ add*dst[i + 1*stride] + ((z1 + z2) >> shift) ]; + dst[i + 2*stride]= cm[ add*dst[i + 2*stride] + ((z1 - z2) >> shift) ]; + dst[i + 3*stride]= cm[ add*dst[i + 3*stride] + ((z0 - z3) >> shift) ]; + } +} + +void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride){ + idct_internal(dst, block, stride, 4, 6, 1); +} + +void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block){ + idct_internal(dst, block, stride, 8, 3, 1); +} + +void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block){ + idct_internal(dst, block, stride, 8, 3, 0); +} + +void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){ + int i; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + + block[0] += 32; + + for( i = 0; i < 8; i++ ) + { + const int a0 = block[0+i*8] + block[4+i*8]; + const int a2 = 
block[0+i*8] - block[4+i*8]; + const int a4 = (block[2+i*8]>>1) - block[6+i*8]; + const int a6 = (block[6+i*8]>>1) + block[2+i*8]; + + const int b0 = a0 + a6; + const int b2 = a2 + a4; + const int b4 = a2 - a4; + const int b6 = a0 - a6; + + const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1); + const int a3 = block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1); + const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1); + const int a7 = block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1); + + const int b1 = (a7>>2) + a1; + const int b3 = a3 + (a5>>2); + const int b5 = (a3>>2) - a5; + const int b7 = a7 - (a1>>2); + + block[0+i*8] = b0 + b7; + block[7+i*8] = b0 - b7; + block[1+i*8] = b2 + b5; + block[6+i*8] = b2 - b5; + block[2+i*8] = b4 + b3; + block[5+i*8] = b4 - b3; + block[3+i*8] = b6 + b1; + block[4+i*8] = b6 - b1; + } + for( i = 0; i < 8; i++ ) + { + const int a0 = block[i+0*8] + block[i+4*8]; + const int a2 = block[i+0*8] - block[i+4*8]; + const int a4 = (block[i+2*8]>>1) - block[i+6*8]; + const int a6 = (block[i+6*8]>>1) + block[i+2*8]; + + const int b0 = a0 + a6; + const int b2 = a2 + a4; + const int b4 = a2 - a4; + const int b6 = a0 - a6; + + const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1); + const int a3 = block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1); + const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1); + const int a7 = block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1); + + const int b1 = (a7>>2) + a1; + const int b3 = a3 + (a5>>2); + const int b5 = (a3>>2) - a5; + const int b7 = a7 - (a1>>2); + + dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ]; + dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ]; + dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ]; + dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ]; + dst[i + 4*stride] = cm[ dst[i + 4*stride] 
+ ((b6 - b1) >> 6) ]; + dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ]; + dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ]; + dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ]; + } +} + +// assumes all AC coefs are 0 +void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ + int i, j; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + int dc = (block[0] + 32) >> 6; + for( j = 0; j < 4; j++ ) + { + for( i = 0; i < 4; i++ ) + dst[i] = cm[ dst[i] + dc ]; + dst += stride; + } +} + +void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ + int i, j; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + int dc = (block[0] + 32) >> 6; + for( j = 0; j < 8; j++ ) + { + for( i = 0; i < 8; i++ ) + dst[i] = cm[ dst[i] + dc ]; + dst += stride; + } +} + +void ff_h264_idct_add16_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i++){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride); + else idct_internal (dst + block_offset[i], block + i*16, stride, 4, 6, 1); + } + } +} + +void ff_h264_idct_add16intra_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i++){ + if(nnzc[ scan8[i] ]) idct_internal (dst + block_offset[i], block + i*16, stride, 4, 6, 1); + else if(block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride); + } +} + +void ff_h264_idct8_add4_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i+=4){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_c(dst + block_offset[i], block + i*16, stride); + else ff_h264_idct8_add_c (dst + block_offset[i], block + i*16, stride); + } + } +} + +void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block, 
int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=16; i<16+8; i++){ + if(nnzc[ scan8[i] ]) + ff_h264_idct_add_c (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); + else if(block[i*16]) + ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); + } +} + +/** +* IDCT transforms the 16 dc values and dequantizes them. +* @param qp quantization parameter +*/ +void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul){ + #define stride 16 + int i; + int temp[16]; //FIXME check if this is a good idea + static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; + static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; + + //return; + for(i=0; i<4; i++){ + const int offset= y_offset[i]; + const int z0= block[offset+stride*0] + block[offset+stride*4]; + const int z1= block[offset+stride*0] - block[offset+stride*4]; + const int z2= block[offset+stride*1] - block[offset+stride*5]; + const int z3= block[offset+stride*1] + block[offset+stride*5]; + + temp[4*i+0]= z0+z3; + temp[4*i+1]= z1+z2; + temp[4*i+2]= z1-z2; + temp[4*i+3]= z0-z3; + } + + for(i=0; i<4; i++){ + const int offset= x_offset[i]; + const int z0= temp[4*0+i] + temp[4*2+i]; + const int z1= temp[4*0+i] - temp[4*2+i]; + const int z2= temp[4*1+i] - temp[4*3+i]; + const int z3= temp[4*1+i] + temp[4*3+i]; + + block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual + block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); + block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); + block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); + } +} + +#undef xStride +#undef stride + +void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul){ + const int stride= 16*2; + const int xStride= 16; + int a,b,c,d,e; + + a= block[stride*0 + xStride*0]; + b= block[stride*0 + xStride*1]; + c= block[stride*1 + xStride*0]; + d= block[stride*1 + xStride*1]; + + e= a-b; + a= a+b; + b= c-d; + c= c+d; + + block[stride*0 + 
xStride*0]= ((a+c)*qmul) >> 7; + block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7; + block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7; + block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_idct.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_idct.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,19 @@ +#ifndef H264_IDCT_H +#define H264_IDCT_H + +#include "avcodec.h" + +void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block); +void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block); +void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul); +void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_mc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_mc.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,272 @@ +#include "h264_types.h" +#include "h264_data.h" + +static inline void mc_dir_part(MBRecContext *d, MBRecState *mrs, H264Mb *m, DecodedPicture *pic, int n, int square, + int chroma_height, int delta, int list,uint8_t *dest_y, + uint8_t *dest_cb, uint8_t *dest_cr, int src_x_offset, int src_y_offset, + qpel_mc_func *qpix_op, h264_chroma_mc_func 
chroma_op){ + const int mx= mrs->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; + const int my= mrs->mv_cache[list][ scan8[n] ][1] + src_y_offset*8; + const int luma_xy= (mx&3) + ((my&3)<<2); + const int pic_width = 16*d->mb_width; + const int pic_height = 16*d->mb_height; + + uint8_t *src_y, *src_cb, *src_cr; + int ymx= mx>>2; + int ymy= my>>2; + int cmy= my>>3; + int cmx= mx>>3; + + //truncate the motion vectors references + if(ymy>= pic_height+2){ + ymy=pic_height+1; + }else if(ymy <=-19){ + ymy=-18; + } + if(ymx>= pic_width+2){ + ymx= pic_width+1; + }else if(ymx<=-19){ + ymx=-19; + } + + src_y = pic->data[0] + ymx + ymy*d->linesize; + qpix_op[luma_xy](dest_y, src_y, d->linesize); //FIXME try variable height perhaps? + if(!square){ + qpix_op[luma_xy](dest_y + delta, src_y + delta, d->linesize); + } + + if(cmy >= pic_height>>1){ + cmy = (pic_height>>1) -1; + }else if(cmy<=-9){ + cmy=-8; + } + if(cmx >= pic_width>>1){ + cmx = (pic_width>>1) -1; + }else if(cmx<=-9){ + cmx=-8; + } + + src_cb= pic->data[1] + cmx + cmy*d->uvlinesize; + src_cr= pic->data[2] + cmx + cmy*d->uvlinesize; + + chroma_op(dest_cb, src_cb, d->uvlinesize, chroma_height, mx&7, my&7); + chroma_op(dest_cr, src_cr, d->uvlinesize, chroma_height, mx&7, my&7); +} + +static inline void mc_part_std(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, + int list0, int list1){ + qpel_mc_func *qpix_op= qpix_put; + h264_chroma_mc_func chroma_op= chroma_put; + + dest_y += 2*x_offset + 2*y_offset*d-> linesize; + dest_cb += x_offset + y_offset*d->uvlinesize; + dest_cr += x_offset + y_offset*d->uvlinesize; + x_offset += 8*m->mb_x; + y_offset += 8*m->mb_y; + + if(list0){ + DecodedPicture *ref= s->dp_ref_list[0][ mrs->ref_cache[0][ scan8[n] ] ]; + mc_dir_part(d, 
mrs, m, ref, n, square, chroma_height, delta, 0, + dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op); + + qpix_op= qpix_avg; + chroma_op= chroma_avg; + } + + if(list1){ + DecodedPicture *ref= s->dp_ref_list[1][ mrs->ref_cache[1][ scan8[n] ] ]; + mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, 1, + dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op); + } +} + +static inline void mc_part_weighted(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, + h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, + int list0, int list1){ + dest_y += 2*x_offset + 2*y_offset*d-> linesize; + dest_cb += x_offset + y_offset*d->uvlinesize; + dest_cr += x_offset + y_offset*d->uvlinesize; + x_offset += 8*m->mb_x; + y_offset += 8*m->mb_y; + + if(list0 && list1){ + /* don't optimize for luma-only case, since B-frames usually + * use implicit weights => chroma too. 
*/ + uint8_t *tmp_y = d->scratchpad_y + 2*x_offset +16 ; + uint8_t *tmp_cb = d->scratchpad_cb + x_offset + 8; + uint8_t *tmp_cr = d->scratchpad_cr + x_offset + 8; + +/* + uint8_t *tmp_cb = d->scratchpad; + uint8_t *tmp_cr = d->scratchpad + 8; + uint8_t *tmp_y = d->scratchpad + 8*d->uvlinesize;*/ + int refn0 = mrs->ref_cache[0][ scan8[n] ]; + int refn1 = mrs->ref_cache[1][ scan8[n] ]; + + mc_dir_part(d, mrs, m, s->dp_ref_list[0][refn0], n, square, chroma_height, delta, 0, + dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put); + mc_dir_part(d, mrs, m, s->dp_ref_list[1][refn1], n, square, chroma_height, delta, 1, + tmp_y, tmp_cb, tmp_cr, x_offset, y_offset, qpix_put, chroma_put); + + if(s->use_weight == 2){ + int weight0 = s->implicit_weight[refn0][refn1][m->mb_y&1]; + int weight1 = 64 - weight0; + luma_weight_avg( dest_y, tmp_y, d-> linesize, 5, weight0, weight1, 0); + chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, 5, weight0, weight1, 0); + chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, 5, weight0, weight1, 0); + }else{ + luma_weight_avg(dest_y, tmp_y, d->linesize, s->luma_log2_weight_denom, + s->luma_weight[refn0][0][0] , s->luma_weight[refn1][1][0], + s->luma_weight[refn0][0][1] + s->luma_weight[refn1][1][1]); + chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, s->chroma_log2_weight_denom, + s->chroma_weight[refn0][0][0][0] , s->chroma_weight[refn1][1][0][0], + s->chroma_weight[refn0][0][0][1] + s->chroma_weight[refn1][1][0][1]); + chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, s->chroma_log2_weight_denom, + s->chroma_weight[refn0][0][1][0] , s->chroma_weight[refn1][1][1][0], + s->chroma_weight[refn0][0][1][1] + s->chroma_weight[refn1][1][1][1]); + } + }else{ + int list = list1 ? 
1 : 0; + int refn = mrs->ref_cache[list][ scan8[n] ]; + DecodedPicture *ref= s->dp_ref_list[list][refn]; + mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, list, + dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put); + + luma_weight_op(dest_y, d->linesize, s->luma_log2_weight_denom, + s->luma_weight[refn][list][0], s->luma_weight[refn][list][1]); + if(s->use_weight_chroma){ + chroma_weight_op(dest_cb, d->uvlinesize, s->chroma_log2_weight_denom, + s->chroma_weight[refn][list][0][0], s->chroma_weight[refn][list][0][1]); + chroma_weight_op(dest_cr, d->uvlinesize, s->chroma_log2_weight_denom, + s->chroma_weight[refn][list][1][0], s->chroma_weight[refn][list][1][1]); + } + } +} + +static inline void mc_part(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta, + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + int x_offset, int y_offset, + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, + qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, + h264_weight_func *weight_op, h264_biweight_func *weight_avg, + int list0, int list1){ + if((s->use_weight==2 && list0 && list1 + && (s->implicit_weight[ mrs->ref_cache[0][scan8[n]] ][ mrs->ref_cache[1][scan8[n]] ][m->mb_y&1] != 32)) + || s->use_weight==1) + mc_part_weighted(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, + x_offset, y_offset, qpix_put, chroma_put, + weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1); + else + mc_part_std(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, + x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1); +} + +static inline void prefetch_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int list){ + /* fetch pixels for estimated mv 4 macroblocks ahead + * optimized for 64byte cache lines */ + const int refn = mrs->ref_cache[list][scan8[0]]; + + if(refn >= 0){ + const int mx= 
(mrs->mv_cache[list][scan8[0]][0]>>2) + 16*m->mb_x + 8; + const int my= (mrs->mv_cache[list][scan8[0]][1]>>2) + 16*m->mb_y; + uint8_t **src= s->dp_ref_list[list][refn]->data; + int off= mx + (my + (m->mb_x&3)*4)*d->linesize + 64; + + d->dsp.prefetch(src[0]+off, d->linesize, 4); + off= (mx>>1) + ((my>>1) + (m->mb_x&7))*d->uvlinesize + 64; + d->dsp.prefetch(src[1]+off, src[2]-src[1], 2); + } +} + +void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, + qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), + qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), + h264_weight_func *weight_op, h264_biweight_func *weight_avg){ + const int mb_type= m->mb_type; + assert(IS_INTER(mb_type)); + + if (mb_type & MB_TYPE_L0) + prefetch_motion(d, mrs, s, m, 0); + if (mb_type & MB_TYPE_L1) + prefetch_motion(d, mrs, s, m, 1); + + if(IS_16X16(mb_type)){ + mc_part(d, mrs, s, m, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, + qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], + weight_op, weight_avg, + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); + }else if(IS_16X8(mb_type)){ + mc_part(d, mrs, s, m, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0, + qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], + &weight_op[1], &weight_avg[1], + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); + mc_part(d, mrs, s, m, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4, + qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], + &weight_op[1], &weight_avg[1], + IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); + }else if(IS_8X16(mb_type)){ + mc_part(d, mrs, s, m, 0, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 0, 0, + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], + &weight_op[2], &weight_avg[2], + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); + mc_part(d, mrs, s, m, 4, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 4, 0, + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], + &weight_op[2], 
&weight_avg[2], + IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); + }else{ + int i; + + assert(IS_8X8(mb_type)); + + for(i=0; i<4; i++){ + const int sub_mb_type= m->sub_mb_type[i]; + const int n= 4*i; + int x_offset= (i&1)<<2; + int y_offset= (i&2)<<1; + + if(IS_SUB_8X8(sub_mb_type)){ + mc_part(d, mrs, s, m, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], + &weight_op[3], &weight_avg[3], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + }else if(IS_SUB_8X4(sub_mb_type)){ + mc_part(d, mrs, s, m, n, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset, + qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], + &weight_op[4], &weight_avg[4], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + mc_part(d, mrs, s, m, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, + qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], + &weight_op[4], &weight_avg[4], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + }else if(IS_SUB_4X8(sub_mb_type)){ + mc_part(d, mrs, s, m, n, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], + &weight_op[5], &weight_avg[5], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + mc_part(d, mrs, s, m, n+1, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], + &weight_op[5], &weight_avg[5], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + }else{ + int j; + assert(IS_SUB_4X4(sub_mb_type)); + for(j=0; j<4; j++){ + int sub_x_offset= x_offset + 2*(j&1); + int sub_y_offset= y_offset + (j&2); + mc_part(d, mrs, s, m, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], + &weight_op[6], &weight_avg[6], + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); + } + } + } + } +} diff -r 11d15c47beaf -r 897f711a7157 
libavcodec/h264_mc.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_mc.h Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,12 @@
+#ifndef H264_MC_H
+#define H264_MC_H
+
+#include "dsputil.h"
+#include "h264_types.h"
+
+void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+               qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+               qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+               h264_weight_func *weight_op, h264_biweight_func *weight_avg);
+
+#endif
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_misc.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_misc.c Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,944 @@
+#include "config.h"
+
+#include "h264_types.h"
+
+#include
+#include
+#include
+#include
+#include
+#undef NDEBUG
+#include
+
+#if HAVE_LIBSDL2
+#include
+#if HAVE_LIBSDL_TTF
+#include
+#endif
+#endif
+
+/**
+ * Records the start timestamp (CLOCK_REALTIME) of pipeline stage `stage`.
+ */
+void start_timer(H264Context *h, int stage){
+    clock_gettime(CLOCK_REALTIME, &h->start_time[stage]);
+}
+
+/**
+ * Records the end timestamp of pipeline stage `stage` and stores the time
+ * elapsed since start_timer(), in milliseconds, in last_time[stage]; the same
+ * amount is added to total_time[stage].
+ */
+void stop_timer(H264Context *h, int stage){
+    clock_gettime(CLOCK_REALTIME, &h->end_time[stage]);
+    double time = (double) 1.e3*(h->end_time[stage].tv_sec - h->start_time[stage].tv_sec) + 1.e-6*(h->end_time[stage].tv_nsec - h->start_time[stage].tv_nsec);
+    h->last_time [stage] = time;
+    h->total_time[stage] += time;
+}
+
+/**
+ * Allocates the macroblock array of a slice buffer entry (one H264Mb per
+ * macroblock of the picture). The entry is marked initialized only if the
+ * allocation succeeded.
+ */
+void init_sb_entry(H264Context *h, SliceBufferEntry *sbe){
+    /* widen before multiplying so the byte count is computed in size_t */
+    sbe->mbs = av_malloc((size_t)h->mb_width * h->mb_height * sizeof(H264Mb));
+    sbe->initialized = sbe->mbs != NULL;
+}
+
+/**
+ * Frees the buffers owned by a slice buffer entry and clears the pointers,
+ * so a repeated call or a later init_sb_entry() is safe.
+ */
+void free_sb_entry(SliceBufferEntry *sbe){
+    av_freep(&sbe->mbs);
+    av_freep(&sbe->gb.raw);
+    av_freep(&sbe->gb.rbsp);
+    sbe->initialized = 0;
+}
+
+/**
+ * Blocks until a slice buffer entry is free, claims the first one with
+ * state 0 and returns it with its slice header zeroed.
+ */
+SliceBufferEntry *get_sb_entry(H264Context *h){
+    SliceBufferEntry *sb = NULL;
+
+    pthread_mutex_lock(&h->lock[PARSE]);
+    while (h->free_sb_cnt<=0)
+        pthread_cond_wait(&h->cond[PARSE], &h->lock[PARSE]);
+    /* use first free picture */
+    for(int i=0; i<h->sb_size; i++){
+        if(h->sb[i].state==0){
+            sb= &h->sb[i];
+            sb->state=1;
+            sb->lines_taken=0;
+            sb->lines_total=h->mb_height;
+            break;
+        }
+    }
+    /* free_sb_cnt > 0 promises a free entry; stop here rather than write
+     * through NULL below if the counter and the states ever disagree */
+    assert(sb);
+    h->free_sb_cnt--;
+
+    pthread_mutex_unlock(&h->lock[PARSE]);
+
+    memset (&sb->slice, 0, sizeof(H264Slice));
+
+    return sb;
+}
+
+/**
+ * Marks a slice buffer entry as free again and wakes one waiter of
+ * get_sb_entry().
+ */
+void release_sb_entry(H264Context *h, SliceBufferEntry *sb){
+    pthread_mutex_lock(&h->lock[PARSE]);
+
+    sb->state = 0;
+    h->free_sb_cnt++;
+    pthread_cond_signal(&h->cond[PARSE]);
+
+    pthread_mutex_unlock(&h->lock[PARSE]);
+}
+
+/**
+ * Frees the pixel planes and the per-macroblock tables of pic and clears
+ * every pointer to them, so init_dpb_entry() sees the picture as unallocated.
+ */
+static void release_dp_buffers(DecodedPicture *pic){
+    if(pic->base[0]){
+        for (int i=0; i<3; i++){
+            av_freep(&pic->base[i]);
+            pic->data[i]= NULL;
+        }
+    }
+    if (pic->mb_type_base){
+        av_freep(&pic->mb_type_base);
+        pic->mb_type= NULL;
+        for(int i=0; i<2; i++){
+            av_freep(&pic->motion_val_base[i]);
+            pic->motion_val[i]= NULL;
+            av_freep(&pic->ref_index[i]);
+        }
+        av_freep(&pic->intra4x4_pred_mode);
+    }
+}
+
+/**
+ * Binds pic to slice s (copies poc, key_frame, mmco_reset, coded picture
+ * number, sets the reference flags) and allocates its buffers on first use.
+ *
+ * @param width  picture width without borders; EDGE_WIDTH is added on each side
+ * @param height picture height without borders; EDGE_WIDTH is added on each side
+ * @return 0 on success, -1 if an allocation failed; in that case every buffer
+ *         of pic has been released again
+ */
+int init_dpb_entry(H264Context *h, DecodedPicture *pic, H264Slice *s, int width, int height){
+    const int big_mb_num= h->mb_stride*(h->mb_height+1) + 1; //the +1 is needed so memset(,,stride*height) does not sig11
+    const int mb_array_size= h->mb_stride*h->mb_height;
+    const int b4_array_size= h->b4_stride*h->mb_height*4;
+    int i;
+
+    s->curr_pic=pic;
+    pic->poc = s->poc;
+    pic->key_frame = s->key_frame;
+    pic->mmco_reset = s->mmco_reset;
+    pic->reference = s->nal_ref_idc? 3:1;
+    pic->cpn = s->coded_pic_num;
+
+    if(pic->data[0]==NULL) {
+        int size[3] = {0};
+
+        width+= EDGE_WIDTH*2;
+        height+= EDGE_WIDTH*2;
+
+        pic->linesize[0]= width;
+        pic->linesize[1]= pic->linesize[2] = width>>1;
+
+        size[0] = width*height;
+        size[1] = size[2] = width*height>>2;
+
+        for(i=0; i<3; i++){
+            pic->base[i]= av_malloc(size[i]);
+            /* data[] is derived from base[] below; never offset a NULL plane */
+            if(pic->base[i]==NULL)
+                goto fail;
+        }
+
+        pic->data[0] = pic->base[0] + (pic->linesize[0]*EDGE_WIDTH) + EDGE_WIDTH;
+        pic->data[1] = pic->base[1] + (pic->linesize[1]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1);
+        pic->data[2] = pic->base[2] + (pic->linesize[2]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1);
+    }
+
+    if(pic->mb_type_base==NULL){
+        FF_ALLOCZ_OR_GOTO(pic->mb_type_base , big_mb_num * sizeof(uint32_t), fail)
+        pic->mb_type= pic->mb_type_base + h->mb_stride+1;
+
+        for(i=0; i<2; i++){
+            FF_ALLOCZ_OR_GOTO(pic->motion_val_base[i], 2 * (b4_array_size+4) * sizeof(int16_t), fail)
+            pic->motion_val[i]= pic->motion_val_base[i]+4;
+            FF_ALLOCZ_OR_GOTO(pic->ref_index[i], 4*mb_array_size * sizeof(uint8_t), fail)
+        }
+        FF_ALLOCZ_OR_GOTO(pic->intra4x4_pred_mode, h->mb_width*h->mb_height * 4* sizeof(int8_t), fail)
+    }
+
+    return 0;
+fail:
+    /* a half-allocated picture would pass the NULL checks above on the next
+     * call and be used with missing tables, so drop everything */
+    release_dp_buffers(pic);
+    return -1;
+}
+
+/**
+ * Frees all buffers of a decoded picture. The picture can be passed to
+ * init_dpb_entry() again afterwards; calling free_dp() twice is harmless.
+ */
+void free_dp(DecodedPicture *pic){
+    release_dp_buffers(pic);
+}
+
+/**
+ * Blocks until a decoded picture buffer entry is free, takes the first entry
+ * whose reference field is 0 and initializes it for slice s.
+ *
+ * @return the claimed entry; the function aborts via assert if no entry is
+ *         found or its buffers cannot be allocated
+ */
+DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s){
+    DecodedPicture *dp = NULL;
+    int ret;
+
+    pthread_mutex_lock(&h->lock[REORDER2]);
+    while (h->free_dpb_cnt<=0){
+        #if OMPSS
+        assert(0);
+        #endif
+        pthread_cond_wait(&h->cond[REORDER2], &h->lock[REORDER2]);
+    }
+    /* use first free picture */
+    for(int i=0; i<h->max_dpb_cnt; i++){
+        if(h->dpb[i].reference==0){
+            dp= &h->dpb[i];
+            break;
+        }
+    }
+    assert(dp);
+    ret = init_dpb_entry(h, dp, s, h->width, h->height);
+    /* callers receive dp unconditionally; fail loudly instead of handing out
+     * a picture without buffers */
+    assert(ret == 0);
+    h->free_dpb_cnt--;
+    h->acdpb_cnt++; //debug
+    pthread_mutex_unlock(&h->lock[REORDER2]);
+
+    return dp;
+}
+
+/**
+ * Clears the bits given in mode from pic->reference. When no bit is left the
+ * entry counts as free again and one waiter of get_dpb_entry() is woken.
+ */
+void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode){
+    pthread_mutex_lock(&h->lock[REORDER2]);
+    pic->reference &= ~mode;
+    if (pic->reference == 0){
+        h->free_dpb_cnt++;
+        h->reldpb_cnt++; //debug
+        pthread_cond_signal(&h->cond[REORDER2]);
+    }
+    pthread_mutex_unlock(&h->lock[REORDER2]);
+}
+
+
+/**
+* Extends the edges of a macroblock line.
+*/ +void draw_edges(MBRecContext *d, H264Slice *s, int line){ + int i; + int mb_width=d->mb_width; + int mb_height=d->mb_height; + int last = (line+1 == mb_height); + int lines = last?16:12; + int linesize = d->linesize; + int uvlinesize = d->uvlinesize; + uint8_t *y = s->curr_pic->data[0] + 16*line*linesize; + uint8_t *cb = s->curr_pic->data[1] + 8*line*uvlinesize; + uint8_t *cr = s->curr_pic->data[2] + 8*line*uvlinesize; + + for (i=-4; idelayed_pic[0]; + + if (!out) + return NULL; + + for(i=1; w->delayed_pic[i] && !w->delayed_pic[i]->key_frame && !w->delayed_pic[i]->mmco_reset; i++){ + if(w->delayed_pic[i]->poc < out->poc){ + out = w->delayed_pic[i]; + out_idx = i; + } + } + + if(w->dp_cnt > MAX_DELAYED_PIC_COUNT || flush) { + for(i=out_idx; w->delayed_pic[i]; i++) + w->delayed_pic[i] = w->delayed_pic[i+1]; + w->dp_cnt--; + return out; + } + return NULL; +} + +/** +* Remove the extra borders, and places the three parts of the image after each other. +*/ +static int raw_encode(const DecodedPicture* src, int width, int height, unsigned char *dest) { + int i, j; +/** To write entire image including extra borders*/ +// int w = src->linesize[0]; +// int h = height+64; +// int w2 = w>>1; +// int h2 = h>>1; +// int data_planes=3; +// int size = w * h + 2 *w2*h2; +// const unsigned char* s; +// for (i=0; ibase[i]; +// for(j=0; jlinesize[i]); +// dest += w; +// s += src->linesize[i]; +// } +// } + + int w = (width*8 + 7)/8; + int h = height; + int w2 =((width >>1) * 8 + 7) / 8; + int h2 = ((height+1) >>1); //not sure about +1 + int data_planes=3; + int size = w * h + 2 *w2*h2; + const unsigned char* s; + + + for (i=0; idata[i]; + for(j=0; jlinesize[i]; + } + } + return size; +} + +#ifdef HAVE_LIBSDL2 +static SDL_Texture *get_next_texture(H264Context *h, int side){ + SDLTextureQueue *sdlq = &h->sdlq; + SDL_Texture *texture; + pthread_mutex_lock (&sdlq->sdl_lock); + if (side ){ //send + while (sdlq->ready >= sdlq->size) + pthread_cond_wait(&sdlq->sdl_cond, 
&sdlq->sdl_lock); + texture = sdlq->queue[sdlq->fi]; + sdlq->fi++; sdlq->fi %= sdlq->size; + } else { //recv + while (sdlq->ready <= 0 && !sdlq->exit) + pthread_cond_wait(&sdlq->sdl_cond, &sdlq->sdl_lock); + + if (sdlq->ready == 0 && sdlq->exit){ + texture = NULL; + }else{ + texture = sdlq->queue[sdlq->fo]; + sdlq->fo++; sdlq->fo %= sdlq->size; + } + } + pthread_mutex_unlock(&sdlq->sdl_lock); + + return texture; +} + +static void signal_texture(H264Context *h, int side){ + SDLTextureQueue *sdlq = &h->sdlq; + pthread_mutex_lock (&sdlq->sdl_lock); + if (side) + sdlq->ready++; + else + sdlq->ready--; + pthread_cond_signal(&sdlq->sdl_cond); + pthread_mutex_unlock(&sdlq->sdl_lock); +} + +void signal_sdl_exit(H264Context *h){ + SDLTextureQueue *sdlq = &h->sdlq; + pthread_mutex_lock (&sdlq->sdl_lock); + sdlq->exit=1; + pthread_cond_signal(&sdlq->sdl_cond); + pthread_mutex_unlock(&sdlq->sdl_lock); +} + +static void display_frame(H264Context *h, OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height, int dropable){ + static int64_t last_time = -1; + int64_t cur_time; +// SDLContext *sdlc = h->sdlc; + uint8_t *iyuv_pixels; + int pitch; + + + if (last_time == -1){ + last_time = av_gettime(); + } + + + /* do not display frames that are less than 8.125 ms apart (120fps)*/ + if (dropable){ + cur_time = av_gettime(); + + if ((cur_time - last_time) < 8125) + return; + + last_time =cur_time; + } + + if(in_picture){ + + SDL_Texture *texture= get_next_texture(h, 1); + + SDL_LockTexture( texture, NULL, (void **)&iyuv_pixels, &pitch ); + + raw_encode(in_picture, frame_width, frame_height, iyuv_pixels); + + signal_texture(h, 1); + } +} +#endif + +// TODO: Parallelize the raw_encode (either split frame or over frames) +static void do_video_out(OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height) { + int size=0; + //remove extra borders + + if(in_picture) + size= raw_encode(in_picture, frame_width, frame_height, 
w->bit_buffer); + + if (size < 0) { + fprintf(stderr, "Video encoding failed\n"); + }else { + if (write(fd, w->bit_buffer, size)<0) + fprintf(stderr, "Write frame failed\n"); + } + + w->video_size += size; +} + +DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height) { + DecodedPicture *out; + + if (pic){ + oc->delayed_pic[oc->dp_cnt++]=pic; + out = get_reordered_picture(oc, 0); + }else{ + out = get_reordered_picture(oc, 1); + } + + if (out){ + if (fd){ + do_video_out(oc, fd, out, frame_width, frame_height); + }else{ +#ifdef HAVE_LIBSDL2 + if (h->display){ + display_frame(h, oc, fd, out, frame_width, frame_height, !(pic==NULL)); + } +#endif + } + oc->frame_number++; + } + + return out; +} + +OutputContext *get_output_context(H264Context *h){ + const int frame_width=h->frame_width; + const int frame_height=h->frame_height; + const int frame_size = frame_width*frame_height; + + OutputContext *oc = av_mallocz(sizeof(OutputContext)); + oc->bit_buffer_size= FFMAX(1024*256, frame_size*2); // oversize a little bit to allow extra border write + oc->bit_buffer= av_mallocz(oc->bit_buffer_size); + + return oc; +} + +void free_output_context(OutputContext *oc){ + + av_free(oc->bit_buffer); + av_free(oc); +} + +SuperMBContext *getSuperMBContext(H264Context *h, int smb_width, int smb_height){ + SuperMBContext *smbc = av_mallocz(sizeof(SuperMBContext)); + + smbc->smb_width = smb_width; + smbc->smb_height = smb_height; + + smbc->nsmb_height = h->mb_height / smbc->smb_height + (h->mb_height%smbc->smb_height ? 
1:0); //only need one extra if mb_height was not dividable + smbc->nsmb_width = h->mb_width / smbc->smb_width; + while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width ) + smbc->nsmb_width++; + + smbc->nsmb_3dheight= smbc->nsmb_height - ((h->mb_height/2)/smbc->smb_height +1); //assuming max motion vector of half the height + + smbc->smbs[0] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask)); + smbc->smbs[1] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask)); + for (int y=0, i=0; insmb_height; i++, y+=smbc->smb_height){ + for (int x=0, j=0; jnsmb_width; j++, x+=smbc->smb_width){ + smbc->smbs[0][i*smbc->nsmb_width +j].smb_y = y; + smbc->smbs[0][i*smbc->nsmb_width +j].smb_x = x; + smbc->smbs[1][i*smbc->nsmb_width +j].smb_y = y; + smbc->smbs[1][i*smbc->nsmb_width +j].smb_x = x; + } + } + + smbc->refcount = 1; + + return smbc; +} + +void freeSuperMBContext(SuperMBContext *smbc){ + av_free(smbc->smbs[0]); + av_free(smbc->smbs[1]); + av_free(smbc); +} + +SuperMBContext * acquire_smbc(H264Context *h ){ + SuperMBContext *smbc; + + pthread_mutex_lock (&h->smb_lock); + smbc = h->smbc; + smbc->refcount++; + pthread_mutex_unlock(&h->smb_lock); + return smbc; +} + +void release_smbc(H264Context *h, SuperMBContext *smbc){ + pthread_mutex_lock (&h->smb_lock); + smbc->refcount--; + if (smbc->refcount==0){ + freeSuperMBContext(smbc); + } + pthread_mutex_unlock(&h->smb_lock); + +} + + +#ifdef HAVE_LIBSDL2 + +// #if OMPSS +static void draw_sb_border(H264Context *h, uint32_t *rgba_pixels, int smb_x, int smb_y){ + int mb_width = h->mb_width; + int mb_height = h->mb_height; + int width = h->frame_width; + int height = h->frame_height; + + int mb_x = smb_x * h->smb_width; + int mb_y = smb_y * h->smb_height; + + uint32_t pix= 0x0000FFC0; + + for (int k=0, i=mb_y; i< mb_y + h->smb_height; i++, k++){ + for (int l=0, j=mb_x -k ; j< mb_x - k + h->smb_width; j++, l++){ + //outside frame + if (i<0 || i>=mb_height || j<0 || 
j>=mb_width) { + continue; + } + + //draw top + if (i==0 || k==0 || l==0){ + int mx = j*16; + int my = i*16; + uint32_t *top = rgba_pixels + my*width + mx; + int endx = mx+16 < width? 16: width-mx; + + for (int x = 0; xsmb_height-1 || l==h->smb_width-1){ + int mx = j*16; + int my = i*16 + 15; my = my < height ? my: height-1; + uint32_t *bottom = rgba_pixels + my*width + mx; + int endx = mx+16 < width? 16: width-mx; + + for (int x = 0; xsmb_width-1 ){ + int mx = j*16 + 15; mx = mx < width ? mx: width-1; + int my = i*16; + uint32_t *right = rgba_pixels + my*width + mx; + int endy = my +16 < height ? 16: height - my; + + for (int y = 0; ysbmap_texture; + + SDL_LockTexture( sbmap, NULL, (void **)&rgba_pixels, &pitch ); + + memset (rgba_pixels, 0, pitch * h->height); + for (int i=0; i< smbc->nsmb_height; i++){ + for (int j=0; j< smbc->nsmb_width; j++){ + draw_sb_border(h, rgba_pixels, j, i); + } + } + + SDL_UnlockTexture( sbmap ); +} +// #endif + +// static void calc_sb_sizes (H264Context *h, SuperMBContext *smbc){ +// smbc->smb_height = h->smb_height; +// smbc->smb_width = h->smb_width; +// +// smbc->nsmb_height = h->mb_height / smbc->smb_height + (h->mb_height%smbc->smb_height ? 
1:0); //only need one extra if mb_height was not dividable +// smbc->nsmb_width = h->mb_width / smbc->smb_width; +// while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width ) +// smbc->nsmb_width++; +// } + + +static void handle_key_event(H264Context *h, SDLContext *sdlc, SDL_Keysym keysym){ + int arrow=0; + + switch (keysym.sym){ + case SDLK_ESCAPE: + if (sdlc->fullscreen){ + SDL_SetWindowFullscreen(sdlc->window, SDL_FALSE); + sdlc->fullscreen = 0; + } + break; + case SDLK_SPACE: + pthread_mutex_lock(&h->sdl_lock); + sdlc->pause = !sdlc->pause; + pthread_cond_signal(&h->sdl_cond); + pthread_mutex_unlock(&h->sdl_lock); + break; + case SDLK_f: + if (!sdlc->fullscreen){ + if (keysym.mod == KMOD_LCTRL){ +// SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full); + SDL_SetWindowFullscreen(sdlc->window, SDL_TRUE); + + sdlc->fullscreen = 1; + } + } + break; + case SDLK_m: + sdlc->showmap = !sdlc->showmap; + break; + case SDLK_UP: + if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height < h->mb_height && h->smb_height < h->smb_width){ + h->smb_height++; + arrow =1; + } + break; + case SDLK_DOWN: + if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height > 1 ){ + h->smb_height--; + arrow =1; + } + break; + case SDLK_LEFT: + if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width > 1 && h->smb_width > h->smb_height){ + h->smb_width--; + arrow =1; + } + break; + case SDLK_RIGHT: + if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width < h->mb_width){ + h->smb_width++; + arrow =1; + } + break; + } + + if (arrow){ + SuperMBContext *smbc = getSuperMBContext(h, h->smb_width, h->smb_height); + pthread_mutex_lock(&h->smb_lock); + h->smbc->refcount--; + if (h->smbc->refcount == 0) + freeSuperMBContext(h->smbc); + h->smbc = smbc; + sdlc->updatemap =1; + pthread_mutex_unlock(&h->smb_lock); + } +} + +void handle_window_event(H264Context *h, SDLContext *sdlc, SDL_WindowEvent winevent){ + SDL_Rect nrect; + switch (winevent.event){ + case 
SDL_WINDOWEVENT_RESIZED: + + sdlc->win_w = winevent.data1; + sdlc->win_h = winevent.data2; + + double aspect = (double) sdlc->win_w/ sdlc->win_h; + if ( aspect < sdlc->aspect){ + double r = (double) sdlc->win_w / sdlc->rect.w; + double h = (double) sdlc->rect.h * r; + + nrect.y = lrint(( (double) sdlc->win_h - h)/2); + nrect.h = lrint(h); + + nrect.x=0; + nrect.w= sdlc->win_w; + + }else { + double r = (double) sdlc->win_h / sdlc->rect.h; + double w = (double) sdlc->rect.w * r; + + nrect.x = lrint(( (double) sdlc->win_w - w)/2); + nrect.w = lrint(w); + + nrect.y=0; + nrect.h= sdlc->win_h; + } + //prob better to lock + sdlc->win_rect = nrect; + sdlc->resized=1; + break; + } +} + +void *sdl_event_listen_thread(void *arg){ + H264Context *h = (H264Context *) arg; + SDLContext *sdlc = h->sdlc; + SDL_Event event; + + while ( SDL_WaitEvent(&event) ) { + switch (event.type) { + case SDL_KEYDOWN: + handle_key_event(h, sdlc, event.key.keysym); + break; + case SDL_WINDOWEVENT: + handle_window_event(h, sdlc, event.window); + break; + case SDL_QUIT: + h->quit=1; + goto finish; + } + } +finish: + pthread_exit(NULL); + return NULL; +} + +//XInitThreads not called in SDL2 library, causes crash +//remove in future when fixed ... 
+#include + +SDLContext *get_SDL_context(H264Context *h){ + const int frame_width=h->frame_width; + const int frame_height=h->frame_height; + + SDLContext *sdlc = av_mallocz(sizeof(SDLContext)); + sdlc->display = h->display; + sdlc->fullscreen = h->fullscreen; + + sdlc->aspect = (double) frame_width / (double) frame_height; + sdlc->rect.x =0; + sdlc->rect.y =0; + sdlc->rect.w =frame_width; + sdlc->rect.h =frame_height; + + XInitThreads(); //workaround + + // Initializes the video subsystem + if (SDL_Init(SDL_INIT_VIDEO) < 0) { + fprintf(stderr, "Unable to init SDL: %s\n", SDL_GetError()); + #undef exit + exit(-1); + } + SDL_SetHint("SDL_HINT_RENDER_SCALE_QUALITY", "best"); + SDL_SetHint("SDL_HINT_RENDER_OPENGL_SHADERS", "1"); + + SDL_GetDesktopDisplayMode(0, &sdlc->full); + sdlc->full.format = SDL_PIXELFORMAT_IYUV; + + sdlc->wind = sdlc->full; + if (sdlc->wind.w > frame_width) sdlc->wind.w = frame_width; + if (sdlc->wind.h > frame_height) sdlc->wind.h = frame_height; + + sdlc->win_rect.x =0; + sdlc->win_rect.y =0; + sdlc->win_rect.w =sdlc->wind.w; + sdlc->win_rect.h =sdlc->wind.h; + + if (sdlc->fullscreen){ + sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, sdlc->full.w, sdlc->full.h, SDL_WINDOW_FULLSCREEN|SDL_WINDOW_SHOWN|SDL_WINDOW_RESIZABLE); + SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full); + } else { + sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, sdlc->wind.w, sdlc->wind.h, SDL_WINDOW_RESIZABLE|SDL_WINDOW_SHOWN); + SDL_SetWindowDisplayMode (sdlc->window, &sdlc->wind); + } + + sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_ACCELERATED); +// sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_SOFTWARE); + + h->sdlq.queue[0] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height); + h->sdlq.queue[1] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, 
SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height); + + sdlc->sbmap_texture = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_RGBA8888, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height); + SDL_SetTextureBlendMode(sdlc->sbmap_texture, SDL_BLENDMODE_BLEND); + sdlc->updatemap = 1; + +#if HAVE_LIBSDL_TTF + //not working with SDL 2.0, try again in future when supported + if(TTF_Init()==-1) { + printf("TTF_Init: %s\n", TTF_GetError()); + exit(2); + } + + // Load a font + TTF_Font *font; + font = TTF_OpenFont("/usr/share/fonts/truetype/freefont/FreeSans.ttf", 24); + if (font == NULL) + { + printf("TTF_OpenFont() Failed: %s\n", TTF_GetError()); + TTF_Quit(); + exit(1); + } +#endif + + pthread_create(&sdlc->listen_thread, NULL, sdl_event_listen_thread, h); + + return sdlc; + +} + +void free_SDL_context(H264Context *h){ + SDLContext *sdlc = h->sdlc; + pthread_join(sdlc->listen_thread, NULL); + +#if HAVE_LIBSDL_TTF + TTF_Quit(); +#endif + SDL_DestroyTexture(h->sdlq.queue[0]); + SDL_DestroyTexture(h->sdlq.queue[1]); + SDL_DestroyTexture(sdlc->sbmap_texture); + SDL_DestroyRenderer(sdlc->renderer); + SDL_DestroyWindow(sdlc->window); + SDL_Quit(); + +} + +void *sdl_thread(void *arg){ + H264Context *h = (H264Context *) arg; + + SDLContext *sdlc = get_SDL_context(h); + h->sdlc = sdlc; + + signal_texture(h, 0); + signal_texture(h, 0); + + SDL_Texture *texture; + for (;;){ + pthread_mutex_lock(&h->sdl_lock); + while (sdlc->pause){ + pthread_cond_wait(&h->sdl_cond, &h->sdl_lock); + } + pthread_mutex_unlock(&h->sdl_lock); + + texture = get_next_texture(h, 0); + if (texture == NULL) + break; + + SDL_UnlockTexture(texture); + + //clear if resized + if (sdlc->resized){ + // KDE bug prob, reset viewport change after resize from max + SDL_RenderSetViewport(sdlc->renderer, NULL); + SDL_SetRenderDrawColor(sdlc->renderer, 0, 0, 0, 255); + SDL_RenderClear(sdlc->renderer); + sdlc->resized = 0; + } + + SDL_RenderCopy(sdlc->renderer, texture, &sdlc->rect, &sdlc->win_rect); + + if 
(sdlc->showmap){ + if (sdlc->updatemap){ + SuperMBContext *smbc; + pthread_mutex_lock (&h->smb_lock); + smbc = h->smbc; + smbc->refcount++; + sdlc->updatemap=0; + pthread_mutex_unlock(&h->smb_lock); + + draw_sbmap(h, smbc, sdlc); + + release_smbc(h, smbc); + } + SDL_RenderCopy(sdlc->renderer, sdlc->sbmap_texture, &sdlc->rect, &sdlc->win_rect); + } + + SDL_RenderPresent(sdlc->renderer); + signal_texture(h, 0); + } + + free_SDL_context(h); + + pthread_exit(NULL); + return NULL; +} +#endif + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_misc.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_misc.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,52 @@ +#ifndef H264_MISC_H +#define H264_MISC_H + +#include "avcodec.h" +#include "h264_types.h" + +void start_timer(H264Context *h, int stage); +void stop_timer(H264Context *h, int stage); + +void init_sb_entry(H264Context *h, SliceBufferEntry *sbe); +void free_sb_entry(SliceBufferEntry *sb); +SliceBufferEntry *get_sb_entry(H264Context *h); +void release_sb_entry(H264Context *h, SliceBufferEntry *sb); + +DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s); +void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode); + +void draw_edges(MBRecContext *d, H264Slice *s, int line); + +int ff_init_slice(NalContext *n, H264Slice *s); +void free_picture(PictureInfo *pic); +void free_dp(DecodedPicture *pic); + +void av_start_timer(); +int copyEDtoH264Slice(H264Slice *ms, H264Slice *es); +void print_report(int frame_number, uint64_t video_size, int is_last_report, int verbose); + +int ff_alloc_picture_info(NalContext *n, H264Slice *s, PictureInfo *pic); +DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height); +OutputContext *get_output_context(H264Context *h); +void free_output_context(OutputContext *oc); + +void freeSuperMBContext(SuperMBContext *smbc); +SuperMBContext *getSuperMBContext(H264Context *h, int smb_width, int 
smb_height); +void release_smbc(H264Context *h, SuperMBContext *smbc); +SuperMBContext * acquire_smbc(H264Context *h ); + +#if HAVE_LIBSDL2 +void signal_sdl_exit(H264Context *h); +void *sdl_thread(void *arg); +SDLContext *get_SDL_context(H264Context *h); +void free_SDL_context(SDLContext *sdlc); +#endif + +/** +* gets the chroma qp. +*/ +static inline int get_chroma_qp(H264Slice *s, int t, int qscale){ + return s->pps.chroma_qp_table[t][qscale]; +} + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_nal.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_nal.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,628 @@ +#include "h264_types.h" +#include "h264_data.h" + +#include "golomb.h" +#include "h264_sei.h" +#include "h264_refs.h" +#include "h264_ps.h" +#include "h264_pred_mode.h" +#include "h264_misc.h" + +static int ff_h264_decode_rbsp_trailing(const uint8_t *src){ + int v= *src; + int r; + + for(r=1; r<9; r++){ + if(v&1) return r; + v>>=1; + } + return 0; +} + +static int pred_weight_table(H264Slice *s, GetBitContext *gb){ + int luma_def, chroma_def; + + s->use_weight= 0; + s->use_weight_chroma= 0; + s->luma_log2_weight_denom= get_ue_golomb(gb); + s->chroma_log2_weight_denom= get_ue_golomb(gb); + luma_def = 1<luma_log2_weight_denom; + chroma_def = 1<chroma_log2_weight_denom; + + for(int list=0; list<2; list++){ + for(int i=0; iref_count[list]; i++){ + int luma_weight_flag, chroma_weight_flag; + + luma_weight_flag= get_bits1(gb); + if(luma_weight_flag){ + s->luma_weight[i][list][0]= get_se_golomb(gb); + s->luma_weight[i][list][1]= get_se_golomb(gb); + if( s->luma_weight[i][list][0] != luma_def + || s->luma_weight[i][list][1] != 0) { + s->use_weight= 1; + } + }else{ + s->luma_weight[i][list][0]= luma_def; + s->luma_weight[i][list][1]= 0; + } + + chroma_weight_flag= get_bits1(gb); + if(chroma_weight_flag){ + int j; + for(j=0; j<2; j++){ + s->chroma_weight[i][list][j][0]= get_se_golomb(gb); + s->chroma_weight[i][list][j][1]= get_se_golomb(gb); 
+ if( s->chroma_weight[i][list][j][0] != chroma_def + || s->chroma_weight[i][list][j][1] != 0) { + s->use_weight_chroma= 1; + } + } + }else{ + int j; + for(j=0; j<2; j++){ + s->chroma_weight[i][list][j][0]= chroma_def; + s->chroma_weight[i][list][j][1]= 0; + } + } + } + if(s->slice_type_nos != FF_B_TYPE) break; + } + s->use_weight= s->use_weight || s->use_weight_chroma; + return 0; +} + +/** +* Initialize implicit_weight table. +*/ +static void implicit_weight_table(H264Slice *s){ + int ref0, ref1, cur_poc, ref_start, ref_count0, ref_count1; + + cur_poc = s->poc; + if( s->ref_count[0] == 1 && s->ref_count[1] == 1 && s->ref_list[0][0]->poc + s->ref_list[1][0]->poc == 2*cur_poc){ + s->use_weight= 0; + s->use_weight_chroma= 0; + return; + } + ref_start= 0; + ref_count0= s->ref_count[0]; + ref_count1= s->ref_count[1]; + + s->use_weight= 2; + s->use_weight_chroma= 2; + s->luma_log2_weight_denom= 5; + s->chroma_log2_weight_denom= 5; + + for(ref0=ref_start; ref0 < ref_count0; ref0++){ + int poc0 = s->ref_list[0][ref0]->poc; + for(ref1=ref_start; ref1 < ref_count1; ref1++){ + int poc1 = s->ref_list[1][ref1]->poc; + int td = av_clip(poc1 - poc0, -128, 127); + int w= 32; + if(td){ + int tb = av_clip(cur_poc - poc0, -128, 127); + int tx = (16384 + (FFABS(td) >> 1)) / td; + int dist_scale_factor = (tb*tx + 32) >> 8; + if(dist_scale_factor >= -64 && dist_scale_factor <= 128) + w = 64 - dist_scale_factor; + } + s->implicit_weight[ref0][ref1][0]= + s->implicit_weight[ref0][ref1][1]= w; + } + } +} + +/** +* instantaneous decoder refresh. 
+*/ +static void idr(NalContext *n, H264Slice *s){ + ff_h264_remove_all_refs(n, s); + n->prev_frame_num= 0; + n->prev_frame_num_offset= 0; + n->poc_offset += (n->prev_poc_msb<<16) + n->prev_poc_lsb; + n->prev_poc_msb= + n->prev_poc_lsb= 0; +} + +static int init_poc(NalContext *n, H264Slice *s, GetBitContext *gb){ + const int max_frame_num= 1<sps.log2_max_frame_num; + int frame_poc; + + if(n->sps.poc_type==0){ + n->poc_lsb= get_bits(gb, n->sps.log2_max_poc_lsb); + } + + if(n->sps.poc_type==1 && !n->sps.delta_pic_order_always_zero_flag){ + n->delta_poc= get_se_golomb(gb); + } + + n->frame_num_offset= n->prev_frame_num_offset; + if(n->frame_num < n->prev_frame_num) + n->frame_num_offset += max_frame_num; + + if(n->sps.poc_type==0){ + const int max_poc_lsb= 1<sps.log2_max_poc_lsb; + + if(n->poc_lsb < n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb >= max_poc_lsb/2) + n->poc_msb = n->prev_poc_msb + max_poc_lsb; + else if(n->poc_lsb > n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb < -max_poc_lsb/2) + n->poc_msb = n->prev_poc_msb - max_poc_lsb; + else + n->poc_msb = n->prev_poc_msb; + + frame_poc = n->poc_msb + n->poc_lsb; + }else if(n->sps.poc_type==1){ + int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc; + int i; + + if(n->sps.poc_cycle_length != 0) + abs_frame_num = n->frame_num_offset + n->frame_num; + else + abs_frame_num = 0; + + if(s->nal_ref_idc==0 && abs_frame_num > 0) + abs_frame_num--; + + expected_delta_per_poc_cycle = 0; + for(i=0; i < n->sps.poc_cycle_length; i++) + expected_delta_per_poc_cycle += n->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse + + if(abs_frame_num > 0){ + int poc_cycle_cnt = (abs_frame_num - 1) / n->sps.poc_cycle_length; + int frame_num_in_poc_cycle = (abs_frame_num - 1) % n->sps.poc_cycle_length; + + expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle; + for(i = 0; i <= frame_num_in_poc_cycle; i++) + expectedpoc = expectedpoc + n->sps.offset_for_ref_frame[ i ]; + } else + expectedpoc = 0; + 
if(s->nal_ref_idc == 0) + expectedpoc = expectedpoc + n->sps.offset_for_non_ref_pic; + frame_poc = expectedpoc + n->delta_poc; + }else{ + int poc= 2*(n->frame_num_offset + n->frame_num); + if(!s->nal_ref_idc) + poc--; + frame_poc= poc; + } + s->current_picture_info->poc= s->poc = frame_poc + n->poc_offset; + s->coded_pic_num = n->coded_pic_num++; + + return 0; +} + +static void ref2frame(NalContext *n, H264Slice *s){ + for(int j=0; jlist_count; j++){ + int *ref2frm= s->ref2frm[j]; + + ref2frm[0]= + ref2frm[1]= -1; + + for(int i=0; iref_count[j]; i++){ + ref2frm[i+2]= 15; + if(s->ref_list[j][i]->cpn >=0){ + int k; + for(k=0; kshort_ref_count; k++){ + if(n->short_ref[k]->cpn == s->ref_list[j][i]->cpn){ + ref2frm[i+2]= k; + break; + } + } + } + } + } +} + +/** +* decodes a slice header. +* This will also call MPV_common_init() and frame_start() as needed. +* +* @param h h264context +* @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding) +* +* @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded +*/ +static int decode_slice_header(NalContext *n, H264Slice *s, GetBitContext *gb){ + unsigned int first_mb_in_slice; + unsigned int pps_id; + int num_ref_idx_active_override_flag; + unsigned int slice_type, tmp; + + first_mb_in_slice= get_ue_golomb(gb); + (void) first_mb_in_slice; + + slice_type= get_ue_golomb_31(gb); + if(slice_type > 9){ + av_log(AV_LOG_ERROR, "slice type too large (%d)\n", s->slice_type); + return -1; + } + if(slice_type > 4) + slice_type -= 5; + + slice_type= golomb_to_pict_type[ slice_type ]; + + s->slice_type= slice_type; + s->slice_type_nos= slice_type & 3; + s->current_picture_info->slice_type_nos = s->slice_type_nos; + s->current_picture_info->reference= s->nal_ref_idc? 
2:0; + s->key_frame = s->slice_type == FF_I_TYPE; + + pps_id= get_ue_golomb(gb); + + if(pps_id>=MAX_PPS_COUNT){ + av_log(AV_LOG_ERROR, "pps_id out of range\n"); + return -1; + } + if(!n->pps_buffers[pps_id]) { + av_log(AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id); + return -1; + } + s->pps= *n->pps_buffers[pps_id]; + + if(!n->sps_buffers[s->pps.sps_id]) { + av_log(AV_LOG_ERROR, "non-existing SPS %u referenced\n", s->pps.sps_id); + return -1; + } + n->sps = *n->sps_buffers[s->pps.sps_id]; + + n->mb_width= n->sps.mb_width; + n->mb_height= n->sps.mb_height; + + int chroma444 = (n->sps.chroma_format_idc == 3); + n->width = 16*n->mb_width - (2>>chroma444)*FFMIN(n->sps.crop_right, (8<sps.frame_mbs_only_flag) + n->height= 16*n->mb_height - (2>>chroma444)*FFMIN(n->sps.crop_bottom, (8<height= 16*n->mb_height - (4>>chroma444)*FFMIN(n->sps.crop_bottom, (8<direct_8x8_inference_flag = n->sps.direct_8x8_inference_flag; + s->transform_bypass = n->sps.transform_bypass; + + n->frame_num= get_bits(gb, n->sps.log2_max_frame_num); + if(n->frame_num != n->prev_frame_num && n->frame_num != (n->prev_frame_num+1)%(1<sps.log2_max_frame_num)){ + av_log(AV_LOG_ERROR, "unexpected frame_num \n"); + } + + s->current_picture_info->frame_num= n->frame_num; //FIXME frame_num cleanup + n->max_pic_num= 1<< n->sps.log2_max_frame_num; + + if(s->nal_unit_type == NAL_IDR_SLICE){ + get_ue_golomb(gb); /* idr_pic_id */ + } + + init_poc(n, s, gb); + + if(s->pps.redundant_pic_cnt_present){ + n->redundant_pic_count= get_ue_golomb(gb); + } + + //set defaults, might be overridden a few lines later + s->ref_count[0]= s->pps.ref_count[0]; + s->ref_count[1]= s->pps.ref_count[1]; + + if(s->slice_type_nos != FF_I_TYPE){ + if(s->slice_type_nos == FF_B_TYPE){ + s->direct_spatial_mv_pred= get_bits1(gb); + } + num_ref_idx_active_override_flag= get_bits1(gb); + + if(num_ref_idx_active_override_flag){ + s->ref_count[0]= get_ue_golomb(gb) + 1; + if(s->slice_type_nos==FF_B_TYPE) + s->ref_count[1]= 
get_ue_golomb(gb) + 1; + + if(s->ref_count[0]-1 > 32-1 || s->ref_count[1]-1 > 32-1){ + av_log(AV_LOG_ERROR, "reference overflow\n"); + s->ref_count[0]= s->ref_count[1]= 1; + return -1; + } + } + if(s->slice_type_nos == FF_B_TYPE) + s->list_count= 2; + else + s->list_count= 1; + }else + s->list_count= 0; + + + if(s->slice_type_nos!=FF_I_TYPE){ + ff_h264_fill_default_ref_list(n, s); + ff_h264_decode_ref_pic_list_reordering(n, s, gb); + ref2frame(n, s); + + for(int i=0; i<2; i++){ + for(int j=0; jref_count[i]; j++){ + if (s->ref_list[i][j]==NULL || s->ref_list[i][j]->reference < 2) // Don't know why sometimes the ref_count=1 while there are no references + s->ref_list_cpn[i][j] = -1; + else + s->ref_list_cpn[i][j] = s->ref_list[i][j]->cpn; + } + } + } + + if( (s->pps.weighted_pred && s->slice_type_nos == FF_P_TYPE ) + || (s->pps.weighted_bipred_idc==1 && s->slice_type_nos== FF_B_TYPE ) ){ + pred_weight_table(s, gb); + } + else if(s->pps.weighted_bipred_idc==2 && s->slice_type_nos== FF_B_TYPE){ + implicit_weight_table( s); + }else { + s->use_weight = 0; + } + + if(s->nal_ref_idc){ + ff_h264_ref_pic_marking(n, s, gb); + n->prev_poc_msb= n->poc_msb; + n->prev_poc_lsb= n->poc_lsb; + } + + n->prev_frame_num_offset= n->frame_num_offset; + n->prev_frame_num= n->frame_num; + + if(s->slice_type_nos != FF_B_TYPE){ + s->ip_id= n->ip_id++; + } + + if(s->slice_type_nos==FF_B_TYPE && !s->direct_spatial_mv_pred){ + ff_h264_direct_dist_scale_factor(s); + } + ff_h264_direct_ref_list_init(s); + + + if( s->slice_type_nos != FF_I_TYPE && s->pps.cabac ){ + tmp = get_ue_golomb_31(gb); + if(tmp > 2){ + av_log(AV_LOG_ERROR, "cabac_init_idc overflow\n"); + return -1; + } + s->cabac_init_idc= tmp; + } + + tmp = s->pps.init_qp + get_se_golomb(gb); + if(tmp>51){ + av_log(AV_LOG_ERROR, "QP %u out of range\n", tmp); + return -1; + } + s->qscale= tmp; + + //FIXME qscale / qp ... 
stuff + if(s->slice_type == FF_SP_TYPE){ + get_bits1(gb); /* sp_for_switch_flag */ + } + if(s->slice_type==FF_SP_TYPE || s->slice_type == FF_SI_TYPE){ + get_se_golomb(gb); /* slice_qs_delta */ + } + + s->slice_alpha_c0_offset = 52; + s->slice_beta_offset = 52; + if( s->pps.deblocking_filter_parameters_present ) { + tmp= get_ue_golomb_31(gb); + if(tmp > 1){ + av_log(AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp); + return -1; + } + + if(tmp < 2) + tmp^= 1; // 1<->0 + + if( tmp ) { + s->slice_alpha_c0_offset += get_se_golomb(gb) << 1; + s->slice_beta_offset += get_se_golomb(gb) << 1; + if( (unsigned) s->slice_alpha_c0_offset > 104U + ||(unsigned) s->slice_beta_offset > 104U){ + av_log(AV_LOG_ERROR, "deblocking filter parameters %d %d out of range\n", s->slice_alpha_c0_offset, s->slice_beta_offset); + return -1; + } + } + } + + s->qp_thresh= 15 + 52 - FFMIN(s->slice_alpha_c0_offset, s->slice_beta_offset) - FFMAX3(0, s->pps.chroma_qp_index_offset[0], s->pps.chroma_qp_index_offset[1]); + + return 0; +} + +PictureInfo *get_pib_entry(NalContext *nc, int coded_pic_num){ + PictureInfo *pic = NULL; + + for(int i=0; ipicture[i].reference==0){ + pic= &nc->picture[i]; + break; + } + } + pic->cpn = coded_pic_num; + + return pic; +} + +int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb1){ + GetBitContext *gb = gb1; + uint8_t *buf = gb1->raw; + int buf_size = gb1->buf_size; + int next_avc = buf_size; + int buf_index=0; + uint8_t *dst=NULL; +// gb->raw = gb1->raw; +// gb->rbsp = NULL; + s->release_cnt=0; + ff_h264_reset_sei(n); + + s->current_picture_info = get_pib_entry(n, n->coded_pic_num); + + for(;;){ + int consumed; + int dst_length; + int bit_length; + const uint8_t *ptr; + int err; + + if (buf_index >= buf_size){ + break; + } else { + // start code prefix search + for(; buf_index + 3 < buf_size; buf_index++){ + // This should always succeed in the first iteration. 
+ if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1) + break; + } + if(buf_index+3 >= buf_size) break; + buf_index+=3; + } + + { + int length = next_avc - buf_index; + int i, si, di; + uint8_t *src= buf+buf_index; + // src[0]&0x80; //forbidden bit + s->nal_ref_idc= src[0]>>5; + s->nal_unit_type= src[0]&0x1F; + + src++; length--; + + for(i=0; i+10 && src[i-1]==0) i--; + if(i+2=length-1){ //no escaped 0 + dst_length= length; + consumed= length+1; //+1 for the header + ptr=src; + }else{ + av_fast_malloc(&gb->rbsp, &gb->rbsp_size, length+FF_INPUT_BUFFER_PADDING_SIZE); + dst = gb->rbsp; +// if (dst){ +// av_free(dst); +// } +// dst = av_malloc(length+FF_INPUT_BUFFER_PADDING_SIZE); + + if (dst == NULL){ + return -1; + } + + //printf("decoding esc\n"); + memcpy(dst, src, i); + si=di=i; + while(si+23){ + dst[di++]= src[si++]; + dst[di++]= src[si++]; + }else if(src[si]==0 && src[si+1]==0){ + if(src[si+2]==3){ //escape + dst[di++]= 0; + dst[di++]= 0; + si+=3; + continue; + }else //next start code + goto nsc; + } + + dst[di++]= src[si++]; + } + while(sirbsp=ptr; + } + } + if (ptr==NULL || dst_length < 0){ + return -1; + } + + //error prevention, should not touch dst_length + while(ptr[dst_length - 1] == 0 && dst_length > 0) + dst_length--; + + bit_length= !dst_length ? 
0 : (8*dst_length - ff_h264_decode_rbsp_trailing(ptr + dst_length - 1)); + buf_index += consumed; + + err = 0; + init_get_bits(gb, ptr, bit_length); + switch(s->nal_unit_type){ + case NAL_IDR_SLICE: + idr(n, s); //FIXME ensure we don't lose some frames if there is reordering; deliberately falls through to NAL_SLICE + case NAL_SLICE: + if((err = decode_slice_header(n, s, gb))) + break; + s->key_frame |= (s->nal_unit_type == NAL_IDR_SLICE) || (n->sei_recovery_frame_cnt >= 0); + break; + case NAL_DPA: + case NAL_DPB: + case NAL_DPC: + av_log(AV_LOG_ERROR,"no slices/data partitioning support\n"); + break; + case NAL_SEI: + ff_h264_decode_sei(n, gb); + break; + case NAL_SPS: + ff_h264_decode_seq_parameter_set(n, gb); + break; + case NAL_PPS: + ff_h264_decode_picture_parameter_set(n, gb, bit_length); + break; + case NAL_AUD: + case NAL_END_SEQUENCE: + case NAL_END_STREAM: + case NAL_FILLER_DATA: + case NAL_SPS_EXT: + case NAL_AUXILIARY_SLICE: + break; + default: + av_log(AV_LOG_ERROR, "Unknown NAL code: %d (%d bits)\n", s->nal_unit_type, bit_length); + } + if (err < 0) + av_log(AV_LOG_ERROR, "decode_slice_header error\n"); + + } + + return buf_index; +} + +NalContext *get_nal_context(int width, int height){ /* allocates a zeroed NalContext sized for width x height; returns NULL if the allocation fails */ + const int mb_height = (height + 15) / 16; + const int mb_width = (width + 15) / 16; + const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16 + + NalContext *nc = av_mallocz(sizeof(NalContext)); if (!nc) return NULL; /* allocation failed, nothing to initialise */ + nc->width = width; + nc->height = height; + nc->mb_height = mb_height; + nc->mb_width = mb_width; + nc->b4_stride = mb_width*4 + 1; + nc->mb_stride = mb_stride; + nc->outputed_poc = INT_MIN; + + for(int i=0; i<16; i++){ + nc->picture[i].cpn =-1; + } + + return nc; +} + +void free_nal_context(NalContext *nc){ if (!nc) return; /* tolerate a context that was never allocated */ + for(int i = 0; i < MAX_SPS_COUNT; i++){ + if (nc->sps_buffers[i]){ + av_free( nc->sps_buffers[i]); + } + } + for(int i = 0; i < MAX_PPS_COUNT; i++){ + if (nc->pps_buffers[i]){ + av_free( nc->pps_buffers[i]); + } + } + av_free(nc); +} diff -r 11d15c47beaf -r 897f711a7157
libavcodec/h264_nal.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_nal.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,11 @@ +#ifndef H264_NAL_H +#define H264_NAL_H + +#include "avcodec.h" +#include "h264_types.h" + +int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb); +NalContext *get_nal_context(int width, int height); +void free_nal_context(NalContext *nc); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_numa.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_numa.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,33 @@ + +#include +#include "h264.h" +#include "malloc.h" + +/* +* Pthread version with affinity lock for ED and MBD threads. Deprecated +*/ +int av_transcode_pthread_affinity(int ifile, int ofile, int frame_width, int frame_height, h264_options *opts) { + H264Context *h; + pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr; + + h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height, opts); + timer_start = av_gettime(); + + pthread_create(&read_thr, NULL, read_thread, h); + pthread_create(&parsenal_thr, NULL, parsenal_thread, h); + pthread_create(&entropy_thr, NULL, entropy_IPB_thread, h); + pthread_create(&mbdec_thr, NULL, mbdec_thread, h); + pthread_create(&write_thr, NULL, write_thread, h); + + + pthread_join(read_thr, NULL); + pthread_join(parsenal_thr, NULL); + pthread_join(entropy_thr, NULL); + pthread_join(mbdec_thr, NULL); + pthread_join(write_thr, NULL); + + /* finished ! */ + ff_h264_decode_end(h); + + return 0; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_ompss.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_ompss.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,401 @@ +/* +* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder +* Copyright (c) 2003 Michael Niedermayer +* +* This file is part of FFmpeg. 
+* +* FFmpeg is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License as published by the Free Software Foundation; either +* version 2.1 of the License, or (at your option) any later version. +* +* FFmpeg is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* Lesser General Public License for more details. +* +* You should have received a copy of the GNU Lesser General Public +* License along with FFmpeg; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include "h264_types.h" +#include "h264_parser.h" +#include "h264_nal.h" +#include "h264_entropy.h" +#include "h264_rec.h" +#include "h264_pred_mode.h" +#include "h264_misc.h" +// #undef NDEBUG +#include + +#pragma omp task inout(*pc, *nc) output(*sbe) +static void parse_task(H264Context *h, ParserContext *pc, NalContext *nc, SliceBufferEntry *sbe){ + H264Slice *s; + + if (!sbe->initialized){ + init_sb_entry(h, sbe); + sbe->lines_total=h->mb_height; + } + + av_read_frame_internal(pc, &sbe->gb); + s = &sbe->slice; + + decode_nal_units(nc, s, &sbe->gb); +} + +#pragma omp task inout(*ec) inout(*sbe) +static void decode_slice_entropy_task(H264Context *h, EntropyContext *ec, SliceBufferEntry *sbe){ + int i,j; + H264Slice *s = &sbe->slice; + GetBitContext *gb = &sbe->gb; + H264Mb *mbs = sbe->mbs; +// GetBitContext *gb = s->gb; + CABACContext *c = &ec->c; + + if( !s->pps.cabac ){ + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); + return ; + } + + init_dequant_tables(s, ec); + ec->curr_qscale = s->qscale; + ec->last_qscale_diff = 0; + ec->chroma_qp[0] = get_chroma_qp((H264Slice *) s, 0, s->qscale); + ec->chroma_qp[1] = get_chroma_qp((H264Slice *) s, 1, s->qscale); + + /* realign */ + align_get_bits( gb ); + /* init cabac */ + 
ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8); + + ff_h264_init_cabac_states(ec, s, c); + + for(j=0; jmb_height; j++){ + init_entropy_buf(ec, s, j); + for(i=0; imb_width; i++){ + int eos,ret; + H264Mb *m = &mbs[i + j*ec->mb_width]; + m->mb_x=i; + m->mb_y=j; + ec->m = m; + + ret = ff_h264_decode_mb_cabac(ec, s, c); + eos = get_cabac_terminate( c); + (void) eos; + if( ret < 0 || c->bytestream > c->bytestream_end + 2) { + av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream); + return ; + } + } + } +} + +static void decode_super_mb_block(MBRecContext *d, H264Slice *s, SuperMBContext *smbc, H264Mb *mbs, int smb_x, int smb_y){ + MBRecState mrs; +// memset(&mrs, 0, sizeof(MBRecState)); + + for (int k=0, i= smb_y; i< smb_y + smbc->smb_height; i++, k++){ + init_mbrec_context(d, &mrs, s, i); + for (int j= smb_x -k ; j< smb_x - k + smbc->smb_width; j++){ + if (i< d->mb_height && j >= 0 && j < d->mb_width){ + h264_decode_mb_internal (d, &mrs, s, &mbs[i*d->mb_width+j]); + } + } + } +} + +#pragma omp task input(*d, *sbe, *ml, *mur) inout(*m) +static void decode_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *ml, +SuperMBTask *mur, SuperMBTask *m){ + H264Slice *s = &sbe->slice; + H264Mb *mbs = sbe->mbs; + decode_super_mb_block(d, s, smbc, mbs, m->smb_x, m->smb_y); +} + +#pragma omp task input(*d, *sbe) inout(*sm) +static void draw_edges_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *sm, int line){ + H264Slice *s = &sbe->slice; + for (int i=line*smbc->smb_height; i< (line+1)*smbc->smb_height && i< d->mb_height; i++) + draw_edges(d, s, i); +} + +static void decode_mb_in_slice(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){ + int i,j; + + SuperMBContext *smbc = acquire_smbc(h); + int smb_height =smbc->nsmb_height, smb_width= smbc->nsmb_width; + SuperMBTask *smbs = smbc->smbs[0]; + + 
SuperMBTask *sm=NULL, *sml, *smur; + for(j=0; j< smb_height; j++){ + for(i=0; i< smb_width; i++){ + sm = smbs + j*smb_width + i; + sml = sm - ((i > 0) ? 1: 0); + smur = sm + (((i < smb_width-1) && (j >0)) ? -smb_width+1: 0); + decode_super_mb_task(d, sbe, smbc, sml, smur, sm); + } + draw_edges_task(d, sbe, smbc, sm, j); + } + #pragma omp taskwait on(*sm) + + release_smbc(h, smbc); +} + +#pragma omp task inout(*d) inout(*sbe) +static void decode_slice_mb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){ + H264Slice *s = &sbe->slice; + + for (int i=0; i<2; i++){ + for(int j=0; j< s->ref_count[i]; j++){ + if (s->ref_list_cpn[i][j] ==-1) + continue; + int k; + for (k=0; k< h->max_dpb_cnt; k++){ + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ + s->dp_ref_list[i][j] = &h->dpb[k]; + break; + } + } + } + } + + #pragma omp critical (dpb) + get_dpb_entry(h, s); + + if (!h->no_mbd){ + decode_mb_in_slice (h, d, sbe); + } + + for (int i=0; irelease_cnt; i++){ + for(int j=0; jmax_dpb_cnt; j++){ + if(h->dpb[j].cpn== s->release_ref_cpn[i]){ + #pragma omp critical (dpb) + release_dpb_entry(h, &h->dpb[j], 2); + break; + } + } + } + s->release_cnt=0; +} + +// for static 3d wave +/*-------------------------------------------------------------------------------*/ +#pragma omp task input(*d, *sbe, *ml, *mur, *mprev) inout(*m) +static void decode_3dwave_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *ml, +SuperMBTask *mur, SuperMBTask *mprev, SuperMBTask *m){ + H264Slice *s = &sbe->slice; + H264Mb *mbs = sbe->mbs; + + decode_super_mb_block(d, s, smbc, mbs, m->smb_x, m->smb_y); +} + +// int init_ref_count=0; +#pragma omp task inout(*d, *sbe, *init) +static void init_ref_list_and_get_dpb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe, int *init){ + H264Slice *s = &sbe->slice; + for (int i=0; i<2; i++){ + for(int j=0; j< s->ref_count[i]; j++){ + if (s->ref_list_cpn[i][j] ==-1) + continue; + int k; + 
for (k=0; kmax_dpb_cnt; k++){ + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ + s->dp_ref_list[i][j] = &h->dpb[k]; + break; + } + } + } + } + + #pragma omp critical (dpb) + get_dpb_entry(h, s); + +} + +static SuperMBTask* add_decode_slice_3dwave_tasks(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc){ + int i,j; + + int smb_3d_height =smbc->nsmb_3dheight; + int smb_height =smbc->nsmb_height, smb_width= smbc->nsmb_width; + int smb_diff_prev = smb_height - smb_3d_height; + SuperMBTask *sm=NULL, *sml, *smur, *smprev; + + SuperMBTask *smbs = smbc->smbs[smbc->index++]; smbc->index%=2; + SuperMBTask *smbs_prev = smbc->smbs[smbc->index]; // index rotates -> next == prev + + for(j=0; j 0) ? 1: 0); + smur = sm + (((i < smb_width-1) && (j >0)) ? -smb_width+1: 0); + smprev = smbs_prev + (j + smb_diff_prev+1)*smb_width -1; + decode_3dwave_super_mb_task(d, sbe, smbc, sml, smur, smprev, sm); + } + draw_edges_task(d, sbe, smbc, sm, j); + } + + for(; j< smb_height; j++){ + for(i=0; i< smb_width; i++){ + sm = smbs + j*smb_width + i; + sml = sm - ((i > 0) ? 1: 0); + smur = sm + (((i < smb_width-1) && (j >0)) ? 
-smb_width+1: 0); + decode_super_mb_task(d, sbe, smbc, sml, smur, sm); + } + draw_edges_task(d, sbe, smbc, sm, j); + } + return sm; +} + +#pragma omp task inout(*d, *sbe, *release) input (*lastsmb) +static void release_ref_list_task(H264Context *h, SuperMBContext *smbc, MBRecContext *d, SliceBufferEntry *sbe, SuperMBTask *lastsmb, int *release){ + H264Slice *s = &sbe->slice; + for (int i=0; irelease_cnt; i++){ + for(int j=0; jmax_dpb_cnt; j++){ + if(h->dpb[j].cpn== s->release_ref_cpn[i]){ + #pragma omp critical (dpb) + release_dpb_entry(h, &h->dpb[j], 2); + break; + } + } + } + s->release_cnt=0; + + release_smbc(h, smbc); + +} + +// static void decode_mb_static_3dwave(H264Context *h, int mb_height, int mb_width, MBRecContext *d, H264Slice *s, H264Mb *mbs, SuperMBTask *smbs, SuperMBTask *smbs_prev){ +// +// } +/*-------------------------------------------------------------------------------*/ +//end for static 3d wave + +#pragma omp task inout (*oc) input(*sbe) +static void output_task(H264Context *h, OutputContext *oc, SliceBufferEntry *sbe){ + DecodedPicture* out =output_frame(h, oc, sbe->slice.curr_pic, h->ofile, h->frame_width, h->frame_height); + if (out){ + #pragma omp critical (dpb) + release_dpb_entry(h, out, 1); + } + print_report(oc->frame_number, oc->video_size, 0, h->verbose); +} + +/* +* The following code is the main loop of the file converter +*/ +//Put VMS entry point here +int h264_decode_ompss( H264Context *h) { + const int bufs = h->pipe_bufs; + + ParserContext *pc; + NalContext *nc; + EntropyContext *ec[bufs]; + MBRecContext *rc[2]; + OutputContext *oc; + SliceBufferEntry *sbe; + SuperMBContext *smbc; + + DecodedPicture *out; + int frames=0; + +#if HAVE_LIBSDL2 + pthread_t sdl_thr; + if (h->display){ + pthread_create(&sdl_thr, NULL, sdl_thread, h); + } +#endif + sbe= av_mallocz(sizeof(SliceBufferEntry) * bufs); + + + pc = get_parse_context(h->ifile); + nc = get_nal_context(h->width, h->height); + + for(int i=0; istatic_3d && bufs < h->num_frames 
){ + int num_pre_ed =0; + for (num_pre_ed=0; num_pre_ed< bufs -1 && !pc->final_frame; num_pre_ed++){ + parse_task( h, pc, nc, &sbe[k%bufs] ); + decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]); + #pragma omp taskwait on(*pc) + k++; + } + + while(!pc->final_frame && frames++ < h->num_frames && !h->quit){ + parse_task( h, pc, nc, &sbe[k%bufs] ); + decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]); + + k++; + + init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init); + smbc = acquire_smbc(h); + SuperMBTask *lastsmb= add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc); + release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release); + + output_task (h, oc, &sbe[k%bufs]); + #pragma omp taskwait on(*pc) + } + + for (int i=0; i< num_pre_ed; i++){ + k++; + init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init); + smbc = acquire_smbc(h); + SuperMBTask *lastsmb= add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc); + release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release); + + output_task (h, oc, &sbe[k%bufs]); + } + + } else { + while(!pc->final_frame && frames++ < h->num_frames && !h->quit){ + parse_task( h, pc, nc, &sbe[k%bufs] ); + + decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]); + + decode_slice_mb_task(h, rc[0], &sbe[k%bufs]); + + output_task (h, oc, &sbe[k%bufs]); + #pragma omp taskwait on(*pc) + k++; + } + } + #pragma omp taskwait + + while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; + + print_report(oc->frame_number, oc->video_size, 1, h->verbose); + h->num_frames = oc->frame_number; + /* finished ! 
*/ + + free_parse_context(pc); + free_nal_context (nc); + free_output_context(oc); + for (int i=0; idisplay){ + signal_sdl_exit(h); + pthread_join(sdl_thr, NULL); + } +#endif + + return 0; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_parser.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_parser.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,224 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... parser + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 parser. 
+ * @author Michael Niedermayer + */ + +#include + +#include "golomb.h" +#include "libavutil/error.h" +#include "h264_types.h" + +#undef NDEBUG +#include + +#define END_NOT_FOUND (-100) + +static int ff_h264_find_frame_end(ParserContext *s, const uint8_t *buf, int buf_size) +{ + int i; + uint32_t state; + + state= s->state; + if(state>13) + state= 7; + + for(i=0; i7, 1->4, 0->5 + else if(buf[i]) state = 7; + else state>>=1; //2->1, 1->0, 0->0 + }else if(state<=5){ + int v= buf[i] & 0x1F; + if(v==6 || v==7 || v==8 || v==9){ + if(s->frame_start_found){ + i++; + goto found; + } + }else if(v==1 || v==2 || v==5){ + if(s->frame_start_found){ + state+=8; + continue; + }else + s->frame_start_found = 1; + } + state= 7; + }else{ + if(buf[i] & 0x80) + goto found; + state= 7; + } + } + s->state= state; + return END_NOT_FOUND; + +found: + s->state=7; + s->frame_start_found= 0; + return i-(state&5); +} + +static int ff_combine_frame(ParserContext *s, GetBitContext *gb, int next, uint8_t **buf, int *buf_size) +{ + int i; + /* Copy overread bytes from last frame into buffer. */ + for(i =0; s->overread_cnt>0; s->overread_cnt--, i++){ + gb->raw[s->index++]= s->overread[i]; + } + + /* EOF - END_NOT_FOUND means no next frame start is found in current partial read. 
If buf_size of the partial read is 0 we are at EOF */ + if(!*buf_size && next == END_NOT_FOUND){ + next= 0; + } + s->last_index= s->index; + + /* copy into buffer end return */ + if(next == END_NOT_FOUND){ + gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, (*buf_size) + s->index + FF_INPUT_BUFFER_PADDING_SIZE); + memcpy(&gb->raw[s->index], *buf, *buf_size); + s->index += *buf_size; + return -1; + } + + ///end found + *buf_size= s->index + next; + /* append to buffer */ + + gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, next + s->index + FF_INPUT_BUFFER_PADDING_SIZE); + memcpy(&gb->raw[s->index], *buf, next + FF_INPUT_BUFFER_PADDING_SIZE ); + s->index = 0; + + /* store overread bytes */ + for(i=0; next < 0; next++, i++){ + s->state = (s->state<<8) | gb->raw[s->last_index + next]; + s->overread[i] = gb->raw[s->last_index + next]; + s->overread_cnt++; + } + + return 0; +} + +static int h264_parse(ParserContext *s, GetBitContext *gb, + uint8_t *buf, int buf_size) +{ + int next; + + next= ff_h264_find_frame_end(s, buf, buf_size); + + if (ff_combine_frame(s, gb, next, &buf, &buf_size) < 0) { + gb->buf_size = 0; + return buf_size; + } + + if(next<0 && next != END_NOT_FOUND){ + assert(s->last_index + next >= 0 ); + ff_h264_find_frame_end(s, &gb->raw[s->last_index + next], -next); //update state + } + + gb->buf_size = buf_size; + return next; +} + +static int ff_raw_read_partial_packet(ParserContext *pc) +{ + int len= -1; + + if (!pc->eof_reached){ + len = read( pc->ifile, pc->data, pc->buffer_size); +// printf("read task %d\t%d\n", pc->ifile, len); fflush(NULL); + if (len < pc->buffer_size) { + pc->eof_reached = 1; + } + } + + return len; +} + +void av_read_frame_internal(ParserContext *pc, GetBitContext *gb){ + int len; + uint8_t dummy_buf[FF_INPUT_BUFFER_PADDING_SIZE]={0}; + av_fast_malloc(&gb->raw, &gb->alloc_size, 2048+FF_INPUT_BUFFER_PADDING_SIZE); + + //Parsing is performed before read, since there are ussually leftovers from parsing the previous frame. 
+ for(;;) { + if (pc->cur_len>0){ + len = h264_parse(pc, gb, pc->cur_ptr, pc->cur_len); + if (len<0) + len =0; + //* increment read pointer */ + pc->cur_ptr += len; + pc->cur_len -= len; + + if (gb->buf_size) { + break; + } + } + + //check for ret and not parser->eof_reached as one "read" can contain more than 1 frame + pc->size= ff_raw_read_partial_packet(pc); + if (pc->size < 0) { + pc->final_frame =1; + /* return the last frames, if any */ + h264_parse(pc, gb, dummy_buf, 0); + break; + } + pc->cur_ptr = pc->data; + pc->cur_len = pc->size; + } + + assert(gb->raw!=NULL); + +} + +ParserContext *get_parse_context(int ifile){ + ParserContext *pc = av_mallocz(sizeof(ParserContext)); + pc->buffer_size = 2048; + pc->final_frame = 0; + pc->cur_len= 0; + pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE); + pc->size = 2048; + pc->eof_reached =0; + pc->ifile = ifile; + + return pc; +} + +void free_parse_context(ParserContext *pc){ + av_free(pc->data); + av_free(pc); +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_parser.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_parser.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,10 @@ +#ifndef H264_PARSER_H +#define H264_PARSER_H + +#include "h264_types.h" + +void av_read_frame_internal(ParserContext *pc, GetBitContext *gb); +ParserContext *get_parse_context(int ifile); +void free_parse_context(ParserContext *pc); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pred.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_pred.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,945 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 prediction functions. + * @author Michael Niedermayer + */ + +#include "avcodec.h" +#include "h264_pred.h" +//#include "dsputil.h" + +static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){ + (void) topright; + const uint32_t a= ((uint32_t*)(src-stride))[0]; + ((uint32_t*)(src+0*stride))[0]= a; + ((uint32_t*)(src+1*stride))[0]= a; + ((uint32_t*)(src+2*stride))[0]= a; + ((uint32_t*)(src+3*stride))[0]= a; +} + +static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){ + (void) topright; + ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101; + ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101; + ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101; + ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101; +} + +static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){ + (void) topright; + const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; + + ((uint32_t*)(src+0*stride))[0]= + ((uint32_t*)(src+1*stride))[0]= + ((uint32_t*)(src+2*stride))[0]= + ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; +} + +static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){ + (void) topright; + const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; + + ((uint32_t*)(src+0*stride))[0]= + 
((uint32_t*)(src+1*stride))[0]= + ((uint32_t*)(src+2*stride))[0]= + ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; +} + +static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){ + (void) topright; + const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; + + ((uint32_t*)(src+0*stride))[0]= + ((uint32_t*)(src+1*stride))[0]= + ((uint32_t*)(src+2*stride))[0]= + ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; +} + +static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){ + (void) topright; + ((uint32_t*)(src+0*stride))[0]= + ((uint32_t*)(src+1*stride))[0]= + ((uint32_t*)(src+2*stride))[0]= + ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U; +} + + +#define LOAD_TOP_RIGHT_EDGE\ + const int av_unused t4= topright[0];\ + const int av_unused t5= topright[1];\ + const int av_unused t6= topright[2];\ + const int av_unused t7= topright[3];\ + +#define LOAD_DOWN_LEFT_EDGE\ + const int av_unused l4= src[-1+4*stride];\ + const int av_unused l5= src[-1+5*stride];\ + const int av_unused l6= src[-1+6*stride];\ + const int av_unused l7= src[-1+7*stride];\ + +#define LOAD_LEFT_EDGE\ + const int av_unused l0= src[-1+0*stride];\ + const int av_unused l1= src[-1+1*stride];\ + const int av_unused l2= src[-1+2*stride];\ + const int av_unused l3= src[-1+3*stride];\ + +#define LOAD_TOP_EDGE\ + const int av_unused t0= src[ 0-1*stride];\ + const int av_unused t1= src[ 1-1*stride];\ + const int av_unused t2= src[ 2-1*stride];\ + const int av_unused t3= src[ 3-1*stride];\ + +static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){ + (void) topright; + const int lt= src[-1-1*stride]; + LOAD_TOP_EDGE + LOAD_LEFT_EDGE + + src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2; + src[0+2*stride]= + src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2; + src[0+1*stride]= + src[1+2*stride]= + src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2; + src[0+0*stride]= + src[1+1*stride]= + src[2+2*stride]= + src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2; + 
src[1+0*stride]= + src[2+1*stride]= + src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2; + src[2+0*stride]= + src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2; +} + +static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){ + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE +// LOAD_LEFT_EDGE + + src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2; + src[1+0*stride]= + src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2; + src[2+0*stride]= + src[1+1*stride]= + src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2; + src[3+0*stride]= + src[2+1*stride]= + src[1+2*stride]= + src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2; + src[3+1*stride]= + src[2+2*stride]= + src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2; + src[3+2*stride]= + src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2; + src[3+3*stride]=(t6 + 3*t7 + 2)>>2; +} + +static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){ + (void) topright; + const int lt= src[-1-1*stride]; + LOAD_TOP_EDGE + LOAD_LEFT_EDGE + + src[0+0*stride]= + src[1+2*stride]=(lt + t0 + 1)>>1; + src[1+0*stride]= + src[2+2*stride]=(t0 + t1 + 1)>>1; + src[2+0*stride]= + src[3+2*stride]=(t1 + t2 + 1)>>1; + src[3+0*stride]=(t2 + t3 + 1)>>1; + src[0+1*stride]= + src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2; + src[1+1*stride]= + src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2; + src[2+1*stride]= + src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2; + src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2; + src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; +} + +static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){ + LOAD_TOP_EDGE + LOAD_TOP_RIGHT_EDGE + + src[0+0*stride]=(t0 + t1 + 1)>>1; + src[1+0*stride]= + src[0+2*stride]=(t1 + t2 + 1)>>1; + src[2+0*stride]= + src[1+2*stride]=(t2 + t3 + 1)>>1; + src[3+0*stride]= + src[2+2*stride]=(t3 + t4+ 1)>>1; + src[3+2*stride]=(t4 + t5+ 1)>>1; + src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[1+1*stride]= + src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; + src[2+1*stride]= + src[1+3*stride]=(t2 + 
2*t3 + t4 + 2)>>2; + src[3+1*stride]= + src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; + src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; +} + +static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){ + (void) topright; + LOAD_LEFT_EDGE + + src[0+0*stride]=(l0 + l1 + 1)>>1; + src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2; + src[2+0*stride]= + src[0+1*stride]=(l1 + l2 + 1)>>1; + src[3+0*stride]= + src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2; + src[2+1*stride]= + src[0+2*stride]=(l2 + l3 + 1)>>1; + src[3+1*stride]= + src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; + src[3+2*stride]= + src[1+3*stride]= + src[0+3*stride]= + src[2+2*stride]= + src[2+3*stride]= + src[3+3*stride]=l3; +} + + +static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){ + (void) topright; + const int lt= src[-1-1*stride]; + LOAD_TOP_EDGE + LOAD_LEFT_EDGE + + src[0+0*stride]= + src[2+1*stride]=(lt + l0 + 1)>>1; + src[1+0*stride]= + src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2; + src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2; + src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2; + src[0+1*stride]= + src[2+2*stride]=(l0 + l1 + 1)>>1; + src[1+1*stride]= + src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2; + src[0+2*stride]= + src[2+3*stride]=(l1 + l2+ 1)>>1; + src[1+2*stride]= + src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; + src[0+3*stride]=(l2 + l3 + 1)>>1; + src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2; +} + +static void pred16x16_vertical_c(uint8_t *src, int stride){ + int i; + const uint32_t a= ((uint32_t*)(src-stride))[0]; + const uint32_t b= ((uint32_t*)(src-stride))[1]; + const uint32_t c= ((uint32_t*)(src-stride))[2]; + const uint32_t d= ((uint32_t*)(src-stride))[3]; + + for(i=0; i<16; i++){ + ((uint32_t*)(src+i*stride))[0]= a; + ((uint32_t*)(src+i*stride))[1]= b; + ((uint32_t*)(src+i*stride))[2]= c; + ((uint32_t*)(src+i*stride))[3]= d; + } +} + +static void pred16x16_horizontal_c(uint8_t *src, int stride){ + int i; + + for(i=0; i<16; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= + 
((uint32_t*)(src+i*stride))[2]= + ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101; + } +} + +static void pred16x16_dc_c(uint8_t *src, int stride){ + int i, dc=0; + + for(i=0;i<16; i++){ + dc+= src[-1+i*stride]; + } + + for(i=0;i<16; i++){ + dc+= src[i-stride]; + } + + dc= 0x01010101*((dc + 16)>>5); + + for(i=0; i<16; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= + ((uint32_t*)(src+i*stride))[2]= + ((uint32_t*)(src+i*stride))[3]= dc; + } +} + +static void pred16x16_left_dc_c(uint8_t *src, int stride){ + int i, dc=0; + + for(i=0;i<16; i++){ + dc+= src[-1+i*stride]; + } + + dc= 0x01010101*((dc + 8)>>4); + + for(i=0; i<16; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= + ((uint32_t*)(src+i*stride))[2]= + ((uint32_t*)(src+i*stride))[3]= dc; + } +} + +static void pred16x16_top_dc_c(uint8_t *src, int stride){ + int i, dc=0; + + for(i=0;i<16; i++){ + dc+= src[i-stride]; + } + dc= 0x01010101*((dc + 8)>>4); + + for(i=0; i<16; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= + ((uint32_t*)(src+i*stride))[2]= + ((uint32_t*)(src+i*stride))[3]= dc; + } +} + +static void pred16x16_128_dc_c(uint8_t *src, int stride){ + int i; + + for(i=0; i<16; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= + ((uint32_t*)(src+i*stride))[2]= + ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U; + } +} + +static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3, const int rv40){ + int i, j, k; + int a; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + const uint8_t * const src0 = src+7-stride; + const uint8_t *src1 = src+8*stride-1; + const uint8_t *src2 = src1-2*stride; // == src+6*stride-1; + int H = src0[1] - src0[-1]; + int V = src1[0] - src2[ 0]; + for(k=2; k<=8; ++k) { + src1 += stride; src2 -= stride; + H += k*(src0[k] - src0[-k]); + V += k*(src1[0] - src2[ 0]); + } + if(svq3){ + H = ( 5*(H/4) ) / 16; + V = ( 5*(V/4) ) / 16; + + /* required for 100% 
accuracy */ + i = H; H = V; V = i; + }else if(rv40){ + H = ( H + (H>>2) ) >> 4; + V = ( V + (V>>2) ) >> 4; + }else{ + H = ( 5*H+32 ) >> 6; + V = ( 5*V+32 ) >> 6; + } + + a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); + for(j=16; j>0; --j) { + int b = a; + a += V; + for(i=-16; i<0; i+=4) { + src[16+i] = cm[ (b ) >> 5 ]; + src[17+i] = cm[ (b+ H) >> 5 ]; + src[18+i] = cm[ (b+2*H) >> 5 ]; + src[19+i] = cm[ (b+3*H) >> 5 ]; + b += 4*H; + } + src += stride; + } +} + +static void pred16x16_plane_c(uint8_t *src, int stride){ + pred16x16_plane_compat_c(src, stride, 0, 0); +} + + +static void pred8x8_vertical_c(uint8_t *src, int stride){ + int i; + const uint32_t a= ((uint32_t*)(src-stride))[0]; + const uint32_t b= ((uint32_t*)(src-stride))[1]; + + for(i=0; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= a; + ((uint32_t*)(src+i*stride))[1]= b; + } +} + +static void pred8x8_horizontal_c(uint8_t *src, int stride){ + int i; + + for(i=0; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101; + } +} + +static void pred8x8_128_dc_c(uint8_t *src, int stride){ + int i; + + for(i=0; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U; + } +} + +static void pred8x8_left_dc_c(uint8_t *src, int stride){ + int i; + int dc0, dc2; + + dc0=dc2=0; + for(i=0;i<4; i++){ + dc0+= src[-1+i*stride]; + dc2+= src[-1+(i+4)*stride]; + } + dc0= 0x01010101*((dc0 + 2)>>2); + dc2= 0x01010101*((dc2 + 2)>>2); + + for(i=0; i<4; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= dc0; + } + for(i=4; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= + ((uint32_t*)(src+i*stride))[1]= dc2; + } +} + + +static void pred8x8_top_dc_c(uint8_t *src, int stride){ + int i; + int dc0, dc1; + + dc0=dc1=0; + for(i=0;i<4; i++){ + dc0+= src[i-stride]; + dc1+= src[4+i-stride]; + } + dc0= 0x01010101*((dc0 + 2)>>2); + dc1= 0x01010101*((dc1 + 2)>>2); + + for(i=0; i<4; i++){ + ((uint32_t*)(src+i*stride))[0]= dc0; + 
((uint32_t*)(src+i*stride))[1]= dc1; + } + for(i=4; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= dc0; + ((uint32_t*)(src+i*stride))[1]= dc1; + } +} + +static void pred8x8_dc_c(uint8_t *src, int stride){ + int i; + int dc0, dc1, dc2, dc3; + + dc0=dc1=dc2=0; + for(i=0;i<4; i++){ + dc0+= src[-1+i*stride] + src[i-stride]; + dc1+= src[4+i-stride]; + dc2+= src[-1+(i+4)*stride]; + } + dc3= 0x01010101*((dc1 + dc2 + 4)>>3); + dc0= 0x01010101*((dc0 + 4)>>3); + dc1= 0x01010101*((dc1 + 2)>>2); + dc2= 0x01010101*((dc2 + 2)>>2); + + for(i=0; i<4; i++){ + ((uint32_t*)(src+i*stride))[0]= dc0; + ((uint32_t*)(src+i*stride))[1]= dc1; + } + for(i=4; i<8; i++){ + ((uint32_t*)(src+i*stride))[0]= dc2; + ((uint32_t*)(src+i*stride))[1]= dc3; + } +} + +//the following 4 function should not be optimized! +static void pred8x8_mad_cow_dc_l0t(uint8_t *src, int stride){ + pred8x8_top_dc_c(src, stride); + pred4x4_dc_c(src, NULL, stride); +} + +static void pred8x8_mad_cow_dc_0lt(uint8_t *src, int stride){ + pred8x8_dc_c(src, stride); + pred4x4_top_dc_c(src, NULL, stride); +} + +static void pred8x8_mad_cow_dc_l00(uint8_t *src, int stride){ + pred8x8_left_dc_c(src, stride); + pred4x4_128_dc_c(src + 4*stride , NULL, stride); + pred4x4_128_dc_c(src + 4*stride + 4, NULL, stride); +} + +static void pred8x8_mad_cow_dc_0l0(uint8_t *src, int stride){ + pred8x8_left_dc_c(src, stride); + pred4x4_128_dc_c(src , NULL, stride); + pred4x4_128_dc_c(src + 4, NULL, stride); +} + +static void pred8x8_plane_c(uint8_t *src, int stride){ + int j, k; + int a; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + const uint8_t * const src0 = src+3-stride; + const uint8_t *src1 = src+4*stride-1; + const uint8_t *src2 = src1-2*stride; // == src+2*stride-1; + int H = src0[1] - src0[-1]; + int V = src1[0] - src2[ 0]; + for(k=2; k<=4; ++k) { + src1 += stride; src2 -= stride; + H += k*(src0[k] - src0[-k]); + V += k*(src1[0] - src2[ 0]); + } + H = ( 17*H+16 ) >> 5; + V = ( 17*V+16 ) >> 5; + + a = 16*(src1[0] + src2[8]+1) - 3*(V+H); + 
for(j=8; j>0; --j) { + int b = a; + a += V; + src[0] = cm[ (b ) >> 5 ]; + src[1] = cm[ (b+ H) >> 5 ]; + src[2] = cm[ (b+2*H) >> 5 ]; + src[3] = cm[ (b+3*H) >> 5 ]; + src[4] = cm[ (b+4*H) >> 5 ]; + src[5] = cm[ (b+5*H) >> 5 ]; + src[6] = cm[ (b+6*H) >> 5 ]; + src[7] = cm[ (b+7*H) >> 5 ]; + src += stride; + } +} + +#define SRC(x,y) src[(x)+(y)*stride] +#define PL(y) \ + const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; +#define PREDICT_8x8_LOAD_LEFT \ + const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \ + + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \ + PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \ + const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 + +#define PT(x) \ + const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; +#define PREDICT_8x8_LOAD_TOP \ + const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \ + + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \ + PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \ + const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \ + + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2 + +#define PTR(x) \ + t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; +#define PREDICT_8x8_LOAD_TOPRIGHT \ + int t8, t9, t10, t11, t12, t13, t14, t15; \ + if(has_topright) { \ + PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \ + t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \ + } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); + +#define PREDICT_8x8_LOAD_TOPLEFT \ + const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2 + +#define PREDICT_8x8_DC(v) \ + int y; \ + for( y = 0; y < 8; y++ ) { \ + ((uint32_t*)src)[0] = \ + ((uint32_t*)src)[1] = v; \ + src += stride; \ + } + +static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + (void) has_topleft; (void) has_topright; + PREDICT_8x8_DC(0x80808080); +} + +static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + (void) has_topleft; (void) has_topright; + PREDICT_8x8_LOAD_LEFT; + const uint32_t dc = 
((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; + PREDICT_8x8_DC(dc); +} + +static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + PREDICT_8x8_LOAD_TOP; + const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; + PREDICT_8x8_DC(dc); +} + +static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOP; + const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7 + +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101; + PREDICT_8x8_DC(dc); +} + +static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + (void) has_topleft; (void) has_topright; + PREDICT_8x8_LOAD_LEFT; +#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\ + ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y + ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); +#undef ROW +} + +static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + int y; + PREDICT_8x8_LOAD_TOP; + src[0] = t0; + src[1] = t1; + src[2] = t2; + src[3] = t3; + src[4] = t4; + src[5] = t5; + src[6] = t6; + src[7] = t7; + for( y = 1; y < 8; y++ ) + *(uint64_t*)(src+y*stride) = *(uint64_t*)src; +} + +static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_TOPRIGHT; + SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2; + SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2; + 
SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2; + SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2; + SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2; + SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2; + SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2; + SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; + SRC(7,7)= (t14 + 3*t15 + 2) >> 2; +} + +static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; + SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2; + SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2; + SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2; + SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2; + SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2; + SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2; + SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2; + SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; +} + +static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; + SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; + SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; + 
SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2; + SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2; + SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2; + SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; + SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2; + SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1; + SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1; + SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1; + SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1; + SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1; + SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1; + SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(7,0)= (t6 + t7 + 1) >> 1; +} + +static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + SRC(0,7)= (l6 + l7 + 1) >> 1; + SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; + SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; + SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2; + SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1; + SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2; + SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1; + SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2; + SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1; + SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2; + SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1; + SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2; + SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1; + SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 
2; + SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1; + SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; + SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2; + SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2; + SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2; + SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; + SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; +} + +static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_TOPRIGHT; + SRC(0,0)= (t0 + t1 + 1) >> 1; + SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; + SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1; + SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1; + SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1; + SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1; + SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1; + SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2; + SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1; + SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2; + SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1; + SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2; + SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1; + SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2; + SRC(7,6)= (t10 + t11 + 1) >> 1; + SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; +} + +static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride){ + (void) has_topleft; (void) has_topright; + PREDICT_8x8_LOAD_LEFT; + SRC(0,0)= (l0 + l1 + 1) >> 1; + SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; + 
SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; + SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2; + SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1; + SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2; + SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1; + SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2; + SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1; + SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2; + SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1; + SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2; + SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1; + SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; + SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= + SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)= + SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)= + SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; +} +#undef PREDICT_8x8_LOAD_LEFT +#undef PREDICT_8x8_LOAD_TOP +#undef PREDICT_8x8_LOAD_TOPLEFT +#undef PREDICT_8x8_LOAD_TOPRIGHT +#undef PREDICT_8x8_DC +#undef PTR +#undef PT +#undef PL +#undef SRC + +static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ + int i; + pix -= stride; + for(i=0; i<4; i++){ + uint8_t v = pix[0]; + pix[1*stride]= v += block[0]; + pix[2*stride]= v += block[4]; + pix[3*stride]= v += block[8]; + pix[4*stride]= v + block[12]; + pix++; + block++; + } +} + +static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ + int i; + for(i=0; i<4; i++){ + uint8_t v = pix[-1]; + pix[0]= v += block[0]; + pix[1]= v += block[1]; + pix[2]= v += block[2]; + pix[3]= v + block[3]; + pix+= stride; + block+= 4; + } +} + +static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ + int i; + pix -= stride; + for(i=0; i<8; i++){ + uint8_t v = pix[0]; + pix[1*stride]= v += block[0]; + pix[2*stride]= v += block[8]; + pix[3*stride]= v += block[16]; + pix[4*stride]= v += block[24]; + pix[5*stride]= v += block[32]; + 
pix[6*stride]= v += block[40]; + pix[7*stride]= v += block[48]; + pix[8*stride]= v + block[56]; + pix++; + block++; + } +} + +static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ + int i; + for(i=0; i<8; i++){ + uint8_t v = pix[-1]; + pix[0]= v += block[0]; + pix[1]= v += block[1]; + pix[2]= v += block[2]; + pix[3]= v += block[3]; + pix[4]= v += block[4]; + pix[5]= v += block[5]; + pix[6]= v += block[6]; + pix[7]= v + block[7]; + pix+= stride; + block+= 8; + } +} + +static void pred16x16_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ + int i; + for(i=0; i<16; i++) + pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); +} + +static void pred16x16_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ + int i; + for(i=0; i<16; i++) + pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); +} + +static void pred8x8_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ + int i; + for(i=0; i<4; i++) + pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); +} + +static void pred8x8_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ + int i; + for(i=0; i<4; i++) + pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); +} + + +/** + * Sets the intra prediction function pointers. 
+ */
+void ff_h264_pred_init(H264PredContext *h){
+    /* Install the C implementations. Each table is indexed by the
+     * prediction-mode constants from h264_pred.h; every mode is assigned
+     * exactly once. */
+
+    h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
+    h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
+    h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
+    h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
+    h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
+    h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
+    h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
+    h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
+    h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
+    h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
+    h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
+    h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
+
+    h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
+    h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
+    h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
+    h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
+    h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
+    h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
+    h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
+    h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
+    h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
+    h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
+    h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
+    h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
+
+    h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
+    h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
+    h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
+
+    h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
+    h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
+    h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
+    h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= pred8x8_mad_cow_dc_l0t;
+    h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= pred8x8_mad_cow_dc_0lt;
+    h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= pred8x8_mad_cow_dc_l00;
+    h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= pred8x8_mad_cow_dc_0l0;
+
+    h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
+
+    h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c;
+    h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
+    h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
+    h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
+
+    h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
+    h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
+    h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
+
+    //special lossless h/v prediction for h264
+    h->pred4x4_add  [VERT_PRED   ]= pred4x4_vertical_add_c;
+    h->pred4x4_add  [ HOR_PRED   ]= pred4x4_horizontal_add_c;
+    h->pred8x8l_add [VERT_PRED   ]= pred8x8l_vertical_add_c;
+    h->pred8x8l_add [ HOR_PRED   ]= pred8x8l_horizontal_add_c;
+    h->pred8x8_add  [VERT_PRED8x8]= pred8x8_vertical_add_c;
+    h->pred8x8_add  [ HOR_PRED8x8]= pred8x8_horizontal_add_c;
+    h->pred16x16_add[VERT_PRED8x8]= pred16x16_vertical_add_c;
+    h->pred16x16_add[ HOR_PRED8x8]= pred16x16_horizontal_add_c;
+
+    /* Runs last, after all C entries are in place.
+     * NOTE(review): presumably replaces some entries with NEON versions;
+     * ff_h264_pred_init_arm() is defined elsewhere - confirm. */
+    if (HAVE_NEON) ff_h264_pred_init_arm(h);
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pred.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_pred.h Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,90 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 prediction functions.
+ * @author Michael Niedermayer
+ */
+
+#ifndef AVCODEC_H264PRED_H
+#define AVCODEC_H264PRED_H
+
+#include "libavutil/common.h"
+#include "dsputil.h"
+
+/**
+ * Prediction types
+ *
+ * The values are indices into the function tables of H264PredContext:
+ * names without a size suffix index pred4x4[] and pred8x8l[], names ending
+ * in 8x8 index pred8x8[] and pred16x16[] (see ff_h264_pred_init()).
+ */
+//@{
+#define VERT_PRED 0
+#define HOR_PRED 1
+#define DC_PRED 2
+#define DIAG_DOWN_LEFT_PRED 3
+#define DIAG_DOWN_RIGHT_PRED 4
+#define VERT_RIGHT_PRED 5
+#define HOR_DOWN_PRED 6
+#define VERT_LEFT_PRED 7
+#define HOR_UP_PRED 8
+
+/* DC variants */
+#define LEFT_DC_PRED 9
+#define TOP_DC_PRED 10
+#define DC_128_PRED 11
+
+/* Extra pred4x4[] slots; ff_h264_pred_init() does not assign them. */
+#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN 12
+#define HOR_UP_PRED_RV40_NODOWN 13
+#define VERT_LEFT_PRED_RV40_NODOWN 14
+
+/* Note: DC/HOR/VERT are numbered differently from the unsuffixed set above. */
+#define DC_PRED8x8 0
+#define HOR_PRED8x8 1
+#define VERT_PRED8x8 2
+#define PLANE_PRED8x8 3
+
+#define LEFT_DC_PRED8x8 4
+#define TOP_DC_PRED8x8 5
+#define DC_128_PRED8x8 6
+
+/* Only valid for pred8x8[]; pred16x16[] has just 4+3 entries. */
+#define ALZHEIMER_DC_L0T_PRED8x8 7
+#define ALZHEIMER_DC_0LT_PRED8x8 8
+#define ALZHEIMER_DC_L00_PRED8x8 9
+#define ALZHEIMER_DC_0L0_PRED8x8 10
+//@}
+
+/**
+ * Context for storing H.264 prediction functions
+ *
+ * Filled by ff_h264_pred_init(). The array sizes follow the groups of
+ * prediction type constants above (e.g. 9+3+3 for pred4x4).
+ */
+typedef struct H264PredContext{
+    void (*pred4x4 [9+3+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
+    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
+    void (*pred8x8 [4+3+4])(uint8_t *src, int stride);
+    void (*pred16x16[4+3])(uint8_t *src, int stride);
+
+    /* "special lossless h/v prediction" (see ff_h264_pred_init()); only the
+     * HOR and VERT slots are assigned there. */
+    void (*pred4x4_add [2])(uint8_t *pix/*align 4*/, const DCTELEM *block/*align 16*/, int stride);
+    void (*pred8x8l_add [2])(uint8_t *pix/*align 8*/, const DCTELEM *block/*align 16*/, int stride);
+    void (*pred8x8_add [3])(uint8_t *pix/*align 8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
+    void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
+}H264PredContext;
+
+/* Fills every table of *h with the C implementations. */
+void ff_h264_pred_init(H264PredContext *h);
+/* Called by ff_h264_pred_init() when HAVE_NEON is set; defined elsewhere. */
+void ff_h264_pred_init_arm(H264PredContext *h);
+
+
+#endif /* AVCODEC_H264PRED_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pred_mode.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/h264_pred_mode.c Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1013 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding
+ * Copyright (c) 2003 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * H.264 / AVC / MPEG4 part10 direct mb/block decoding.
+ * @author Michael Niedermayer
+ */
+
+#include "dsputil.h"
+#include "avcodec.h"
+#include "h264_data.h"
+#include "h264.h"
+#include "rectangle.h"
+
+//#undef NDEBUG
+#include <assert.h>
+
+static const uint8_t left_block_options[4][16]={
+    {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
+    {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
+    {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
+    {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
+};
+
+
+// static void check_cache_copy(MBRecContext *mrc, H264Slice *s, H264Mb *m){
+//     for (int list=0; list<2; list++){
+//         for (int i=0; i<40; i++){
+//             assert (m->ref_cache[list][i] == m->ref_cache_copy[list][i]);
+//             assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy[list][i][0]);
+//             assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy[list][i][1]);
+//         }
+//     }
+// }
+
+// static void check_cache_copy2(MBRecContext *mrc, H264Slice *s, H264Mb *m){
+//     for (int list=0; list<2; list++){
+//         for (int i=0; i<40; i++){
+//             assert (m->ref_cache[list][i] == m->ref_cache_copy2[list][i]);
+//             assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy2[list][i][0]);
+//             assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy2[list][i][1]);
+//         }
+//     }
+// }
+
+/**
+ * Loads the per-macroblock caches in *mrs from the neighbouring macroblocks.
+ *
+ * - Always: records the top and left neighbour types in mrs.
+ * - Unless the macroblock is skipped: copies m->non_zero_count into
+ *   non_zero_count_cache; for intra macroblocks also computes the
+ *   *_samples_available masks and, for intra4x4, the top/left border of
+ *   intra4x4_pred_mode_cache.
+ * - For inter macroblocks (or direct ones with spatial prediction): loads the
+ *   neighbouring motion vectors and reference indices of every used list into
+ *   mv_cache / ref_cache.
+ *
+ * NOTE(review): entries mb_x-1 and mb_x+1 of the type arrays are read
+ * unconditionally; presumably the arrays are padded on both sides - confirm.
+ */
+static void fill_decode_caches_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
+    int topleft_type, top_type, topright_type, left_type;
+    const uint8_t * left_block= left_block_options[0];
+    const int mb_x = m->mb_x;
+    int i;
+
+    mrs->top_type     = mrs->mb_type_top[mb_x  ];
+    mrs->left_type    = mrs->mb_type    [mb_x-1];
+
+    topleft_type = mrs->mb_type_top[mb_x-1];
+    top_type     = mrs->mb_type_top[mb_x  ];
+    topright_type= mrs->mb_type_top[mb_x+1];
+    left_type    = mrs->mb_type    [mb_x-1];
+
+    /* With constrained intra prediction only bit 0 of a neighbour type is
+     * tested below; otherwise any non-zero type passes the test. */
+    int type_mask= s->pps.constrained_intra_pred ? 1 : -1;
+
+    if(!IS_SKIP(mb_type)){
+//      memset(mrc->non_zero_count_cache, 0, sizeof(mrc->non_zero_count_cache));
+        AV_COPY32(&mrs->non_zero_count_cache[4+8*1], &m->non_zero_count[ 0]);
+        AV_COPY32(&mrs->non_zero_count_cache[4+8*2], &m->non_zero_count[ 4]);
+        AV_COPY32(&mrs->non_zero_count_cache[4+8*3], &m->non_zero_count[ 8]);
+        AV_COPY32(&mrs->non_zero_count_cache[4+8*4], &m->non_zero_count[12]);
+
+        for(i=0; i<2; i++){
+            mrs->non_zero_count_cache[8*1 + 8*i + 1] = m->non_zero_count[16 + i*2   ];
+            mrs->non_zero_count_cache[8*1 + 8*i + 2] = m->non_zero_count[16 + i*2 +1];
+            mrs->non_zero_count_cache[8*4 + 8*i + 1] = m->non_zero_count[20 + i*2   ];
+            mrs->non_zero_count_cache[8*4 + 8*i + 2] = m->non_zero_count[20 + i*2 +1];
+        }
+
+        if(IS_INTRA(mb_type)){
+//          memset(mrc->intra4x4_pred_mode_cache, 0, sizeof(mrc->intra4x4_pred_mode_cache));
+
+            /* start from "everything available", then clear bits per neighbour */
+            mrs->topleft_samples_available=
+            mrs->top_samples_available=
+            mrs->left_samples_available= 0xFFFF;
+            mrs->topright_samples_available= 0xEEEA;
+
+            if(!(top_type & type_mask)){
+                mrs->topleft_samples_available= 0xB3FF;
+                mrs->top_samples_available= 0x33FF;
+                mrs->topright_samples_available= 0x26EA;
+            }
+
+            if(!(left_type & type_mask)){
+                mrs->topleft_samples_available&= 0xDF5F;
+                mrs->left_samples_available&= 0x5F5F;
+            }
+
+            if(!(topleft_type & type_mask))
+                mrs->topleft_samples_available&= 0x7FFF;
+
+            if(!(topright_type & type_mask))
+                mrs->topright_samples_available&= 0xFBFF;
+
+            if(IS_INTRA4x4(mb_type)){
+                /* A neighbour that is not intra4x4 contributes 2 when it
+                 * passes the type_mask test and -1 otherwise. */
+                if(IS_INTRA4x4(top_type)){
+                    AV_COPY32(mrs->intra4x4_pred_mode_cache+4+8*0, &mrs->intra4x4_pred_mode_top[4*mb_x]);
+                }else{
+                    mrs->intra4x4_pred_mode_cache[4+8*0]=
+                    mrs->intra4x4_pred_mode_cache[5+8*0]=
+                    mrs->intra4x4_pred_mode_cache[6+8*0]=
+                    mrs->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask);
+                }
+
+                if(IS_INTRA4x4(left_type)){
+#if OMPSS
+                    mrs->intra4x4_pred_mode_cache[3+8*1]= m->intra4x4_pred_mode_left[0];
+                    mrs->intra4x4_pred_mode_cache[3+8*2]= m->intra4x4_pred_mode_left[1];
+                    mrs->intra4x4_pred_mode_cache[3+8*3]= m->intra4x4_pred_mode_left[2];
+                    mrs->intra4x4_pred_mode_cache[3+8*4]= m->intra4x4_pred_mode_left[3];
+#else
+                    mrs->intra4x4_pred_mode_cache[3+8*1]= mrs->intra4x4_pred_mode_left[0];
+                    mrs->intra4x4_pred_mode_cache[3+8*2]= mrs->intra4x4_pred_mode_left[1];
+                    mrs->intra4x4_pred_mode_cache[3+8*3]= mrs->intra4x4_pred_mode_left[2];
+                    mrs->intra4x4_pred_mode_cache[3+8*4]= mrs->intra4x4_pred_mode_left[3];
+#endif
+                }else{
+                    mrs->intra4x4_pred_mode_cache[3+8*1]=
+                    mrs->intra4x4_pred_mode_cache[3+8*2]=
+                    mrs->intra4x4_pred_mode_cache[3+8*3]=
+                    mrs->intra4x4_pred_mode_cache[3+8*4]= 2 - 3*!(left_type & type_mask);
+                }
+            }
+        }
+    }
+
+    if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){
+        int list;
+
+//      memset(mrs->mv_cache, 0, sizeof(mrs->mv_cache));
+//      memset(mrs->ref_cache, 0, sizeof(mrs->ref_cache));
+
+        mrs->ref_cache[0][scan8[5 ]+1] = mrs->ref_cache[0][scan8[7 ]+1] = mrs->ref_cache[0][scan8[13]+1] =
+        mrs->ref_cache[1][scan8[5 ]+1] = mrs->ref_cache[1][scan8[7 ]+1] = mrs->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
+
+        for(list=0; list<s->list_count; list++){
+            if(!USES_LIST(mb_type, list)){
+                continue;
+            }
+            assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred));
+
+            /* row above the macroblock */
+            if(USES_LIST(top_type, list)){
+                const int b_xy= 4*mb_x + 3*mrc->b_stride;
+                AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]);
+                mrs->ref_cache[list][scan8[0] + 0 - 1*8]=
+                mrs->ref_cache[list][scan8[0] + 1 - 1*8]= mrs->ref_index_top[list][4*mb_x + 2];
+                mrs->ref_cache[list][scan8[0] + 2 - 1*8]=
+                mrs->ref_cache[list][scan8[0] + 3 - 1*8]= mrs->ref_index_top[list][4*mb_x + 3];
+            }else{
+                AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]);
+                AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101);
+            }
+
+            /* column left of the macroblock: four entries for 16x8 / 8x8
+             * partitions, otherwise only the first one */
+            if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){
+                for(i=0; i<2; i++){
+                    int cache_idx = scan8[0] - 1 + i*2*8;
+                    if(USES_LIST(left_type, list)){
+                        const int b_xy= 4*(mb_x-1) + 3;
+                        const int b8_x= 4*(mb_x-1) + 1;
+                        AV_COPY32(mrs->mv_cache[list][cache_idx  ], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[0+i*2]]);
+                        AV_COPY32(mrs->mv_cache[list][cache_idx+8], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[1+i*2]]);
+                        mrs->ref_cache[list][cache_idx  ]= mrs->ref_index[list][b8_x + (left_block[0+i*2]&~1)];
+                        mrs->ref_cache[list][cache_idx+8]= mrs->ref_index[list][b8_x + (left_block[1+i*2]&~1)];
+                    }else{
+                        AV_ZERO32(mrs->mv_cache [list][cache_idx  ]);
+                        AV_ZERO32(mrs->mv_cache [list][cache_idx+8]);
+                        mrs->ref_cache[list][cache_idx  ]=
+                        mrs->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE);
+                    }
+                }
+            }else{
+                if(USES_LIST(left_type, list)){
+                    const int b_x = 4*(mb_x-1) + 3;
+                    const int b8_x= 4*(mb_x-1) + 1;
+                    AV_COPY32(mrs->mv_cache[list][scan8[0] - 1], mrs->motion_val[list][b_x + mrc->b_stride*left_block[0]]);
+                    mrs->ref_cache[list][scan8[0] - 1]= mrs->ref_index[list][b8_x + (left_block[0]&~1)];
+                }else{
+                    AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1]);
+                    mrs->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+                }
+            }
+
+            /* top-right neighbour */
+            if(USES_LIST(topright_type, list)){
+                const int b_xy= 4*(mb_x+1) + 3*mrc->b_stride;
+                AV_COPY32(mrs->mv_cache[list][scan8[0] + 4 - 1*8], mrs->motion_val_top[list][b_xy]);
+                mrs->ref_cache[list][scan8[0] + 4 - 1*8]= mrs->ref_index_top[list][4*(mb_x+1) + 2];
+            }else{
+                AV_ZERO32(mrs->mv_cache [list][scan8[0] + 4 - 1*8]);
+                mrs->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+            }
+            /* the top-left neighbour is only loaded when the top-right
+             * reference came out negative */
+            if(mrs->ref_cache[list][scan8[0] + 4 - 1*8] < 0){
+                int topleft_partition= -1;
+                if(USES_LIST(topleft_type, list)){
+                    const int b_xy = 4*(mb_x-1) + 3 + mrc->b_stride + (topleft_partition & 2*mrc->b_stride);
+                    const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2);
+                    AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 - 1*8], mrs->motion_val_top[list][b_xy]);
+                    mrs->ref_cache[list][scan8[0] - 1 - 1*8]= mrs->ref_index_top[list][b8_x];
+                }else{
+                    AV_ZERO32(mrs->mv_cache[list][scan8[0] - 1 - 1*8]);
+                    mrs->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
+                }
+            }
+
+            if(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))
+                continue;
+
+            mrs->ref_cache[list][scan8[4 ]] =
+            mrs->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
+            AV_ZERO32(mrs->mv_cache [list][scan8[4 ]]);
+            AV_ZERO32(mrs->mv_cache [list][scan8[12]]);
+        }
+    }
+}
+
+/**
+ * Stores the motion data of the current macroblock back from the caches:
+ * four rows of mv_cache go to motion_val and four ref_cache entries go to
+ * ref_index, for every list the macroblock uses. When list 0 is unused its
+ * ref_index entries are set to LIST_NOT_USED via fill_rectangle().
+ */
+static inline void write_back_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
+    const int b_stride = mrc->b_stride;
+    const int b_x = 4*m->mb_x; //try mb2b(8)_xy
+    const int b8_x= 4*m->mb_x;
+    int list;
+
+    if(!USES_LIST(mb_type, 0))
+        fill_rectangle(&mrs->ref_index[0][b8_x], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1);
+
+    for(list=0; list<s->list_count; list++){
+        int y;
+        int16_t (*mv_dst)[2];
+        int16_t (*mv_src)[2];
+
+        if(!USES_LIST(mb_type, list))
+            continue;
+
+        /* cache rows are 8 entries apart, destination rows b_stride apart */
+        mv_dst = &mrs->motion_val[list][b_x];
+        mv_src = &mrs->mv_cache[list][scan8[0]];
+        for(y=0; y<4; y++){
+            AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
+        }
+
+        {
+            int8_t *ref_index = &mrs->ref_index[list][b8_x];
+            ref_index[0+0*2]= mrs->ref_cache[list][scan8[0]];
+            ref_index[1+0*2]= mrs->ref_cache[list][scan8[4]];
+            ref_index[0+1*2]= mrs->ref_cache[list][scan8[8]];
+            ref_index[1+1*2]= mrs->ref_cache[list][scan8[12]];
+        }
+    }
+}
+
+
+/**
+* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the
available blocks.
+*/
+static int check_intra4x4_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){
+    /*
+     * Remap tables indexed by the intra4x4 mode found in the cache:
+     *   <0  the mode cannot be used without that neighbour (error),
+     *    0  the mode is kept (unlisted trailing entries are 0 as well),
+     *   >0  the mode is replaced by this value.
+     * Returns 0 on success and -1 after logging an error.
+     * mrc and s are not used.
+     */
+    static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
+    static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
+    int i;
+
+    /* top row of 4x4 blocks */
+    if(!(mrs->top_samples_available&0x8000)){
+        for(i=0; i<4; i++){
+            const int idx= scan8[0] + i;
+            const int mode= mrs->intra4x4_pred_mode_cache[idx];
+            const int status= top[ mode ];
+            if(status<0){
+                /* report the mode that was asked for, not the table sentinel */
+                av_log(AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", mode, m->mb_x, m->mb_y);
+                return -1;
+            } else if(status){
+                mrs->intra4x4_pred_mode_cache[idx]= status;
+            }
+        }
+    }
+
+    /* left column of 4x4 blocks; each row has its own availability bit */
+    if((mrs->left_samples_available&0x8888)!=0x8888){
+        static const int mask[4]={0x8000,0x2000,0x80,0x20};
+        for(i=0; i<4; i++){
+            if(!(mrs->left_samples_available&mask[i])){
+                const int idx= scan8[0] + 8*i;
+                const int mode= mrs->intra4x4_pred_mode_cache[idx];
+                const int status= left[ mode ];
+                if(status<0){
+                    av_log(AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", mode, m->mb_x, m->mb_y);
+                    return -1;
+                } else if(status){
+                    mrs->intra4x4_pred_mode_cache[idx]= status;
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/**
+* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
+*/ +static int check_intra_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mode){ + static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1}; + static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8}; + + if(mode > 6) { + av_log(AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y); + return -1; + } + + if(!(mrs->top_samples_available&0x8000)){ + mode= top[ mode ]; + if(mode<0){ + av_log(AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y); + return -1; + } + } + + if((mrs->left_samples_available&0x8080) != 0x8080){ + mode= left[ mode ]; + if(mrs->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred + mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(mrs->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8); + } + if(mode<0){ + av_log(AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y); + return -1; + } + } + return mode; +} + +/** + * gets the predicted intra4x4 prediction mode. 
+ */ +static inline int pred_intra_mode(MBRecContext *mrc, MBRecState *mrs, int n){ + const int index8= scan8[n]; + const int left= mrs->intra4x4_pred_mode_cache[index8 - 1]; + const int top = mrs->intra4x4_pred_mode_cache[index8 - 8]; + const int min= FFMIN(left, top); + + if(min<0) return DC_PRED; + else return min; +} + +static void write_back_intra_pred_mode_rec(MBRecContext *mrc, MBRecState *mrs, H264Mb *m, int mb_x){ + int8_t *mode= &mrs->intra4x4_pred_mode[4*mb_x]; + + AV_COPY32(mode, mrs->intra4x4_pred_mode_cache + 4 + 8*4); +#if OMPSS + if (m->mb_x < mrc->mb_width-1){ + H264Mb *mr= m+1; + mode = mr->intra4x4_pred_mode_left; + mode[0]= mrs->intra4x4_pred_mode_cache[7+8*1]; + mode[1]= mrs->intra4x4_pred_mode_cache[7+8*2]; + mode[2]= mrs->intra4x4_pred_mode_cache[7+8*3]; + mode[3]= mrs->intra4x4_pred_mode_cache[7+8*4]; + } +#else + mode = mrs->intra4x4_pred_mode_left; + mode[0]= mrs->intra4x4_pred_mode_cache[7+8*1]; + mode[1]= mrs->intra4x4_pred_mode_cache[7+8*2]; + mode[2]= mrs->intra4x4_pred_mode_cache[7+8*3]; + mode[3]= mrs->intra4x4_pred_mode_cache[7+8*4]; +#endif +} + +static void pred_spatial_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){ + int b4_stride = mrc->b_stride; + const int mb_x = m->mb_x; + int mb_type_col[2]; + const int16_t (*l1mv0)[2], (*l1mv1)[2]; + const int8_t *l1ref0, *l1ref1; + const int is_b8x8 = IS_8X8(*mb_type); + unsigned int sub_mb_type= MB_TYPE_L0L1; + int i8, i4; + int ref[2]; + int mv[2]; + int list; + + //assert(h->ref_list[1][0].reference&3); + +#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM) + + /* ref = min(neighbors) */ + for(list=0; list<2; list++){ + int left_ref = mrs->ref_cache[list][scan8[0] - 1]; + int top_ref = mrs->ref_cache[list][scan8[0] - 8]; + int refc = mrs->ref_cache[list][scan8[0] - 8 + 4]; + const int16_t *C= mrs->mv_cache[list][ scan8[0] - 8 + 4]; + if(refc == PART_NOT_AVAILABLE){ + refc = 
mrs->ref_cache[list][scan8[0] - 8 - 1]; + C = mrs->mv_cache[list][scan8[0] - 8 - 1]; + } + ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); + if(ref[list] >= 0){ + //this is just pred_motion() but with the cases removed that cannot happen for direct blocks + const int16_t * const A= mrs->mv_cache[list][ scan8[0] - 1 ]; + const int16_t * const B= mrs->mv_cache[list][ scan8[0] - 8 ]; + + int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]); + if(match_count > 1){ //most common + mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]), + mid_pred(A[1], B[1], C[1]) ); + }else { + assert(match_count==1); + if(left_ref==ref[list]){ + mv[list]= AV_RN32A(A); + }else if(top_ref==ref[list]){ + mv[list]= AV_RN32A(B); + }else{ + mv[list]= AV_RN32A(C); + } + } + }else{ + int mask= ~(MB_TYPE_L0 << (2*list)); + mv[list] = 0; + ref[list] = -1; + if(!is_b8x8) + *mb_type &= mask; + sub_mb_type &= mask; + } + } + + if(ref[0] < 0 && ref[1] < 0){ + ref[0] = ref[1] = 0; + if(!is_b8x8) + *mb_type |= MB_TYPE_L0L1; + sub_mb_type |= MB_TYPE_L0L1; + } + + if(!(is_b8x8|mv[0]|mv[1])){ + fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); + fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); + fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4); + fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4); + *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; + return; + } + + mb_type_col[0] = + mb_type_col[1] = mrs->list1_mb_type[mb_x]; + + sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ + if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ + *mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */ + }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ + *mb_type |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); + }else{ + if(!s->direct_8x8_inference_flag){ + /* FIXME 
save sub mb types from previous frames (or derive from MVs) + * so we know exactly what block size to use */ + sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */ + } + *mb_type |= MB_TYPE_8x8; + } + + l1mv0 = (void *) &mrs->list1_motion_val[0][4*mb_x]; + l1mv1 = (void *) &mrs->list1_motion_val[1][4*mb_x]; + l1ref0 = &mrs->list1_ref_index [0][4*mb_x]; + l1ref1 = &mrs->list1_ref_index [1][4*mb_x]; +// if(!b8_stride){ +// if(m->mb_y&1){ +// l1ref0 += 2; +// l1ref1 += 2; +// l1mv0 += 2*b4_stride; +// l1mv1 += 2*b4_stride; +// } +// } + + if(IS_16X16(*mb_type)){ + int a,b; + + fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); + fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); + if(!IS_INTRA(mb_type_col[0]) && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1) + || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1 + ))){ + a=b=0; + if(ref[0] > 0) + a= mv[0]; + if(ref[1] > 0) + b= mv[1]; + }else{ + a= mv[0]; + b= mv[1]; + } + fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, a, 4); + fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, b, 4); + }else{ + int n=0; + for(i8=0; i8<4; i8++){ + const int x8 = i8&1; + const int y8 = i8>>1; + + if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) + continue; + m->sub_mb_type[i8] = sub_mb_type; + + fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4); + fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4); + fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1); + fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1); + + /* col_zero_flag */ + if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 )) + ){ + const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? 
l1mv0 : l1mv1; + if(IS_SUB_8X8(sub_mb_type)){ + const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; + if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ + if(ref[0] == 0) + fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); + if(ref[1] == 0) + fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); + n+=4; + } + }else{ + int k=0; + for(i4=0; i4<4; i4++){ + const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; + if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ + if(ref[0] == 0) + AV_ZERO32(mrs->mv_cache[0][scan8[i8*4+i4]]); + if(ref[1] == 0) + AV_ZERO32(mrs->mv_cache[1][scan8[i8*4+i4]]); + k++; + } + } + if(!(k&3)) + m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8; + n+=k; + } + } + } + if(!is_b8x8 && !(n&15)){ + *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; + } + } +} + +static void pred_temp_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){ + const int mb_x = m->mb_x; + int b4_stride = mrc->b_stride; + int mb_type_col[2]; + const int16_t (*l1mv0)[2], (*l1mv1)[2]; + const int8_t *l1ref0, *l1ref1; + const int is_b8x8 = IS_8X8(*mb_type); + unsigned int sub_mb_type; + int i8, i4; + const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]}; + const int *dist_scale_factor = s->dist_scale_factor; + + mb_type_col[0] = + mb_type_col[1] = mrs->list1_mb_type[mb_x]; + + sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ + if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ + *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */ + }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ + *mb_type |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); + }else{ + if(!s->direct_8x8_inference_flag){ + /* FIXME save sub mb types from previous frames (or derive from MVs) + * so we know 
exactly what block size to use */ + sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */ + } + *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1; + } + + l1mv0 = (void *) &mrs->list1_motion_val[0][4*mb_x]; + l1mv1 = (void *) &mrs->list1_motion_val[1][4*mb_x]; + l1ref0 = &mrs->list1_ref_index [0][4*mb_x]; + l1ref1 = &mrs->list1_ref_index [1][4*mb_x]; + + /* one-to-one mv scaling */ + if(IS_16X16(*mb_type)){ + int ref, mv0, mv1; + + fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1); + if(IS_INTRA(mb_type_col[0])){ + ref=mv0=mv1=0; + }else{ + const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]] + : map_col_to_list0[1][l1ref1[0]]; + const int scale = dist_scale_factor[ref0]; + const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0]; + int mv_l0[2]; + mv_l0[0] = (scale * mv_col[0] + 128) >> 8; + mv_l0[1] = (scale * mv_col[1] + 128) >> 8; + ref= ref0; + mv0= pack16to32(mv_l0[0],mv_l0[1]); + mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]); + } + fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1); + fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4); + fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4); + }else{ + for(i8=0; i8<4; i8++){ + const int x8 = i8&1; + const int y8 = i8>>1; + int ref0, scale; + const int16_t (*l1mv)[2]= l1mv0; + + if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) + continue; + m->sub_mb_type[i8] = sub_mb_type; + fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1); + if(IS_INTRA(mb_type_col[0])){ + fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1); + fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); + fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); + continue; + } + + ref0 = l1ref0[i8]; + if(ref0 >= 0) + ref0 = map_col_to_list0[0][ref0 ]; + else{ + ref0 = map_col_to_list0[1][l1ref1[i8]]; + l1mv= l1mv1; + } + scale = dist_scale_factor[ref0]; + + fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1); + 
if(IS_SUB_8X8(sub_mb_type)){ + const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; + int mx = (scale * mv_col[0] + 128) >> 8; + int my = (scale * mv_col[1] + 128) >> 8; + fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4); + fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4); + }else + for(i4=0; i4<4; i4++){ + const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; + int16_t *mv_l0 = mrs->mv_cache[0][scan8[i8*4+i4]]; + mv_l0[0] = (scale * mv_col[0] + 128) >> 8; + mv_l0[1] = (scale * mv_col[1] + 128) >> 8; + AV_WN32A(mrs->mv_cache[1][scan8[i8*4+i4]], + pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1])); + } + } + } +} + +void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){ + if(s->direct_spatial_mv_pred){ + pred_spatial_direct_motion_rec(mrc, mrs, s, m, mb_type); + }else{ + pred_temp_direct_motion_rec(mrc, mrs, s, m, mb_type); + } +} + +static inline int fetch_diagonal_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, const int16_t **C, int i, int list, int part_width){ + const int topright_ref= mrs->ref_cache[list][ i - 8 + part_width ]; + + if(topright_ref != PART_NOT_AVAILABLE){ + *C= mrs->mv_cache[list][ i - 8 + part_width ]; + return topright_ref; + }else{ + *C= mrs->mv_cache[list][ i - 8 - 1 ]; + return mrs->ref_cache[list][ i - 8 - 1 ]; + } +} + +/** + * gets the predicted MV. 
+ * @param n the block index + * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4) + * @param mx the x component of the predicted motion vector + * @param my the y component of the predicted motion vector + */ +static inline void pred_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int part_width, int list, int ref, int * const mx, int * const my){ + const int index8= scan8[n]; + const int top_ref= mrs->ref_cache[list][ index8 - 8 ]; + const int left_ref= mrs->ref_cache[list][ index8 - 1 ]; + const int16_t * const A= mrs->mv_cache[list][ index8 - 1 ]; + const int16_t * const B= mrs->mv_cache[list][ index8 - 8 ]; + const int16_t * C; + int diagonal_ref, match_count; + + assert(part_width==1 || part_width==2 || part_width==4); + +/* mv_cache + B . . A T T T T + U . . L . . , . + U . . L . . . . + U . . L . . , . + . . . L . . . . +*/ + + diagonal_ref= fetch_diagonal_mv(mrc, mrs, s, &C, index8, list, part_width); + match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref); + + if(match_count > 1){ //most common + *mx= mid_pred(A[0], B[0], C[0]); + *my= mid_pred(A[1], B[1], C[1]); + }else if(match_count==1){ + if(left_ref==ref){ + *mx= A[0]; + *my= A[1]; + }else if(top_ref==ref){ + *mx= B[0]; + *my= B[1]; + }else{ + *mx= C[0]; + *my= C[1]; + } + }else{ + if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){ + *mx= A[0]; + *my= A[1]; + }else{ + *mx= mid_pred(A[0], B[0], C[0]); + *my= mid_pred(A[1], B[1], C[1]); + } + } + +} + +/** + * gets the directionally predicted 16x8 MV. 
+ * @param n the block index + * @param mx the x component of the predicted motion vector + * @param my the y component of the predicted motion vector + */ +static inline void pred_16x8_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){ + if(n==0){ + const int top_ref= mrs->ref_cache[list][ scan8[0] - 8 ]; + const int16_t * const B= mrs->mv_cache[list][ scan8[0] - 8 ]; + + if(top_ref == ref){ + *mx= B[0]; + *my= B[1]; + return; + } + }else{ + const int left_ref= mrs->ref_cache[list][ scan8[8] - 1 ]; + const int16_t * const A= mrs->mv_cache[list][ scan8[8] - 1 ]; + + if(left_ref == ref){ + *mx= A[0]; + *my= A[1]; + return; + } + } + + //RARE + pred_motion(mrc, mrs, s, n, 4, list, ref, mx, my); +} + +/** + * gets the directionally predicted 8x16 MV. + * @param n the block index + * @param mx the x component of the predicted motion vector + * @param my the y component of the predicted motion vector + */ +static inline void pred_8x16_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){ + if(n==0){ + const int left_ref= mrs->ref_cache[list][ scan8[0] - 1 ]; + const int16_t * const A= mrs->mv_cache[list][ scan8[0] - 1 ]; + + if(left_ref == ref){ + *mx= A[0]; + *my= A[1]; + return; + } + }else{ + const int16_t * C; + int diagonal_ref; + + diagonal_ref= fetch_diagonal_mv(mrc, mrs, s, &C, scan8[4], list, 2); + if(diagonal_ref == ref){ + *mx= C[0]; + *my= C[1]; + return; + } + } + + //RARE + pred_motion(mrc, mrs, s, n, 2, list, ref, mx, my); +} + +static inline void pred_pskip_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb * m, int * const mx, int * const my){ + const int top_ref = mrs->ref_cache[0][ scan8[0] - 8 ]; + const int left_ref= mrs->ref_cache[0][ scan8[0] - 1 ]; + + if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE + || !( top_ref | AV_RN32A(mrs->mv_cache[0][ scan8[0] - 8 ])) + || !(left_ref | 
AV_RN32A(mrs->mv_cache[0][ scan8[0] - 1 ]))){ + + *mx = *my = 0; + return; + } + + pred_motion(mrc, mrs, s, 0, 4, 0, 0, mx, my); + + return; +} + +#define ADD_MVD(list) \ +{ \ + mx += m->mvd[list][mp][0]; \ + my += m->mvd[list][mp][1]; \ + mp++; \ +} + +int pred_motion_mb_rec (MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){ + int mp=0; + int mb_type = m->mb_type; + const int mb_x = m->mb_x; + +// mrc->m =m; + + fill_decode_caches_rec(mrc, mrs, s, m, mb_type); + if (IS_SKIP(mb_type)){ + mb_type=0; + + if( s->slice_type_nos == FF_B_TYPE ) + { + mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP; + ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type); + } + else + { + int mx, my; + + mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; //FIXME check required + pred_pskip_motion(mrc, mrs, s, m, &mx, &my); + fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); + fill_rectangle(mrs->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4); + } + + write_back_motion_rec(mrc, mrs, s, m, mb_type); + m->mb_type = mrs->mb_type[mb_x]= mb_type; + return 0; + } + + + if (IS_INTRA_PCM(mb_type)){ + mrs->mb_type[mb_x] = mb_type; + return 0; + } + else if (IS_INTRA(mb_type)){ + int i, pred_mode; + + if( IS_INTRA4x4( mb_type ) ) { + if ( IS_8x8DCT(mb_type) ) { + for( i = 0; i < 16; i+=4 ) { + int pred = pred_intra_mode(mrc, mrs, i ); + int mode = m->intra4x4_pred_mode[i]; + + mode = mode < 0 ? pred : mode + ( mode >= pred ); + fill_rectangle( &mrs->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 ); + } + } else { + for( i = 0; i < 16; i++ ) { + int pred = pred_intra_mode(mrc, mrs, i ); + int mode = m->intra4x4_pred_mode[i]; + mode = mode < 0 ? 
pred : mode + ( mode >= pred ); + mrs->intra4x4_pred_mode_cache[ scan8[i] ] = mode; + } + } + write_back_intra_pred_mode_rec(mrc, mrs, m, mb_x); + if( check_intra4x4_pred_mode(mrc, mrs, s, m) < 0 ) return -1; + } else { + m->intra16x16_pred_mode= check_intra_pred_mode(mrc, mrs, s, m, m->intra16x16_pred_mode ); + if( m->intra16x16_pred_mode < 0 ) return -1; + } + + pred_mode = m->chroma_pred_mode; + pred_mode= check_intra_pred_mode( mrc, mrs, s, m, pred_mode ); + if( pred_mode < 0 ) return -1; + m->chroma_pred_mode= pred_mode; + + } + else if (IS_8X8(mb_type)){ + int i, j, list; + + if( s->slice_type_nos == FF_B_TYPE ) { + if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] | + m->sub_mb_type[2] | m->sub_mb_type[3]) ) { + ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type); + mrs->ref_cache[0][scan8[4]] = + mrs->ref_cache[1][scan8[4]] = + mrs->ref_cache[0][scan8[12]] = + mrs->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE; + } + } + + for(list=0; listlist_count; list++){ + for(i=0; i<4; i++){ + if(IS_DIRECT(m->sub_mb_type[i])){ + mrs->ref_cache[list][ scan8[4*i] ]=mrs->ref_cache[list][ scan8[4*i]+1 ]; + continue; + } else { + mrs->ref_cache[list][ scan8[4*i] ]=mrs->ref_cache[list][ scan8[4*i]+1 ]= + mrs->ref_cache[list][ scan8[4*i]+8 ]=mrs->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i]; + + if(IS_DIR(m->sub_mb_type[i], 0, list) ){ + const int sub_mb_type= m->sub_mb_type[i]; + const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1; + + int sub_partition_count = IS_SUB_8X8(sub_mb_type) ? 1 : (IS_SUB_4X4(sub_mb_type)? 
4 :2); + for(j=0; jmv_cache[list][ scan8[index]]; + pred_motion(mrc, mrs, s, index, block_width, list, mrs->ref_cache[list][ scan8[index] ], &mx, &my); + + ADD_MVD(list) + + if(IS_SUB_8X8(sub_mb_type)){ + mv_cache[ 1 ][0]= + mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx; + mv_cache[ 1 ][1]= + mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my; + }else if(IS_SUB_8X4(sub_mb_type)){ + mv_cache[ 1 ][0]= mx; + mv_cache[ 1 ][1]= my; + }else if(IS_SUB_4X8(sub_mb_type)){ + mv_cache[ 8 ][0]= mx; + mv_cache[ 8 ][1]= my; + } + mv_cache[ 0 ][0]= mx; + mv_cache[ 0 ][1]= my; + } + }else{ + fill_rectangle(mrs->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4); + } + } + } + } + } else if( IS_DIRECT(mb_type) ) { + mb_type &= ~MB_TYPE_16x16; //FIXME not nice + ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type); + } + else { + int list, i; + if(IS_16X16(mb_type)){ + for(list=0; listlist_count; list++){ + if(IS_DIR(mb_type, 0, list)){ + int ref; + int mx,my; + + ref = m->ref_index[list][0]; + fill_rectangle(&mrs->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1); + pred_motion(mrc, mrs, s, 0, 4, list, mrs->ref_cache[list][ scan8[0] ], &mx, &my); + ADD_MVD(list) + fill_rectangle(mrs->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4); + } + } + } + else if(IS_16X8(mb_type)){ + for(list=0; listlist_count; list++){ + for(i=0; i<2; i++){ + if(IS_DIR(mb_type, i, list)){ + int ref; + int mx,my; + ref = m->ref_index[list][i]; + fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1); + + pred_16x8_motion(mrc, mrs, s, 8*i, list, mrs->ref_cache[list][scan8[0] + 16*i], &mx, &my); + ADD_MVD(list) + + fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4); + }else{ + fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1); + fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4); + } + } + } + + }else{ + assert(IS_8X16(mb_type)); + + for(list=0; listlist_count; list++){ + for(i=0; i<2; i++){ + 
if(IS_DIR(mb_type, i, list)){ //FIXME optimize + int ref; + int mx,my; + ref = m->ref_index[list][i]; + fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1); + pred_8x16_motion(mrc, mrs, s, i*4, list, mrs->ref_cache[list][ scan8[0] + 2*i ], &mx, &my); + ADD_MVD(list) + fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4); + }else{ + fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1); + fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4); + } + } + } + } + } + + if (IS_INTER(mb_type)||(IS_DIRECT(mb_type))) + write_back_motion_rec(mrc, mrs, s, m, mb_type); + m->mb_type = mrs->mb_type[mb_x]= mb_type; + + return 0; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pred_mode.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_pred_mode.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,10 @@ +#ifndef H264_DIRECT_H +#define H264_DIRECT_H + +#include "h264_types.h" + +void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int *mb_type); +int pred_motion_mb_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m); + + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_ps.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_ps.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,462 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... parameter set decoding + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 parameter set decoding. + * @author Michael Niedermayer + */ + +#include "dsputil.h" +#include "avcodec.h" +#include "h264_types.h" +#include "h264_data.h" +#include "golomb.h" + + +//#undef NDEBUG +#include + +static const int pixel_aspect[17][2]={ + {0, 1}, + {1, 1}, + {12, 11}, + {10, 11}, + {16, 11}, + {40, 33}, + {24, 11}, + {20, 11}, + {32, 11}, + {80, 33}, + {18, 11}, + {15, 11}, + {64, 33}, + {160,99}, + {4, 3}, + {3, 2}, + {2, 1}, +}; + +const uint8_t ff_h264_chroma_qp[52]={ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, + 12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27, + 28,29,29,30,31,32,32,33,34,34,35,35,36,36,37,37, + 37,38,38,38,39,39,39,39 +}; + +static const uint8_t default_scaling4[2][16]={ +{ 6,13,20,28, + 13,20,28,32, + 20,28,32,37, + 28,32,37,42 +},{ + 10,14,20,24, + 14,20,24,27, + 20,24,27,30, + 24,27,30,34 +}}; + +static const uint8_t default_scaling8[2][64]={ +{ 6,10,13,16,18,23,25,27, + 10,11,16,18,23,25,27,29, + 13,16,18,23,25,27,29,31, + 16,18,23,25,27,29,31,33, + 18,23,25,27,29,31,33,36, + 23,25,27,29,31,33,36,38, + 25,27,29,31,33,36,38,40, + 27,29,31,33,36,38,40,42 +},{ + 9,13,15,17,19,21,22,24, + 13,13,17,19,21,22,24,25, + 15,17,19,21,22,24,25,27, + 17,19,21,22,24,25,27,28, + 19,21,22,24,25,27,28,30, + 21,22,24,25,27,28,30,32, + 22,24,25,27,28,30,32,33, + 24,25,27,28,30,32,33,35 +}}; + +static inline int decode_hrd_parameters(GetBitContext *gb, SPS *sps){ + int cpb_count, i; + cpb_count = get_ue_golomb_31(gb) + 1; + + if(cpb_count > 32){ + av_log(AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count); + return -1; + } + + get_bits(gb, 4); /* bit_rate_scale */ + get_bits(gb, 4); /* cpb_size_scale */ + for(i=0; 
iinitial_cpb_removal_delay_length = get_bits(gb, 5) + 1; + sps->cpb_removal_delay_length = get_bits(gb, 5) + 1; + sps->dpb_output_delay_length = get_bits(gb, 5) + 1; + sps->time_offset_length = get_bits(gb, 5); + sps->cpb_cnt = cpb_count; + return 0; +} + +static inline int decode_vui_parameters(GetBitContext *gb, SPS *sps){ + int aspect_ratio_info_present_flag; + unsigned int aspect_ratio_idc; + + aspect_ratio_info_present_flag= get_bits1(gb); + + if( aspect_ratio_info_present_flag ) { + aspect_ratio_idc= get_bits(gb, 8); + if( aspect_ratio_idc == EXTENDED_SAR ) { + sps->num= get_bits(gb, 16); + sps->den= get_bits(gb, 16); + }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(int[2])){ + //sps->sar= pixel_aspect[aspect_ratio_idc]; + }else{ + av_log( AV_LOG_ERROR, "illegal aspect ratio idc %d\n", aspect_ratio_idc); + // return -1; + } + }else{ + sps->num= + sps->den= 0; + } + + if(get_bits1(gb)){ /* overscan_info_present_flag */ + get_bits1(gb); /* overscan_appropriate_flag */ + } + + sps->video_signal_type_present_flag = get_bits1(gb); + if(sps->video_signal_type_present_flag){ + get_bits(gb, 3); /* video_format */ + sps->full_range = get_bits1(gb); /* video_full_range_flag */ + + sps->colour_description_present_flag = get_bits1(gb); + if(sps->colour_description_present_flag){ + sps->color_primaries = get_bits(gb, 8); /* colour_primaries */ + sps->color_trc = get_bits(gb, 8); /* transfer_characteristics */ + sps->colorspace = get_bits(gb, 8); /* matrix_coefficients */ + if (sps->color_primaries >= AVCOL_PRI_NB) + sps->color_primaries = AVCOL_PRI_UNSPECIFIED; + if (sps->color_trc >= AVCOL_TRC_NB) + sps->color_trc = AVCOL_TRC_UNSPECIFIED; + if (sps->colorspace >= AVCOL_SPC_NB) + sps->colorspace = AVCOL_SPC_UNSPECIFIED; + } + } + + if(get_bits1(gb)){ /* chroma_location_info_present_flag */ + av_log(AV_LOG_ERROR, "chroma_location_info_present_flag found, but not supported\n"); + (void) (get_ue_golomb(gb)+1); /* chroma_sample_location_type_top_field */ + (void) 
get_ue_golomb(gb); /* chroma_sample_location_type_bottom_field */ + } + + sps->timing_info_present_flag = get_bits1(gb); + if(sps->timing_info_present_flag){ + sps->num_units_in_tick = get_bits_long(gb, 32); + sps->time_scale = get_bits_long(gb, 32); + if(!sps->num_units_in_tick || !sps->time_scale){ + av_log(AV_LOG_ERROR, "time_scale/num_units_in_tick invalid or unsupported (%d/%d)\n", sps->time_scale, sps->num_units_in_tick); + return -1; + } + sps->fixed_frame_rate_flag = get_bits1(gb); + } + + sps->nal_hrd_parameters_present_flag = get_bits1(gb); + if(sps->nal_hrd_parameters_present_flag) + if(decode_hrd_parameters(gb, sps) < 0) + return -1; + sps->vcl_hrd_parameters_present_flag = get_bits1(gb); + if(sps->vcl_hrd_parameters_present_flag) + if(decode_hrd_parameters(gb, sps) < 0) + return -1; + if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag) + get_bits1(gb); /* low_delay_hrd_flag */ + sps->pic_struct_present_flag = get_bits1(gb); + + sps->bitstream_restriction_flag = get_bits1(gb); + if(sps->bitstream_restriction_flag){ + get_bits1(gb); /* motion_vectors_over_pic_boundaries_flag */ + get_ue_golomb(gb); /* max_bytes_per_pic_denom */ + get_ue_golomb(gb); /* max_bits_per_mb_denom */ + get_ue_golomb(gb); /* log2_max_mv_length_horizontal */ + get_ue_golomb(gb); /* log2_max_mv_length_vertical */ + sps->num_reorder_frames= get_ue_golomb(gb); + get_ue_golomb(gb); /*max_dec_frame_buffering*/ + + if(sps->num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){ + av_log(AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames); + return -1; + } + } + + return 0; +} + +static void decode_scaling_list(GetBitContext *gb, uint8_t *factors, int size, const uint8_t *jvt_list, const uint8_t *fallback_list){ + int i, last = 8, next = 8; + const uint8_t *scan = size == 16 ? 
zigzag_scan : ff_zigzag_direct; + if(!get_bits1(gb)) /* matrix not written, we use the predicted one */ + memcpy(factors, fallback_list, size*sizeof(uint8_t)); + else + for(i=0;iscaling_matrix_present; + const uint8_t *fallback[4] = { + fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0], + fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1], + fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0], + fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1] + }; + if(get_bits1(gb)){ + sps->scaling_matrix_present |= is_sps; + decode_scaling_list(gb, scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y + decode_scaling_list(gb, scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr + decode_scaling_list(gb, scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb + decode_scaling_list(gb, scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y + decode_scaling_list(gb, scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr + decode_scaling_list(gb, scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb + if(is_sps || pps->transform_8x8_mode){ + decode_scaling_list(gb, scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y + decode_scaling_list(gb, scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y + } + } +} + +int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb){ + int profile_idc, level_idc; + unsigned int sps_id; + int i; + SPS *sps; + + profile_idc= get_bits(gb, 8); + get_bits1(gb); //constraint_set0_flag + get_bits1(gb); //constraint_set1_flag + get_bits1(gb); //constraint_set2_flag + get_bits1(gb); //constraint_set3_flag + get_bits(gb, 4); // reserved + level_idc= get_bits(gb, 8); + sps_id= get_ue_golomb_31(gb); + + if(sps_id >= MAX_SPS_COUNT) { + av_log(AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id); + return -1; + } + if (!n->sps_buffers[sps_id]) + n->sps_buffers[sps_id]= 
av_mallocz(sizeof(SPS)); + + sps = n->sps_buffers[sps_id]; + if(sps == NULL) + return -1; + + sps->profile_idc= profile_idc; + sps->level_idc= level_idc; + + memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4)); + memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8)); + sps->scaling_matrix_present = 0; + + if(sps->profile_idc >= 100){ //high profile + sps->chroma_format_idc= get_ue_golomb_31(gb); + if(sps->chroma_format_idc == 3) + sps->residual_color_transform_flag = get_bits1(gb); + sps->bit_depth_luma = get_ue_golomb(gb) + 8; + sps->bit_depth_chroma = get_ue_golomb(gb) + 8; + sps->transform_bypass = get_bits1(gb); + decode_scaling_matrices(gb, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8); + }else{ + sps->chroma_format_idc= 1; + sps->bit_depth_luma = 8; + sps->bit_depth_chroma = 8; + } + + sps->log2_max_frame_num= get_ue_golomb(gb) + 4; + sps->poc_type= get_ue_golomb_31(gb); + + if(sps->poc_type == 0){ //FIXME #define + sps->log2_max_poc_lsb= get_ue_golomb(gb) + 4; + } else if(sps->poc_type == 1){//FIXME #define + sps->delta_pic_order_always_zero_flag= get_bits1(gb); + sps->offset_for_non_ref_pic= get_se_golomb(gb); + sps->offset_for_top_to_bottom_field= get_se_golomb(gb); + sps->poc_cycle_length = get_ue_golomb(gb); + + if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){ + av_log(AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length); + goto fail; + } + + for(i=0; ipoc_cycle_length; i++) + sps->offset_for_ref_frame[i]= get_se_golomb(gb); + }else if(sps->poc_type != 2){ + av_log(AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type); + goto fail; + } + + sps->ref_frame_count= get_ue_golomb_31(gb); + if(sps->ref_frame_count >= 32){ + av_log(AV_LOG_ERROR, "too many reference frames\n"); + goto fail; + } + sps->gaps_in_frame_num_allowed_flag= get_bits1(gb); + sps->mb_width = get_ue_golomb(gb) + 1; + sps->mb_height= get_ue_golomb(gb) + 1; + + + sps->frame_mbs_only_flag= get_bits1(gb); + 
if(!sps->frame_mbs_only_flag){ + av_log(AV_LOG_ERROR, "MBAFF support not included\n"); + get_bits1(gb); + }else + sps->mb_aff= 0; + + sps->direct_8x8_inference_flag= get_bits1(gb); + if(!sps->frame_mbs_only_flag && !sps->direct_8x8_inference_flag){ + av_log(AV_LOG_ERROR, "This stream was generated by a broken encoder, invalid 8x8 inference\n"); + goto fail; + } + + sps->crop= get_bits1(gb); + if(sps->crop){ + sps->crop_left = get_ue_golomb(gb); + sps->crop_right = get_ue_golomb(gb); + sps->crop_top = get_ue_golomb(gb); + sps->crop_bottom= get_ue_golomb(gb); + if(sps->crop_left || sps->crop_top){ + av_log( AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n"); + } + if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){ + av_log( AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n"); + } + }else { + + sps->crop_left = + sps->crop_right = + sps->crop_top = + sps->crop_bottom= 0; + } + + sps->vui_parameters_present_flag= get_bits1(gb); + if( sps->vui_parameters_present_flag ) + if (decode_vui_parameters(gb, sps) < 0) + goto fail; + + + n->sps = *sps; + + if( sps->bitstream_restriction_flag){ + n->has_b_frames = sps->num_reorder_frames; + } + else + n->has_b_frames= MAX_DELAYED_PIC_COUNT; + + return 0; +fail: + av_free(sps); + return -1; +} + +static void +build_qp_table(PPS *pps, int t, int index) +{ + int i; + for(i = 0; i < 52; i++) + pps->chroma_qp_table[t][i] = ff_h264_chroma_qp[av_clip(i + index, 0, 51)]; +} + +int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length){ + unsigned int pps_id= get_ue_golomb(gb); + PPS *pps; + + if(pps_id >= MAX_PPS_COUNT) { + av_log(AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id); + return -1; + } + if (!n->pps_buffers[pps_id]) + n->pps_buffers[pps_id]= av_mallocz(sizeof(PPS)); + pps = n->pps_buffers[pps_id]; + if(pps == NULL) + return -1; + pps->sps_id= get_ue_golomb_31(gb); + 
if((unsigned)pps->sps_id>=MAX_SPS_COUNT || n->sps_buffers[pps->sps_id] == NULL){ + av_log(AV_LOG_ERROR, "sps_id out of range\n"); + goto fail; + } + + pps->cabac= get_bits1(gb); + pps->pic_order_present= get_bits1(gb); + if(pps->pic_order_present){ + av_log(AV_LOG_ERROR, "no interlaces support\n"); + } + pps->slice_group_count= get_ue_golomb(gb) + 1; + if(pps->slice_group_count > 1 ){ + pps->mb_slice_group_map_type= get_ue_golomb(gb); + av_log(AV_LOG_ERROR, "multiple slices not supported\n"); + } + pps->ref_count[0]= get_ue_golomb(gb) + 1; + pps->ref_count[1]= get_ue_golomb(gb) + 1; + if(pps->ref_count[0]> 32 || pps->ref_count[1]> 32){ + av_log(AV_LOG_ERROR, "reference overflow (pps)\n"); + goto fail; + } + + pps->weighted_pred= get_bits1(gb); + pps->weighted_bipred_idc= get_bits(gb, 2); + pps->init_qp= get_se_golomb(gb) + 26; + pps->init_qs= get_se_golomb(gb) + 26; + pps->chroma_qp_index_offset[0]= get_se_golomb(gb); + pps->deblocking_filter_parameters_present= get_bits1(gb); + pps->constrained_intra_pred= get_bits1(gb); + pps->redundant_pic_cnt_present = get_bits1(gb); + + pps->transform_8x8_mode= 0; + memcpy(pps->scaling_matrix4, n->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4)); + memcpy(pps->scaling_matrix8, n->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8)); + + if(get_bits_count(gb) < bit_length){ + pps->transform_8x8_mode= get_bits1(gb); + decode_scaling_matrices(gb, n->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8); + pps->chroma_qp_index_offset[1]= get_se_golomb(gb); //second_chroma_qp_index_offset + } else { + pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0]; + } + + build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]); + build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]); + if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) + pps->chroma_qp_diff= 1; + + return 0; +fail: + av_free(pps); + return -1; +} diff -r 11d15c47beaf -r 
897f711a7157 libavcodec/h264_ps.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_ps.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,9 @@ +#ifndef H264_PS_H +#define H264_PS_H + +#include "h264_types.h" + +int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb); +int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pthread.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_pthread.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,604 @@ +#include "config.h" + +#include "h264_types.h" +#include "h264_parser.h" +#include "h264_nal.h" +#include "h264_entropy.h" +#include "h264_rec.h" +#include "h264_misc.h" +// #undef NDEBUG +#include +#include + +#define XOANON 1 + +#ifdef XOANON +static int ed_rec_affinity[40] = { 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, + 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, + 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39 }; +static int ed_rec_smt_aff[80] = { 0, 40, 4, 44, 8, 48, 12, 52, 16, 56, 20, 60, 24, 64, 28, 68, 32, 72, 36, 76, + 1, 41, 5, 45, 9, 49, 13, 53, 17, 57, 21, 61, 25, 65, 29, 69, 33, 73, 37, 77, + 2, 42, 6, 46, 10, 50, 14, 54, 18, 58, 22, 62, 26, 66, 30, 70, 34, 74, 38, 78, + 3, 43, 7, 47, 11, 51, 15, 55, 19, 59, 23, 63, 27, 67, 31, 71, 35, 75, 39, 79 }; +#else +static int ed_rec_affinity[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; +static int ed_rec_smt_aff[20] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, }; +#endif + +static int frames=0; + +static void notify_one_worker(H264Context *h){ + pthread_mutex_lock(&h->task_lock); + pthread_cond_signal(&h->task_cond); + pthread_mutex_unlock(&h->task_lock); +} + +static void notify_all_workers(H264Context *h){ + pthread_mutex_lock(&h->task_lock); + pthread_cond_broadcast(&h->task_cond); + pthread_mutex_unlock(&h->task_lock); +} + +static void push_sbe (SliceBufferQueue *sbq, SliceBufferEntry *sbe, int notify ){ + 
pthread_mutex_lock(&sbq->lock); + while (sbq->cnt >= sbq->size) + pthread_cond_wait(&sbq->cond, &sbq->lock); + sbq->queue[sbq->fi] = sbe; + sbq->cnt++; + sbq->fi++; sbq->fi %= sbq->size; + if (notify) + pthread_cond_signal(&sbq->cond); + pthread_mutex_unlock(&sbq->lock); +} + +static SliceBufferEntry* pop_sbe (SliceBufferQueue *sbq, int block){ + SliceBufferEntry *sbe=NULL; + + pthread_mutex_lock(&sbq->lock); + if (block){ + while (sbq->cnt <= 0) + pthread_cond_wait(&sbq->cond, &sbq->lock); + }else { + if (sbq->cnt <= 0) + goto nonblock; + } + sbe = sbq->queue[sbq->fo]; + sbq->cnt--; + sbq->fo++; sbq->fo %= sbq->size; + pthread_cond_signal(&sbq->cond); +nonblock: + pthread_mutex_unlock(&sbq->lock); + + return sbe; +} + +// static void push_rle (RingLineQueue *rlq, SliceBufferEntry *sbe, int line, int notify){ +// +// //check for free slots +// pthread_mutex_lock(&rlq->wslock); +// while (rlq->free <= 0){ +// pthread_cond_wait(&rlq->wscond, &rlq->wslock); +// } +// //free slot is available, decrement one in this lock +// rlq->free--; +// pthread_mutex_unlock(&rlq->wslock); +// +// pthread_mutex_lock(&rlq->swlock); +// rlq->queue[rlq->fi]->sbe=sbe; +// rlq->queue[rlq->fi]->line=line; +// rlq->queue[rlq->fi]->mb_cnt=0; +// rlq->fi++; rlq->fi %= rlq->size; +// rlq->ready++; +// if(notify) +// pthread_cond_signal(&rlq->swcond); +// pthread_mutex_unlock(&rlq->swlock); +// } + +// static RingLineEntry* pop_rle (RingLineQueue *rlq, int block){ +// RingLineEntry *rle=NULL; +// +// pthread_mutex_lock(&rlq->swlock); +// if (block){ +// while (rlq->ready <= 0) +// pthread_cond_wait(&rlq->swcond, &rlq->swlock); +// }else { +// if (rlq->ready <= 0) +// goto nonblock; +// } +// rle = rlq->queue[rlq->fo]; +// rlq->fo++; rlq->fo %= rlq->size; +// rlq->ready--; +// nonblock: +// pthread_mutex_unlock(&rlq->swlock); +// +// return rle; +// } +// +// static void rel_rle (RingLineQueue *rlq){ +// pthread_mutex_lock(&rlq->wslock); +// rlq->free++; +// pthread_cond_signal(&rlq->wscond); 
+// pthread_mutex_unlock(&rlq->wslock); +// } + +static RingLineEntry* pop_rle (SliceBufferQueue *sbq, RingLineQueue *rlq, int *has_token){ + RingLineEntry *rle=NULL; + SliceBufferEntry *sbe=NULL; + int line=-1; + + pthread_mutex_lock(&sbq->lock); + if (sbq->cnt <= 0) + goto unlock; + sbe = sbq->queue[sbq->fo]; + line = sbe->lines_taken; + + + pthread_mutex_lock(&rlq->swlock); + if (!*has_token){ + if (rlq->free <= 0) + goto unlock2; + rlq->free--; + *has_token=1; + } + rle = rlq->queue[rlq->fo]; + rlq->fo++; rlq->fo %= rlq->size; + rle->sbe=sbe; + rle->line = line; + rle->mb_cnt =0; + if (++sbe->lines_taken >= sbe->lines_total){ + sbq->cnt--; + sbq->fo++; sbq->fo %= sbq->size; + pthread_cond_signal(&sbq->cond); + } +unlock2: + pthread_mutex_unlock(&rlq->swlock); +unlock: + pthread_mutex_unlock(&sbq->lock); + + + return rle; +} + +static void rel_rle (RingLineQueue *rlq, int *rec_token){ + pthread_mutex_lock(&rlq->swlock); + rlq->free++; + *rec_token=0; +// pthread_cond_signal(&rlq->swcond); + pthread_mutex_unlock(&rlq->swlock); + +} + +//get either a entropy or a line reconstruct task +static void pop_next_task(H264Context *h, SliceBufferEntry **psbe, RingLineEntry **prle, int *rec_token){ + + pthread_mutex_lock(&h->task_lock); + + for(;;){ + if ( (*psbe = pop_sbe(&h->sb_q[ENTROPY], 0)) ){ + if (*rec_token){ + rel_rle(&h->rl_q, rec_token); + pthread_cond_signal(&h->task_cond); + } + break; + } + else if ( (*prle = pop_rle(&h->sb_q[MBDEC], &h->rl_q, rec_token)) ) + break; + pthread_cond_wait(&h->task_cond, &h->task_lock); + } + + pthread_mutex_unlock(&h->task_lock); +} + +void *parse_thread(void *arg){ + H264Context *h = (H264Context *) arg; + ParserContext *pc = get_parse_context(h->ifile); + NalContext *nc = get_nal_context(h->width, h->height); + H264Slice *s; + SliceBufferEntry *sbe = NULL; + + while(!pc->final_frame && frames++ num_frames && !h->quit){ + sbe = get_sb_entry(h); + + av_read_frame_internal(pc, &sbe->gb); + s = &sbe->slice; + + 
decode_nal_units(nc, s, &sbe->gb); + + push_sbe(&h->sb_q[ENTROPY], sbe, 0); + notify_one_worker(h); + } + + if (!h->no_mbd){ + sbe = get_sb_entry(h); + sbe->state=-1; + sbe->slice.coded_pic_num=nc->coded_pic_num; + sbe->lines_total=h->threads; + + push_sbe(&h->sb_q[REORDER], sbe, 1); + }else{ + for (int i=0; ithreads; i++){ + sbe = get_sb_entry(h); + sbe->state=-1; + push_sbe(&h->sb_q[ENTROPY], sbe, 1); + notify_one_worker(h); + } + } + free_nal_context(nc); + free_parse_context(pc); + + pthread_exit(NULL); + return NULL; +} + +int decode_slice_entropy(EntropyContext *ec, SliceBufferEntry *sbe){ + int i,j; + H264Slice *s = &sbe->slice; + GetBitContext *gb = &sbe->gb; + CABACContext *c = &ec->c; + H264Mb *mbs = sbe->mbs; + + if( !s->pps.cabac ){ + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); + return -1; + } + + init_dequant_tables(s, ec); + ec->curr_qscale = s->qscale; + ec->last_qscale_diff = 0; + ec->chroma_qp[0] = get_chroma_qp( s, 0, s->qscale); + ec->chroma_qp[1] = get_chroma_qp( s, 1, s->qscale); + + /* realign */ + align_get_bits( gb ); + /* init cabac */ + ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8); + + ff_h264_init_cabac_states(ec, s, c); + + for(j=0; jmb_height; j++){ + init_entropy_buf(ec, s, j); + for(i=0; imb_width; i++){ + int eos,ret; + H264Mb *m = &mbs[i + j*ec->mb_width]; + //memset(m, 0, sizeof(H264Mb)); + m->mb_x=i; + m->mb_y=j; + ec->m = m; + + ret = ff_h264_decode_mb_cabac(ec, s, c); + eos = get_cabac_terminate( c); (void) eos; + + if( ret < 0 || c->bytestream > c->bytestream_end + 2) { + av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream); + return -1; + } + } + } + + return 0; +} + +static int decode_slice_mb(MBRecContext *d, RingLineEntry *rle, int frames){ + SliceBufferEntry *sbe= rle->sbe; + H264Slice *s = &sbe->slice; + H264Mb *mbs = sbe->mbs; + + int mb_width= d->mb_width; + int i; + const int 
line = rle->line; + + init_mbrec_context(d, d->mrs, s, line); + + H264Mb *m = &mbs[line*mb_width]; + d->top=rle->prev_line->top; + d->top_next=rle->top; + +// assert(rle->mb_cnt ==0); + for(i=0; i< mb_width; i++){ + if (frames || line>0){ + while (rle->mb_cnt >= rle->prev_line->mb_cnt -1); + } + h264_decode_mb_internal( d, d->mrs, s, &m[i]); + rle->mb_cnt++; + } + draw_edges(d, s, line); + + return 0; +} + +// static int decode_slice_mb_static(MBRecContext *d, H264Slice *s, RLThreadContext *r, RLThreadContext *rp, int frames){ +// int mb_height= d->mb_height; +// int mb_width= d->mb_width; +// int thread_num = r->thread_num; +// int thread_total = r->thread_total; +// int i; +// int j = thread_num; +// +// r->mb_cnt=frames* mb_height*mb_width; +// for(; jmbs[j*mb_width]; +// for(i=0; i< mb_width; i++){ +// if (j>0){ +// while (r->mb_cnt- (thread_num? 0:mb_width) >= rp->mb_cnt-1); +// } +// h264_decode_mb_internal(d, s, m++); +// r->mb_cnt++; +// } +// draw_edges(d, s, j); +// } +// return 0; +// } + +static void *ed_rec_thread(void *arg){ + H264Context *h = (H264Context*) arg; + EntropyContext *ec=NULL; + MBRecContext *mrc=NULL; + + RingLineEntry *rle=NULL; + SliceBufferEntry *sbe=NULL; + H264Slice *s; + int rec_token=0; + + if (!h->no_mbd){ + mrc = get_mbrec_context(h); + } + ec = get_entropy_context(h); + + for(;;){ + pop_next_task(h, &sbe, &rle, &rec_token); + if (sbe){ + if (h->no_mbd && sbe->state<0){ + break; + } + if (!sbe->initialized){ + init_sb_entry(h, sbe); + } + decode_slice_entropy(ec, sbe); + + if (h->no_mbd){ + release_sb_entry(h, sbe); + sbe=NULL; + } else { + push_sbe(&h->sb_q[REORDER], sbe, 1); + } + } else if (rle){ + if (rle->sbe->state<0) + break; + s = &rle->sbe->slice; + + decode_slice_mb(mrc, rle, s->coded_pic_num); + + if (rle->line == h->mb_height-1){ + push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1); + } + rle->mb_cnt++; + } + } + + //make sure threads quit in order of rle assignment + if (!h->no_mbd){ + while (rle->prev_line->mb_cnt <= 
h->mb_width); + rel_rle(&h->rl_q, &rec_token); + notify_one_worker(h); + rle->mb_cnt = h->mb_width +1; + if (rle->line == h->threads-1){ + push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1); + } + + free_mbrec_context(mrc); + } + + free_entropy_context(ec); + + pthread_exit(NULL); + return NULL; +} + +static void *reorder_thread(void *arg){ + H264Context *h = (H264Context *) arg; + int i; + SliceBufferEntry *reorder[h->sb_size]; + SliceBufferEntry *sbe, *next_sbe; + H264Slice *s; + int reorder_cnt=0; + unsigned next_pic_num=0; + + for(;;){ + + sbe = pop_sbe(&h->sb_q[REORDER], 1); + + s = &sbe->slice; + for(i=reorder_cnt; i>0; i--){ + if (s->coded_pic_num < reorder[i-1]->slice.coded_pic_num) + break; + reorder[i]=reorder[i-1]; + } + reorder[i]=sbe; + + while(reorder_cnt>=0){ + if (next_pic_num!=reorder[reorder_cnt]->slice.coded_pic_num){ + break; + } + next_sbe = reorder[reorder_cnt]; + H264Slice *es = &next_sbe->slice; + + if (next_sbe->state<0) + goto end; + + for (int i=0; i<2; i++){ + for(int j=0; j< es->ref_count[i]; j++){ + if (es->ref_list_cpn[i][j] ==-1) + continue; + int k; + for (k=0; kmax_dpb_cnt; k++){ + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == es->ref_list_cpn[i][j]){ + es->dp_ref_list[i][j] = &h->dpb[k]; + break; + } + } + } + } + next_sbe->dp = get_dpb_entry(h, es); + + push_sbe(&h->sb_q[MBDEC], next_sbe, 0); + notify_all_workers(h); + +// for (int i=0; i< h->mb_height; i++){ +// push_rle(&h->rl_q, next_sbe, i, 0); +// notify_one_worker(h); +// } + + + next_pic_num++; + reorder_cnt--; + } + reorder_cnt++; + } + +end: + { + push_sbe(&h->sb_q[MBDEC], next_sbe, 0); + notify_all_workers(h); + if (h->no_mbd){ + push_sbe(&h->sb_q[OUTPUT], next_sbe, 1); + } +// for (int i=0; i< h->threads; i++){ +// push_rle(&h->rl_q, next_sbe, i, 0); +// notify_one_worker(h); +// } + } + + pthread_exit(NULL); + return NULL; +} + +void create_ed_rec_threads(H264Context *h){ + cpu_set_t cpuset; + int* aff; + + if (h->setaff){ + aff = h->smt ? 
ed_rec_smt_aff : ed_rec_affinity ; + for (int i=0; ithreads; i++){ + pthread_attr_init(&h->ed_rec_attr[i]); + CPU_ZERO(&cpuset); + CPU_SET(aff[i], &cpuset); + pthread_attr_setaffinity_np(&h->ed_rec_attr[i], sizeof(cpu_set_t), &cpuset); + pthread_create(&h->ed_rec_thr[i], &h->ed_rec_attr[i], ed_rec_thread, h); + } + } else { + for (int i=0; ithreads; i++){ + pthread_create(&h->ed_rec_thr[i], NULL, ed_rec_thread, h); + } + } +} + +void join_ed_rec_threads(H264Context *h){ + for (int i=0; i< h->threads; i++){ + pthread_join(h->ed_rec_thr[i], NULL); + } +} + +void *output_thread(void *arg){ + H264Context *h = (H264Context *) arg; + + OutputContext *oc = get_output_context( h ); + + SliceBufferEntry *sbe = NULL; + H264Slice *s=NULL; + for(;;) { + DecodedPicture *out, *dp; + sbe = pop_sbe(&h->sb_q[OUTPUT], 1); + + if (sbe->state <0) + break; + + s = &sbe->slice; + for (int i=0; irelease_cnt; i++){ + for(int j=0; jmax_dpb_cnt; j++){ + if(h->dpb[j].cpn== s->release_ref_cpn[i]){ + release_dpb_entry(h, &h->dpb[j], 2); + break; + } + } + } + + dp=sbe->dp; + release_sb_entry(h, sbe); + + out =output_frame(h, oc, dp, h->ofile, h->frame_width, h->frame_height); + if (out){ + release_dpb_entry(h, out, 1); + } + + print_report(oc->frame_number, oc->video_size, 0, h->verbose); + + } + /* at the end of stream, we must flush the decoder buffers */ + while (output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height)); + print_report(oc->frame_number, oc->video_size, 1, h->verbose); + + free_output_context(oc); + + pthread_exit(NULL); + return NULL; +} + +/* +* The following code is the main loop of the file converter +*/ +int h264_decode_pthread(H264Context *h) { + pthread_t parse_thr, reorder_thr, output_thr; + + av_start_timer(); + + pthread_create(&parse_thr, NULL, parse_thread, h); + if (!h->no_mbd){ + pthread_create(&reorder_thr, NULL, reorder_thread, h); + pthread_create(&output_thr, NULL, output_thread, h); + } +#if HAVE_LIBSDL2 + pthread_t sdl_thr; + if (h->display){ 
+ pthread_create(&sdl_thr, NULL, sdl_thread, h); + } +#endif + create_ed_rec_threads(h); + + + if (h->rl_side_touch){ + pthread_mutex_lock(&h->ilock); + while (h->init_threads< h->threads) + pthread_cond_wait(&h->icond, &h->ilock); + pthread_mutex_unlock(&h->ilock); + + pthread_mutex_lock(&h->tlock); + h->touch_start =1; + pthread_cond_broadcast(&h->tcond); + pthread_mutex_unlock(&h->tlock); + + pthread_mutex_lock(&h->tdlock); + while (h->touch_done < h->threads) + pthread_cond_wait(&h->tdcond, &h->tdlock); + pthread_mutex_unlock(&h->tdlock); + + pthread_mutex_lock(&h->slock); + h->start =1; + pthread_cond_broadcast(&h->scond); + pthread_mutex_unlock(&h->slock); + } + join_ed_rec_threads(h); + pthread_join(parse_thr, NULL); + if (!h->no_mbd){ + pthread_join(reorder_thr, NULL); + pthread_join(output_thr, NULL); + } +#if HAVE_LIBSDL2 + if (h->display) + signal_sdl_exit(h); + pthread_join(sdl_thr, NULL); +#endif + + + return 0; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_pthread.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_pthread.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,14 @@ +#ifndef H264_PTHREAD_H +#define H264_PTHREAD_H + +#include "h264_types.h" + +int decode_B_slice_entropy(EntropyContext *ec, EDSlice *s, EDThreadContext *eb, EDThreadContext *eb_prev); +int decode_slice_entropy(EntropyContext *hc, EDSlice *s); + +void *read_thread(void *arg); +void *parsenal_thread(void *arg); +void *mbrec_thread(void *arg); +void *write_thread(void *arg); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_rec.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_rec.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,412 @@ +#include "config.h" + +#include "dsputil.h" +#include "h264_types.h" +#include "h264_data.h" +#include "h264_mc.h" +#include "h264_deblock.h" +#include "h264_pred_mode.h" +//#undef NDEBUG +#include + +void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line){ + 
DecodedPicture *pic = s->curr_pic; + int mb_stride = mrc->mb_stride; + int mb_width = mrc->mb_width; + mrs->mb_type_top = pic->mb_type + (line -1)*mb_stride; + mrs->mb_type = pic->mb_type + line*mb_stride; + mrs->ref_index_top[0] = pic->ref_index[0] + 4*(line -1)*mb_stride; + mrs->ref_index_top[1] = pic->ref_index[1] + 4*(line -1)*mb_stride; + mrs->ref_index[0] = pic->ref_index[0] + 4*line*mb_stride; + mrs->ref_index[1] = pic->ref_index[1] + 4*line*mb_stride; + + mrs->motion_val_top[0] = pic->motion_val[0] + 4*mb_width*4*(line-1); + mrs->motion_val_top[1] = pic->motion_val[1] + 4*mb_width*4*(line-1); + mrs->motion_val[0] = pic->motion_val[0] + 4*mb_width*4*line; + mrs->motion_val[1] = pic->motion_val[1] + 4*mb_width*4*line; + + mrs->intra4x4_pred_mode_top = pic->intra4x4_pred_mode + 4*mb_width*(line-1); + mrs->intra4x4_pred_mode = pic->intra4x4_pred_mode + 4*mb_width*line; + + mrs->non_zero_count_top = pic->non_zero_count + 8*mb_width*(line-1); + mrs->non_zero_count = pic->non_zero_count + 8*mb_width*line; + + if (s->slice_type_nos == FF_B_TYPE){ + mrs->list1_mb_type = s->dp_ref_list[1][0]->mb_type + line*mb_stride; + mrs->list1_ref_index[0] = s->dp_ref_list[1][0]->ref_index[0] + 4*line*mb_stride; + mrs->list1_ref_index[1] = s->dp_ref_list[1][0]->ref_index[1] + 4*line*mb_stride; + mrs->list1_motion_val[0] = s->dp_ref_list[1][0]->motion_val[0] + 4*mb_width*4*line; + mrs->list1_motion_val[1] = s->dp_ref_list[1][0]->motion_val[1] + 4*mb_width*4*line; + } + +} + +#if OMPSS +static void backup_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ + int i; + uint8_t * top_border_y1 = m->top_border; + uint8_t * top_border_y2 = m->top_border + 8; + uint8_t * top_border_cb = m->top_border + 16; + uint8_t * top_border_cr = m->top_border + 24; + uint8_t * top_border_next = m->top_border_next; + + src_y -= linesize; + src_cb -= uvlinesize; + src_cr -= uvlinesize; + + m->left_border[0]= m->top_border[15]; + for(i=1; i<17 ; i++){ + 
m->left_border[i]= src_y[15 + i*linesize]; + } + + *(uint64_t*)(top_border_y1) = *(uint64_t*)(src_y + 16*linesize); + *(uint64_t*)(top_border_next) = *(uint64_t*)(src_y + 16*linesize); + *(uint64_t*)(top_border_y2) = *(uint64_t*)(src_y +8+16*linesize); + + m->left_border[17]= m->top_border[16+7]; + m->left_border[17+9]= m->top_border[24+7]; + for(i=1; i<9; i++){ + m->left_border[17 +i]= src_cb[7+i*uvlinesize]; + m->left_border[17+9+i]= src_cr[7+i*uvlinesize]; + } + *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize); + *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize); +} + +static void xchg_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ + int temp8, i; + uint64_t temp64; + + uint8_t * top_border_y1 = m->top_border; + uint8_t * top_border_y2 = m->top_border + 8; + uint8_t * top_border_cb = m->top_border + 16; + uint8_t * top_border_cr = m->top_border + 24; + uint8_t * top_border_next = m->top_border_next; + + int deblock_left; + int deblock_top; + + deblock_left = (m->mb_x > 0); + deblock_top = (m->mb_y > 0); + + src_y -= ( linesize + 1); + src_cb -= (uvlinesize + 1); + src_cr -= (uvlinesize + 1); + + #define XCHG(a,b,t,xchg)\ + t= a;\ + if(xchg)\ + a= b;\ + b= t; + + if(deblock_left){ + for(i = !deblock_top; i<16; i++){ + XCHG(m->left_border[i], src_y [i* linesize], temp8, xchg); + } + XCHG(m->left_border[i], src_y [i* linesize], temp8, 1); + + for(i = !deblock_top; i<8; i++){ + XCHG(m->left_border[17 +i], src_cb[i*uvlinesize], temp8, xchg); + XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, xchg); + } + XCHG(m->left_border[17 +i], src_cb[i*uvlinesize], temp8, 1); + XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, 1); + } + + if(deblock_top){ + XCHG(*(uint64_t*)(top_border_y1) , *(uint64_t*)(src_y +1), temp64, xchg); + XCHG(*(uint64_t*)(top_border_y2) , *(uint64_t*)(src_y +9), temp64, 1); + XCHG(*(uint64_t*)(top_border_next), *(uint64_t*)(src_y +17), temp64, 
1); + + XCHG(*(uint64_t*)(top_border_cb) , *(uint64_t*)(src_cb+1), temp64, 1); + XCHG(*(uint64_t*)(top_border_cr) , *(uint64_t*)(src_cr+1), temp64, 1); + } +} +#else + +static void backup_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ + int i; + uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y; + uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb; + uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr; + + uint8_t* left_border_y = d->left.unfiltered_y; + uint8_t* left_border_cb = d->left.unfiltered_cb; + uint8_t* left_border_cr = d->left.unfiltered_cr; + + src_y -= linesize; + src_cb -= uvlinesize; + src_cr -= uvlinesize; + + // There are two lines saved, the line above the top macroblock of a pair, + // and the line above the bottom macroblock + left_border_y[0] = top_border_y[15]; + for(i=1; i<17; i++){ + left_border_y[i] = src_y[15+i* linesize]; + } + *(uint64_t*)(top_border_y ) = *(uint64_t*)(src_y + 16*linesize); + *(uint64_t*)(top_border_y +8) = *(uint64_t*)(src_y +8+16*linesize); + + left_border_cb[0] = top_border_cb[7]; + left_border_cr[0] = top_border_cr[7]; + for(i=1; i<9; i++){ + left_border_cb[i] = src_cb[7+i*uvlinesize]; + left_border_cr[i] = src_cr[7+i*uvlinesize]; + } + *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize); + *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize); +} + +static void xchg_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ + + int temp8, i; + uint64_t temp64; + int deblock_left; + int deblock_top; + + uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y; + uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb; + uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr; + uint8_t* top_border_y_next = d->top[m->mb_x +1].unfiltered_y; + + uint8_t* left_border_y = d->left.unfiltered_y; + uint8_t* left_border_cb = d->left.unfiltered_cb; + uint8_t* 
left_border_cr = d->left.unfiltered_cr; + + deblock_left = (m->mb_x > 0); + deblock_top = (m->mb_y > 0); + + src_y -= ( linesize + 1); + src_cb -= (uvlinesize + 1); + src_cr -= (uvlinesize + 1); + + #define XCHG(a,b,t,xchg)\ + t= a;\ + if(xchg)\ + a= b;\ + b= t; + + if(deblock_left){ + for(i = !deblock_top; i<16; i++){ + XCHG(left_border_y[i], src_y [i* linesize], temp8, xchg); + } + XCHG(left_border_y[i], src_y [i* linesize], temp8, 1); + + for(i = !deblock_top; i<8; i++){ + XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg); + XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg); + } + XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1); + XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1); + } + + if(deblock_top){ + XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg); + XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1); + if(m->mb_x+1 < d->mb_width){ + XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1); + } + XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1); + XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1); + } +} + +#endif + +void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m){ + int i; + const int mb_x= m->mb_x; + const int mb_y= m->mb_y; + int *block_offset = d->block_offset; + + void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); + void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); + + int linesize = d->linesize; + int uvlinesize = d->uvlinesize; + + uint8_t *dest_y = s->curr_pic->data[0] + (mb_x + mb_y * linesize ) * 16; + uint8_t *dest_cb = s->curr_pic->data[1] + (mb_x + mb_y * uvlinesize) * 8; + uint8_t *dest_cr = s->curr_pic->data[2] + (mb_x + mb_y * uvlinesize) * 8; + + pred_motion_mb_rec (d, mrs, s, m); + + const int mb_type= m->mb_type; + + d->dsp.prefetch(dest_y + (m->mb_x&3)*4*linesize + 64, d->linesize, 4); + d->dsp.prefetch(dest_cb + (m->mb_x&7)*uvlinesize + 64, 
dest_cr - dest_cb, 2); + + if(IS_INTRA(mb_type)){ +#if OMPSS + xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1); +#else + xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1); +#endif + + d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cb, uvlinesize); + d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cr, uvlinesize); + + if(IS_INTRA4x4(mb_type)){ + if(IS_8x8DCT(mb_type)){ + idct_dc_add = d->hdsp.h264_idct8_dc_add; + idct_add = d->hdsp.h264_idct8_add; + + for(i=0; i<16; i+=4){ + uint8_t * const ptr= dest_y + block_offset[i]; + const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ]; + + const int nnz = mrs->non_zero_count_cache[ scan8[i] ]; + d->hpc.pred8x8l[ dir ](ptr, (mrs->topleft_samples_available<topright_samples_available<mb[i*16]) + idct_dc_add(ptr, m->mb + i*16, linesize); + else + idct_add (ptr, m->mb + i*16, linesize); + } + } + }else{ + idct_dc_add = d->hdsp.h264_idct_dc_add; + idct_add = d->hdsp.h264_idct_add; + + for(i=0; i<16; i++){ + uint8_t * const ptr= dest_y + block_offset[i]; + const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ]; + uint8_t *topright; + int nnz, tr; + if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ + const int topright_avail= (mrs->topright_samples_available<hpc.pred4x4[ dir ](ptr, topright, linesize); + nnz = mrs->non_zero_count_cache[ scan8[i] ]; + if(nnz){ + if(nnz == 1 && m->mb[i*16]) + idct_dc_add(ptr, m->mb + i*16, linesize); + else + idct_add (ptr, m->mb + i*16, linesize); + } + } + } + }else{ + d->hpc.pred16x16[ m->intra16x16_pred_mode ](dest_y , linesize); + } +#if OMPSS + xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0); +#else + xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0); +#endif + }else { + hl_motion(d, mrs, s, m, dest_y, dest_cb, dest_cr, + d->hdsp.qpel_put, d->dsp.put_h264_chroma_pixels_tab, + d->hdsp.qpel_avg, d->dsp.avg_h264_chroma_pixels_tab, + d->hdsp.weight_h264_pixels_tab, d->hdsp.biweight_h264_pixels_tab); + } + + 
if(!IS_INTRA4x4(mb_type)){ + + if(IS_INTRA16x16(mb_type)){ + + d->hdsp.h264_idct_add16intra(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache); + + }else if(m->cbp&15){ + + if(IS_8x8DCT(mb_type)){ + d->hdsp.h264_idct8_add4(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache); + }else{ + d->hdsp.h264_idct_add16(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache); + } + } + } + + if(m->cbp&0x30){ + uint8_t *dest[2] = {dest_cb, dest_cr}; + + idct_add = d->hdsp.h264_idct_add; + idct_dc_add = d->hdsp.h264_idct_dc_add; + for(i=16; i<16+8; i++){ + if(mrs->non_zero_count_cache[ scan8[i] ]) + idct_add (dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize); + else if(m->mb[i*16]) + idct_dc_add(dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize); + } + } + +#if OMPSS + backup_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize); + if (mb_x+1 mb_width){ + H264Mb *mr = m+1; + memcpy(mr->left_border, m->left_border, sizeof(m->left_border)); + } + if (mb_y +1 mb_height){ + H264Mb *md = m + d->mb_width; + memcpy(md->top_border, m->top_border, sizeof(m->top_border)); + if (mb_x>0){ + H264Mb *mdl = m + d->mb_width -1; + memcpy(mdl->top_border_next, m->top_border_next, sizeof(m->top_border_next)); + } + } +#else + backup_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize); + if (mb_y +1 mb_height && d->top_next != d->top){ + memcpy(&d->top_next[mb_x],&d->top[mb_x], sizeof(TopBorder)); + } +#endif + + ff_h264_filter_mb(d, mrs, s, m, dest_y, dest_cb, dest_cr); +} + +MBRecContext *get_mbrec_context(H264Context *h){ + MBRecContext *d = av_mallocz(sizeof(MBRecContext)); + + ff_h264dsp_init(&d->hdsp); + ff_h264_pred_init(&d->hpc); + dsputil_init(&d->dsp); + +#if !OMPSS + d->mrs = av_mallocz(sizeof(MBRecState)); +#endif + d->hdsp.qpel_put= d->dsp.put_h264_qpel_pixels_tab; + d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab; + d->mb_height = h->mb_height; + d->mb_width = h->mb_width; + d->mb_stride = h->mb_stride; + 
d->b_stride = h->b_stride; + d->height = h->height; + d->width = h->width; + d->linesize = h->width + EDGE_WIDTH*2; + d->uvlinesize = d->linesize>>1; + + d->scratchpad_y = av_malloc(d->linesize*16*sizeof(uint8_t)); + d->scratchpad_cb= av_malloc(d->uvlinesize*8*sizeof(uint8_t)); + d->scratchpad_cr= av_malloc(d->uvlinesize*8*sizeof(uint8_t)); + + for (int i=0; i<16; i++){ + d->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*d->linesize*((scan8[i] - scan8[0])>>3); + } + for (int i=0; i<4; i++){ + d->block_offset[16+i]= + d->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*d->uvlinesize*((scan8[i] - scan8[0])>>3); + } + + + + return d; +} + +void free_mbrec_context(MBRecContext *d){ +#if !OMPSS + av_free(d->mrs); +#endif + av_free(d->scratchpad_y); + av_free(d->scratchpad_cb); + av_free(d->scratchpad_cr); + av_free(d); +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_rec.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_rec.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,12 @@ +#ifndef H264_REC_H +#define H264_REC_H + +#include "h264_types.h" + +MBRecContext *get_mbrec_context(H264Context *h); +void free_mbrec_context( MBRecContext *d); +void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m); + +void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_refs.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_refs.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,461 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... reference picture handling + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 reference picture handling. + * @author Michael Niedermayer + */ + +#include "dsputil.h" +#include "h264_types.h" +#include "golomb.h" + +//#undef NDEBUG +#include + +static int build_def_list(PictureInfo **def, PictureInfo **in, int len, int is_long){ + int i[2]={0}; + int index=0; + + while(i[0]reference))) + i[0]++; + while(i[1]reference & 0))) + i[1]++; + if(i[0] < len){ + in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num; + def[index++]= in[ i[0]++ ]; + } + if(i[1] < len){ + in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num; + def[index++]= in[ i[1]++ ]; + } + } + + return index; +} + +static int add_sorted(PictureInfo **sorted, PictureInfo **src, int len, int limit, int dir){ + int i, best_poc; + int out_i= 0; + + for(;;){ + best_poc= dir ? INT_MIN : INT_MAX; + + for(i=0; ipoc; + if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){ + best_poc= poc; + sorted[out_i]= src[i]; + } + } + if(best_poc == (dir ? 
INT_MIN : INT_MAX)) + break; + limit= sorted[out_i++]->poc - dir; + } + return out_i; +} + +int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s){ + int i,len; + + if(s->slice_type_nos==FF_B_TYPE){ + PictureInfo *sorted[32]; + int cur_poc, list; + int lens[2]; + + cur_poc= s->poc; + + for(list= 0; list<2; list++){ + len= add_sorted(sorted, n->short_ref, n->short_ref_count, cur_poc, !list); + len+=add_sorted(sorted+len, n->short_ref, n->short_ref_count, cur_poc, list); + assert(len<=32); + len= build_def_list(s->ref_list[list], sorted, len, 0); + len+=build_def_list(s->ref_list[list] +len, n->long_ref, 16 , 1); + assert(len<=32); + + for(int i=len; iref_count[list]; i++) + s->ref_list[list][i] = NULL; + + lens[list]= len; + } + + if(lens[0] == lens[1] && lens[1] > 1){ + for(i=0; s->ref_list[0][i]->poc == s->ref_list[1][i]->poc && iref_list[1][0], s->ref_list[1][1]); + } + }else{ + len = build_def_list(s->ref_list[0], n->short_ref, n->short_ref_count, 0); + len+= build_def_list(s->ref_list[0] +len, n->long_ref, 16, 1); + assert(len <= 32); + for(i=len; iref_count[0]; i++) + s->ref_list[0][i] = NULL; + } + + return 0; +} + +/** +* print short term list +*/ +static void print_short_term(NalContext *n) { + av_log(AV_LOG_DEBUG, "short term list:\n"); + for(int i=0; ishort_ref_count; i++){ + PictureInfo *pic= n->short_ref[i]; + av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d ref:%d \n", i, pic->frame_num, pic->poc, pic->reference); + } +} + +/** +* print long term list +*/ +static void print_long_term(NalContext *n) { + uint32_t i; + + av_log(AV_LOG_DEBUG, "long term list:\n"); + for(i = 0; i < 16; i++){ + PictureInfo *pic= n->long_ref[i]; + if (pic) { + av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d\n", i, pic->frame_num, pic->poc); + } + } +} + +int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb){ + int list, index; + + print_short_term(n); + print_long_term(n); + + for(list=0; listlist_count; list++){ + + if(get_bits1(gb)){ + int frame_num 
= n->frame_num; + unsigned int abs_diff_pic_num; + for(index=0; ; index++){ + unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(gb); + int i=0; + PictureInfo *ref = NULL; + + if(reordering_of_pic_nums_idc==3){ + break; + } + if(index >= s->ref_count[list]){ + av_log(AV_LOG_ERROR, "reference count overflow\n"); + return -1; + } + + if (reordering_of_pic_nums_idc>2){ + av_log(AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n"); + return -1; + } + + if (reordering_of_pic_nums_idc<2){ + //av_log(AV_LOG_ERROR, "long term pic not supported\n"); + + abs_diff_pic_num= get_ue_golomb(gb) + 1; + if(abs_diff_pic_num > (unsigned) n->max_pic_num){ + av_log(AV_LOG_ERROR, "abs_diff_pic_num overflow\n"); + return -1; + } + + if(reordering_of_pic_nums_idc == 0) + frame_num-= abs_diff_pic_num; + else + frame_num+= abs_diff_pic_num; + frame_num &= n->max_pic_num - 1; + + for(i= 0 ; ishort_ref_count; i++){ + ref = n->short_ref[i]; + if(ref->frame_num == frame_num && ref->reference){ + break; + } + } + ref->pic_id= frame_num; + }else{ + int long_idx; + long_idx= get_ue_golomb(gb); //long_term_pic_idx + + if(long_idx>31){ + av_log(AV_LOG_ERROR, "long_term_pic_idx overflow\n"); + return -1; + } + ref = n->long_ref[long_idx]; + assert(!(ref && !ref->reference)); + if(ref && (ref->reference)){ + ref->pic_id= long_idx; + assert(ref->long_ref); + }else{ + av_log(AV_LOG_ERROR, "reference picture missing during reorder\n"); + } + } + + if (i >= n->short_ref_count) { + av_log(AV_LOG_ERROR, "reference picture missing during reorder\n"); + return -1; + } else { + for(i=index; i+1 ref_count[list]; i++){ + +// if(ref->frame_num == s->ref_list[list][i]->frame_num) +// break; + ///there is probably no need for a separate pic_id and frame_num + if (s->ref_list[list][i]){ + + if(ref->long_ref == s->ref_list[list][i]->long_ref && ref->pic_id == s->ref_list[list][i]->pic_id) + break; + } + } + for(; i > index; i--){ + s->ref_list[list][i]= s->ref_list[list][i-1]; + } + s->ref_list[list][index]= 
ref; + } + } + } + } + +// //Check if everything went well +// for(list=0; listlist_count; list++){ +// //printf("ref_count %d list %d\n", s->ref_count[list], list); +// for(index= 0; index < s->ref_count[list]; index++){ +// //printf("%d\n", s->ref_list[list][index]->pic_id); +// if(!s->ref_list[list][index]->data[0]){ +// av_log(AV_LOG_ERROR, "Missing reference picture\n"); +// return -1; +// } +// } +// } + + return 0; +} + +static PictureInfo *find_short(NalContext *n, int frame_num){ + int i; + for(i=0; ishort_ref_count; i++){ + if(n->short_ref[i]->frame_num == frame_num) { + return n->short_ref[i]; + } + } + return NULL; +} + +static int remove_short(NalContext *n, H264Slice *s, int frame_num, int release){ + int i; + + for (i=0; ishort_ref_count; i++){ + if (n->short_ref[i]->frame_num == frame_num){ + if (release){ + s->release_ref_cpn[s->release_cnt++] = n->short_ref[i]->cpn; + n->short_ref[i]->reference &= ~2; + } + n->short_ref[i] = NULL; + if (--n->short_ref_count) + memmove(&n->short_ref[i], &n->short_ref[i+1], (n->short_ref_count - i)*sizeof(PictureInfo *)); + return 0; + } + } + return -1; +} + +static void remove_long(NalContext *n, H264Slice *s, int i){ + + if (n->long_ref[i]){ + s->release_ref_cpn[s->release_cnt++] = n->long_ref[i]->cpn; + n->long_ref[i]->reference &= ~2; + n->long_ref[i]->long_ref = 0; + n->long_ref_count--; + n->long_ref[i] = NULL; + } +} + +void ff_h264_remove_all_refs(NalContext *n, H264Slice *s){ + int i; + + while (n->short_ref[0]) + remove_short(n, s, n->short_ref[0]->frame_num, 1); + + for(i=0; i<16; i++){ + remove_long(n, s, i); + } + assert(n->short_ref_count==0); + assert(n->long_ref_count==0); +} + +int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb){ + + if(s->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields + get_bits1(gb); //get_bits1(gb) -1; //broken link + if(get_bits1(gb)){ + av_log(AV_LOG_ERROR, "MMCO_LONG reference management not supported\n"); + } + }else{ + if(get_bits1(gb)){ // 
adaptive_ref_pic_marking_mode_flag + int i,j; + for(i= 0; iframe_num - get_ue_golomb(gb) - 1) & (n->max_pic_num - 1); + } + if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){ + long_arg= get_ue_golomb_31(gb); + if(long_arg >= 16){ + av_log(AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode); + return -1; + } + } + + if(opcode > (unsigned)MMCO_LONG){ + av_log(AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode); + return -1; + } + if(opcode == MMCO_END) + break; + + switch (opcode){ + case MMCO_SHORT2UNUSED: + remove_short(n, s, short_pic_num, 1); + break; + case MMCO_SHORT2LONG: + pic = find_short(n, short_pic_num); + if (n->long_ref[long_arg] != pic) + remove_long(n, s, long_arg); + remove_short(n, s, short_pic_num, 0); + n->long_ref[long_arg]= pic; + if (pic){ + pic->long_ref=1; + n->long_ref[long_arg]= pic; + n->long_ref_count++; + } + break; + case MMCO_LONG2UNUSED: + assert(n->long_ref[long_arg]); + remove_long(n, s, long_arg); + break; + case MMCO_SET_MAX_LONG: + for(j=long_arg; j<16; j++) + remove_long(n, s, j); + break; + case MMCO_RESET: + while(n->short_ref_count) + remove_short(n, s, n->short_ref[0]->frame_num, 1); + + for(j=0; j < 16; j++) + remove_long(n, s, j); + + s->current_picture_info->poc= + s->poc = + n->poc_lsb= + n->poc_msb= + n->frame_num= + s->current_picture_info->frame_num= 0; + break; + case MMCO_END: + case MMCO_LONG: + break; + } + } + }else{// sliding window ref picture marking + if(n->short_ref_count == n->sps.ref_frame_count) { + s->release_ref_cpn[s->release_cnt++] = n->short_ref[n->short_ref_count - 1]->cpn; + n->short_ref[n->short_ref_count - 1]->reference &= ~2; + n->short_ref[ n->short_ref_count - 1 ] =NULL; + n->short_ref_count--; + } + } + } + + if(n->short_ref_count) + memmove(&n->short_ref[1], &n->short_ref[0], n->short_ref_count*sizeof(PictureInfo *)); + + n->short_ref[0]= s->current_picture_info; + 
n->short_ref_count++; + + return 0; +} + +static int get_scale_factor(H264Slice *s, int poc, int poc1, int i){ + int poc0 = s->ref_list[0][i]->poc; + int td = av_clip(poc1 - poc0, -128, 127); + if(td == 0 || s->ref_list[0][i]->long_ref){ + return 256; + }else{ + int tb = av_clip(poc - poc0, -128, 127); + int tx = (16384 + (FFABS(td) >> 1)) / td; + return av_clip((tb*tx + 32) >> 6, -1024, 1023); + } +} + +void ff_h264_direct_dist_scale_factor(H264Slice *s){ + const int poc = s->current_picture_info->poc; + const int poc1 = s->ref_list[1][0]->poc; + + for(int i=0; iref_count[0]; i++){ + s->dist_scale_factor[i] = get_scale_factor(s, poc, poc1, i); + } +} + +static void fill_colmap(H264Slice *s, int map[2][16], int list){ + PictureInfo * const ref1 = s->ref_list[1][0]; + int old_ref, rfield; + + /* bogus; fills in for missing frames */ + memset(map[list], 0, sizeof(map[list])); + + for(rfield=0; rfield<2; rfield++){ + for(old_ref=0; old_ref < ref1->ref_count[list]; old_ref++){ + int poc = ref1->ref_poc[list][old_ref]; + + for(int j=0; jref_count[0]; j++){ + if(s->ref_list[0][j]->poc == poc){ + map[list][old_ref] = j; + break; + } + } + } + } +} + +void ff_h264_direct_ref_list_init(H264Slice *s){ + PictureInfo * const cur = s->current_picture_info; + int list; + + for(list=0; list<2; list++){ + cur->ref_count[list] = s->ref_count[list]; + for(int j=0; jref_count[list]; j++){ + cur->ref_poc[list][j] = s->ref_list[list][j] ? 
s->ref_list[list][j]->poc : 0; + } + } + + if(s->slice_type_nos != FF_B_TYPE || s->direct_spatial_mv_pred) + return; + + for(list=0; list<2; list++){ + fill_colmap(s, s->map_col_to_list0, list); + } +} + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_refs.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_refs.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,14 @@ +#ifndef H264_REFS_H +#define H264_REFS_H + +#include "avcodec.h" +#include "h264_types.h" + +int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s); +int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb); +void ff_h264_remove_all_refs(NalContext *n, H264Slice *s); +int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb); +void ff_h264_direct_ref_list_init(H264Slice *s); +void ff_h264_direct_dist_scale_factor(H264Slice *s); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_sei.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_sei.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,191 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... sei decoding + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * H.264 / AVC / MPEG4 part10 sei decoding. + * @author Michael Niedermayer + */ + +#include "avcodec.h" +#include "h264_types.h" +#include "golomb.h" + +//#undef NDEBUG +#include + +static const uint8_t sei_num_clock_ts_table[9]={ + 1, 1, 1, 2, 2, 3, 3, 2, 3 +}; + +void ff_h264_reset_sei(NalContext *n) { + n->sei_recovery_frame_cnt = -1; + n->sei_dpb_output_delay = 0; + n->sei_cpb_removal_delay = -1; + n->sei_buffering_period_present = 0; +} + +static int decode_picture_timing(NalContext *n, GetBitContext *gb){ + if(n->sps.nal_hrd_parameters_present_flag || n->sps.vcl_hrd_parameters_present_flag){ + n->sei_cpb_removal_delay = get_bits(gb, n->sps.cpb_removal_delay_length); + n->sei_dpb_output_delay = get_bits(gb, n->sps.dpb_output_delay_length); + } + if(n->sps.pic_struct_present_flag){ + unsigned int i, num_clock_ts; + n->sei_pic_struct = get_bits(gb, 4); + n->sei_ct_type = 0; + + if (n->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING) + return -1; + + num_clock_ts = sei_num_clock_ts_table[n->sei_pic_struct]; + + for (i = 0 ; i < num_clock_ts ; i++){ + if(get_bits(gb, 1)){ /* clock_timestamp_flag */ + unsigned int full_timestamp_flag; + n->sei_ct_type |= 1<sps.time_offset_length > 0) + skip_bits(gb, n->sps.time_offset_length); /* time_offset */ + } + } + } + return 0; +} + +static int decode_unregistered_user_data(GetBitContext *gb, int size){ + char user_data[16+256]; + int e, build, i; + + if(size<16) + return -1; + + for(i=0; i<(int) sizeof(user_data)-1 && isei_recovery_frame_cnt = get_ue_golomb(gb); + skip_bits(gb, 4); /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */ + + return 0; +} + +static int decode_buffering_period(NalContext *n, GetBitContext *gb){ + unsigned int sps_id; + int 
sched_sel_idx; + SPS *sps; + + sps_id = get_ue_golomb_31(gb); + if(sps_id > 31 || !n->sps_buffers[sps_id]) { + av_log(AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id); + return -1; + } + sps = n->sps_buffers[sps_id]; + + // NOTE: This is really so duplicated in the standard... See H.264, D.1.1 + if (sps->nal_hrd_parameters_present_flag) { + for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) { + n->initial_cpb_removal_delay[sched_sel_idx] = get_bits(gb, sps->initial_cpb_removal_delay_length); + skip_bits(gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset + } + } + if (sps->vcl_hrd_parameters_present_flag) { + for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) { + n->initial_cpb_removal_delay[sched_sel_idx] = get_bits(gb, sps->initial_cpb_removal_delay_length); + skip_bits(gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset + } + } + + n->sei_buffering_period_present = 1; + return 0; +} + +int ff_h264_decode_sei(NalContext *n, GetBitContext *gb){ + while(get_bits_count(gb) + 16 < gb->size_in_bits){ + int size, type; + + type=0; + do{ + type+= show_bits(gb, 8); + }while(get_bits(gb, 8) == 255); + + size=0; + do{ + size+= show_bits(gb, 8); + }while(get_bits(gb, 8) == 255); + + switch(type){ + case SEI_TYPE_PIC_TIMING: // Picture timing SEI + if(decode_picture_timing(n, gb) < 0) + return -1; + break; + case SEI_TYPE_USER_DATA_UNREGISTERED: + if(decode_unregistered_user_data(gb, size) < 0) + return -1; + break; + case SEI_TYPE_RECOVERY_POINT: + if(decode_recovery_point(n, gb) < 0) + return -1; + break; + case SEI_BUFFERING_PERIOD: + if(decode_buffering_period(n, gb) < 0) + return -1; + break; + default: + skip_bits(gb, 8*size); + } + + //FIXME check bits here + align_get_bits(gb); + } + + return 0; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_sei.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_sei.h Tue Sep 25 
15:55:33 2012 +0200 @@ -0,0 +1,7 @@ +#ifndef H264_SEI_H +#define H264_SEI_H + +int ff_h264_decode_sei(NalContext *n, GetBitContext *gb); +void ff_h264_reset_sei(NalContext *n); + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_seq.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_seq.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,220 @@ +/* +* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder +* Copyright (c) 2003 Michael Niedermayer +* +* This file is part of FFmpeg. +* +* FFmpeg is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License as published by the Free Software Foundation; either +* version 2.1 of the License, or (at your option) any later version. +* +* FFmpeg is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* Lesser General Public License for more details. 
+* +* You should have received a copy of the GNU Lesser General Public +* License along with FFmpeg; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include "h264_types.h" +#include "h264_parser.h" +#include "h264_nal.h" +#include "h264_entropy.h" +#include "h264_rec.h" +#include "h264_pred_mode.h" +#include "h264_misc.h" +// #undef NDEBUG +#include + +static int decode_slice_entropy_seq(H264Context *h, EntropyContext *ec, H264Slice *s, GetBitContext *gb, H264Mb *mbs){ + int i,j; +// GetBitContext *gb = s->gb; + CABACContext *c = &ec->c; + + if( !s->pps.cabac ){ + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); + return -1; + } + + init_dequant_tables(s, ec); + ec->curr_qscale = s->qscale; + ec->last_qscale_diff = 0; + ec->chroma_qp[0] = get_chroma_qp((H264Slice *) s, 0, s->qscale); + ec->chroma_qp[1] = get_chroma_qp((H264Slice *) s, 1, s->qscale); + + /* realign */ + align_get_bits( gb ); + /* init cabac */ + ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8); + + ff_h264_init_cabac_states(ec, s, c); + + for(j=0; jmb_height; j++){ + init_entropy_buf(ec, s, j); + for(i=0; imb_width; i++){ + int eos,ret; + H264Mb *m = &mbs[i + j*ec->mb_width]; + //memset(m, 0, sizeof(H264Mb)); + m->mb_x=i; + m->mb_y=j; + ec->m = m; + + ret = ff_h264_decode_mb_cabac(ec, s, c); + eos = get_cabac_terminate( c); + (void) eos; + if( ret < 0 || c->bytestream > c->bytestream_end + 2) { + av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream); + return -1; + } + } + } + +// av_freep(&s->gb.raw); +// if (s->gb.rbsp) +// av_freep(&s->gb.rbsp); + + return 0; +} + + + +/** +* Sequential version +*/ +static void decode_slice_mb_seq(H264Context *h, MBRecContext *d, H264Slice *s2, H264Mb *mbs){ + + for (int i=0; i<2; i++){ + for(int j=0; j< s2->ref_count[i]; j++){ + if (s2->ref_list_cpn[i][j] ==-1) 
+ continue; + int k; + for (k=0; kmax_dpb_cnt; k++){ + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s2->ref_list_cpn[i][j]){ + s2->dp_ref_list[i][j] = &h->dpb[k]; + break; + } + } + } + } + + get_dpb_entry(h, s2); + + if (!h->no_mbd){ + for(int j=0; jmb_height; j++){ + init_mbrec_context(d, d->mrs, s2, j); + if (h->profile) printf("\n[MBREC LINE %d ", j); + for(int i=0; imb_width; i++){ + + if ((i & 0x7) == 0) start_timer(h, REC); + H264Mb *m = &mbs[i + j*d->mb_width]; + if (h->profile==2) + pred_motion_mb_rec (d, d->mrs, s2, m); + else{ + h264_decode_mb_internal(d, d->mrs, s2, m); + } + stop_timer(h, REC); + } + draw_edges(d, s2, j); + + } + } + + for (int i=0; irelease_cnt; i++){ + for(int j=0; jmax_dpb_cnt; j++){ + if(h->dpb[j].cpn== s2->release_ref_cpn[i]){ + release_dpb_entry(h, &h->dpb[j], 2); + break; + } + } + } + s2->release_cnt=0; +} + +/* +* The following code is the main loop of the file converter +*/ +int h264_decode_seq( H264Context *h) { + ParserContext *pc; + NalContext *nc; + EntropyContext *ec; + MBRecContext *rc; + OutputContext *oc; + + H264Slice slice, *s=&slice; + H264Mb *mbs; + DecodedPicture *out; + int frames=0; + +#if HAVE_LIBSDL2 + pthread_t sdl_thr; + if (h->display){ + pthread_create(&sdl_thr, NULL, sdl_thread, h); + } +#endif + + pc = get_parse_context(h->ifile); + nc = get_nal_context(h->width, h->height); + + memset(s, 0, sizeof(H264Slice)); + mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb)); + + ec = get_entropy_context( h ); + rc = get_mbrec_context(h); + rc->top_next = rc->top = av_malloc( h->mb_width * sizeof(TopBorder)); + + oc = get_output_context( h ); + + av_start_timer(); + GetBitContext gb = {0,}; + while(!pc->final_frame && frames++ < h->num_frames && !h->quit){ + if (h->profile) start_timer(h, FRONT); + av_read_frame_internal(pc, &gb); + decode_nal_units(nc, s, &gb); + if (h->profile) stop_timer(h, FRONT); +// memset(s->mbs, 0, sizeof(H264Mb)*ec->mb_width*ec->mb_height); + if (h->profile) start_timer(h, 
ED); + decode_slice_entropy_seq(h, ec, s, &gb, mbs); + if (h->profile) stop_timer(h, ED); + + if (h->profile) start_timer(h, REC); + decode_slice_mb_seq(h, rc, s, mbs); + if (h->profile) stop_timer(h, REC); + + out =output_frame(h, oc, s->curr_pic, h->ofile, h->frame_width, h->frame_height); + if (out){ + release_dpb_entry(h, out, 1); + } + + print_report(oc->frame_number, oc->video_size, 0, h->verbose); + if (h->profile == 3){ + printf("[ENTROPY %.3fms] [MBREC %.3fms]\n", h->last_time[ED] , h->last_time[REC]); + } + } + while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; + + print_report(oc->frame_number, oc->video_size, 1, h->verbose); + h->num_frames = oc->frame_number; + /* finished ! */ + av_freep(&mbs); + av_freep(&gb.raw); + if (gb.rbsp) + av_freep(&gb.rbsp); + av_freep(&rc->top); + + free_parse_context(pc); + free_nal_context (nc); + free_entropy_context(ec); + free_mbrec_context(rc); + free_output_context(oc); + +#if HAVE_LIBSDL2 + if (h->display){ + signal_sdl_exit(h); + pthread_join(sdl_thr, NULL); + } +#endif + + return 0; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/h264_types.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/h264_types.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,658 @@ +#ifndef H264_TYPES_H +#define H264_TYPES_H + +#include "config.h" +#ifdef HAVE_LIBSDL2 +#include +#endif + +#include +#include "avcodec.h" +#include "cabac.h" +#include "h264_dsp.h" +#include "h264_pred.h" +#include "get_bits.h" + + +#define MAX_REF_PIC_COUNT 16 +#define MAX_DELAYED_PIC_COUNT 16 + +#define MAX_THREADS 80 + +//#define MAX_PIC_COUNT (4*(MAX_REF_PIC_COUNT+MAX_DELAYED_PIC_COUNT)) + +#define DPB_SIZE 33 + + +//potsdam machine 8xX7560 without HT +// static int edb_affinity [16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +// static int edip_affinity[8] = {16, 17, 18, 19, 20, 21, 22, 23}; +// +// static int mbd_affinity[8][5] = { {24, 32, 40, 48, 56}, +// {25, 33, 41, 49, 57}, +// {26, 34, 42, 
50, 58}, +// {27, 35, 43, 51, 59}, +// {28, 36, 44, 52, 60}, +// {29, 37, 45, 53, 61}, +// {30, 38, 46, 54, 62}, +// {31, 39, 47, 55, 63}, }; + +// static int edb_affinity [22] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 58, 59, 60, 61 ,62, 63}; +// static int edip_affinity[10] = {16, 17, 18, 19, 20, 21, 22, 23, 56, 57 }; +// +// static int mbd_affinity[8][5] = { {24, 32, 40, 48, 56}, +// {25, 33, 41, 49, 57}, +// {26, 34, 42, 50, 58}, +// {27, 35, 43, 51, 59}, +// {28, 36, 44, 52, 60}, +// {29, 37, 45, 53, 61}, +// {30, 38, 46, 54, 62}, +// {31, 39, 47, 55, 63}, }; +// //4 socket +// static int edip_affinity[5] = {0, 1, 2, 3, 56}; +// static int edb_affinity [12] = {8, 9, 10, 11, 16, 17, 18, 19, 59, 58, 57, 51}; +// +// static int mbd_affinity[4][5] = { {24, 32, 40, 48, 56}, +// {25, 33, 41, 49, 57}, +// {26, 34, 42, 50, 58}, +// {27, 35, 43, 51, 59}, }; + +// static int edip_affinity[3] = {0, 1, 49}; +// static int edb_affinity [6] = {8, 9, 16, 17, 56, 57}; +// +// static int mbd_affinity[2][5] = { {24, 32, 40, 48, 56}, +// {25, 33, 41, 49, 57}}; + +// static int edip_affinity[2] = {0, 8}; +// static int edb_affinity [3] = {16, 24, 56}; +// +// static int mbd_affinity[1][4] = { {32, 40, 48, 56}, +// }; + +/// for ducks_take_off_2160p +// static int edip_affinity[2] = {0, 8}; +// static int edb_affinity [3] = {16, 24, 32}; +// +// static int mbd_affinity[1][4] = {{ 40, 48, 56, 32}}; + +// static int edip_affinity[3] = {0, 1, 57}; +// static int edb_affinity [7] = {8, 9, 16, 17, 24, 25, 56}; +// +// static int mbd_affinity[2][4] = { {32, 40, 48, 56}, +// {33, 41, 49, 57}}; + +//4 socket +// static int edip_affinity[6] = {0, 1, 2, 3, 59}; +// static int edb_affinity [14] = {8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 58, 57}; +// +// static int mbd_affinity[4][4] = { {32, 40, 48, 56}, +// {33, 41, 49, 57}, +// {34, 42, 50, 58}, +// {35, 43, 51, 59}, }; + + +// static int edb_affinity [29] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 
17, 18, 19, 20, 21, 22, 23, 59, 60, 61, 62, 63}; +// static int edip_affinity[11] = {24, 25, 26, 27, 28, 29, 30, 31, 63, 62, 61}; +// +// static int mbd_affinity[8][4] = {{32, 40, 48, 56}, +// {33, 41, 49, 57}, +// {34, 42, 50, 58}, +// {35, 43, 51, 59}, +// {36, 44, 52, 60}, +// {37, 45, 53, 61}, +// {38, 46, 54, 62}, +// {39, 47, 55, 63}, }; + +//potsdam machine 4xX7550 with HT +// int edip_affinity[16] = {0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; +// int edb_affinity [16] = {1, 9, 17, 25, 2, 10, 18, 26, 6, 14, 22, 30, 7, 15, 23, 31 }; +// int edip_affinity[16] = {58, 50, 42, 34, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; +// int edb_affinity [16] = {57, 49, 41, 33, 56, 48, 40, 32, 6, 14, 22, 30, 7, 15, 23, 31 }; +// //int edb_affinity [16] = {4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 }; +// //mb threads affinity on logical cores moving back to keep inteference with ed threads low +// int mbd_affinity[4][8] = { {63, 62, 61, 60, 59, 58, 57, 56}, +// {55, 54, 53, 52, 51, 50, 49, 48}, +// {47, 46, 45, 44, 43, 42, 41, 40}, +// {39, 38, 37, 36, 35, 34, 33, 32}, +// }; + + +// static int edip_affinity[2] = {0, 2}; +// static int edb_affinity [4] = {1, 3, 2, 5}; +// +// static int mbd_affinity[1][4] = {{ 4, 6, 7, 5}}; + +enum{ + PARSE=0, + ENTROPY, + REORDER, + REORDER2, //second mutex-cond pair used in reorder_thread + MBDEC, + OUTPUT, + STAGES +}; + +//adhoc for profiling +enum{ + TOTAL=0, + FRONT, + ED, + REC, + PROFILE_STAGES +}; + +/* bit input */ +/* buffer, buffer_end and size_in_bits must be present and used by every reader */ + +/* frame parsing */ +typedef struct ParserContext { + //int64_t offset; ///< byte offset from starting packet start + int ifile; + int ofile; + int buffer_size; + int eof_reached; + + uint8_t *data; + int size; + uint8_t *cur_ptr; + int cur_len; + + int64_t frame_offset; /* offset of the current frame */ + int64_t cur_offset; /* current offset (incremented by each av_parser_parse()) */ + int64_t 
next_frame_offset; /* offset of the next frame */ + int pict_type; + int repeat_pict; //frame_duration = (1 + repeat_pict) * time_base. It is used by codecs like H.264 to display telecined material. + int key_frame; //Set by parser to 1 for key frames and 0 for non-key frames. + int64_t pos; // Byte position of currently parsed frame in stream. + int64_t last_pos; //Previous frame byte position. + int final_frame; + + uint8_t overread[5]; + int overread_cnt; ///< the number of bytes which where irreversibly read from the next frame + int index; + int last_index; + int frame_start_found; + uint32_t state; ///< contains the last few bytes in MSB order +} ParserContext; + +typedef struct NalContext { + + SPS *sps_buffers[MAX_SPS_COUNT]; + PPS *pps_buffers[MAX_PPS_COUNT]; + SPS sps; ///< current sps + + PictureInfo picture[16 + 1]; ///< Ref pic buffer used for deriving lists. Later linked with pic in dpb. + PictureInfo *release_ref[MAX_MMCO_COUNT]; + PictureInfo *short_ref[32]; + PictureInfo *long_ref[32]; + int long_ref_count; ///< number of actual long term references + int short_ref_count; ///< number of actual short term references + + //POC stuff + uint32_t coded_pic_num; + int poc_lsb; + int poc_msb; + uint32_t poc_offset; + int delta_poc; + int frame_num; + int prev_poc_msb; ///< poc_msb of the last reference pic for POC type 0 + int prev_poc_lsb; ///< poc_lsb of the last reference pic for POC type 0 + int frame_num_offset; ///< for POC type 2 + int prev_frame_num_offset; ///< for POC type 2 + int prev_frame_num; ///< frame_num of the last pic for POC type 1/2 + + int max_pic_num; + int redundant_pic_count; + int outputed_poc; + int ip_id; +// int b8_stride; ///< 2*mb_width+1 used for some 8x8 block arrays to allow simple addressing + int b4_stride; ///< 4*mb_width+1 used for some 4x4 block arrays to allow simple addressing + int mb_stride; ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11 + int mb_width; + int 
mb_height; + int width; + int height; + + int has_b_frames; + //pic_struct in picture timing SEI message + SEI_PicStructType sei_pic_struct; + // Bit set of clock types for fields/frames in picture timing SEI message. For each found ct_type, appropriate bit is set (e.g., bit 1 for interlaced). + int sei_ct_type; + // dpb_output_delay in picture timing SEI message, see H.264 C.2.2 + int sei_dpb_output_delay; + //cpb_removal_delay in picture timing SEI message, see H.264 C.1.2 + int sei_cpb_removal_delay; + //recovery_frame_cnt from SEI message + int sei_recovery_frame_cnt; + // Timestamp stuff + int sei_buffering_period_present; ///< Buffering period SEI flag + int initial_cpb_removal_delay[32]; ///< Initial timestamps for CPBs + +} NalContext; + +typedef struct EntropyContext{ + CABACContext c; + + H264Mb *m; + int top_cbp; + int left_cbp; + int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct + + uint32_t top_type; + uint32_t left_type; + uint32_t topright_type; + uint32_t topleft_type; + + int curr_qscale; + int chroma_qp[2]; //QPc + int last_qscale_diff; + + uint32_t dequant4_buffer[6][52][16]; + uint32_t dequant8_buffer[2][52][64]; + uint32_t (*dequant4_coeff[6])[16]; + uint32_t (*dequant8_coeff[2])[64]; + +// uint8_t (*non_zero_count_top)[32]; +// uint8_t (*non_zero_count)[32]; +// uint8_t (*non_zero_count_row[2])[32]; + + uint8_t (*non_zero_count_top)[8]; + uint8_t (*non_zero_count)[8]; + uint8_t (*non_zero_count_row[2])[8]; + DECLARE_ALIGNED(8, uint8_t, non_zero_count_left[8]); + + uint8_t (*mvd_top[2])[2]; + uint8_t (*mvd[2])[2]; + uint8_t (*mvd_table[2][2])[2]; + + uint8_t *direct_top; + uint8_t *direct; + uint8_t *direct_table[2]; + + uint8_t *chroma_pred_mode_top; + uint8_t *chroma_pred_mode; + uint8_t *chroma_pred_mode_table[2]; + + uint16_t *cbp_top; + uint16_t *cbp; + uint16_t *cbp_table[2]; + + int8_t *qscale_top; + int8_t *qscale; + int8_t *qscale_table[2]; + + int8_t *ref_index_top[2]; + int8_t *ref_index[2]; + 
int8_t *ref_index_table[2][2]; + + uint32_t *mb_type_top; + uint32_t *mb_type; + uint32_t *mb_type_table[2]; + + int b_stride; + int mb_stride; + int mb_width; + int mb_height; + + uint8_t *zigzag_scan; + uint8_t *zigzag_scan8x8; + uint8_t direct_cache[5*8]; + + DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]); + DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2]; + DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8]; + DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8]; + DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2]; + +} EntropyContext; + +typedef struct H264Slice { + PPS pps; ///< current pps + PictureInfo* current_picture_info; + DecodedPicture* curr_pic; + int slice_num; + + int release_ref_cpn[MAX_MMCO_COUNT]; + int release_cnt; + + int qp_thresh; ///< QP threshold to skip loopfilter + int use_weight; + int use_weight_chroma; + int luma_log2_weight_denom; + int chroma_log2_weight_denom; + + int16_t luma_weight[16][2][2]; + int16_t chroma_weight[16][2][2][2]; + int16_t implicit_weight[16][16][2]; + + //poc number of ref_list int ref_poc[2][16] + //In edslice this must becom Picture Info + int ref_list_cpn[2][16]; + PictureInfo *ref_list[2][16]; ///Reordered version of default_ref_list according to picture reordering in slice header + DecodedPicture *dp_ref_list[2][16]; + int ref_count[2]; ///< counts frames or fields, depending on current mb mode + + int slice_type; + int slice_type_nos; + int slice_alpha_c0_offset; + int slice_beta_offset; + int direct_8x8_inference_flag; + + uint8_t list_count; + uint32_t coded_pic_num; + + int poc; + int key_frame; + int mmco_reset; //FIXME not used? 
+ + ///stuff only needed for nal/entropy decoding +// H264Mb *m; +// GetBitContext *gb; + int ip_id; + int transform_bypass; + int direct_spatial_mv_pred; + int map_col_to_list0[2][16]; + int dist_scale_factor[16]; + + int cabac_init_idc; + int nal_ref_idc; + int nal_unit_type; + + int ref2frm[2][64]; ///< reference to frame number lists, the first 2 are for -2,-1 + + int qscale; + +} H264Slice; + +typedef struct { + H264Slice slice; + H264Mb *mbs; + DecodedPicture *dp; + GetBitContext gb; + + int lines_taken; + int lines_total; + int state; // 0 free, 1 in use //1 wait for entropy, 2 wait for reconstruct. + int initialized; +} SliceBufferEntry; + +typedef struct RingLineEntry{ + union{ + DECLARE_ALIGNED(64, volatile int32_t, mb_cnt); + DECLARE_ALIGNED(64, int32_t, pad[16]); + }; + SliceBufferEntry *sbe; + int id; + int line; + TopBorder *top; + struct RingLineEntry *prev_line; + +} RingLineEntry; + +// #if OMPSS +typedef struct SuperMBTask{ + int smb_x; + int smb_y; +} SuperMBTask; + +typedef struct SuperMBContext{ + int nsmb_width; //number of super macroblocks in picture width + int nsmb_height; //number of super macroblocks in picture height + int nsmb_3dheight; //number of super macroblocks in picture height - max motion vertical vector + int smb_width; //width of a super macroblock + int smb_height; //height of a super macroblock + int refcount; + int index; + SuperMBTask *smbs[2]; +} SuperMBContext; +// #endif + +//scratchpad for decoding a macroblock +typedef struct MBRecState{ + int8_t *ref_index_top[2]; + int8_t *ref_index[2]; + int16_t (*motion_val_top[2])[2]; + int16_t (*motion_val[2])[2]; + uint32_t *mb_type_top; + uint32_t *mb_type; + + int8_t *list1_ref_index[2]; + int16_t (*list1_motion_val[2])[2]; + uint32_t *list1_mb_type; + + int8_t *intra4x4_pred_mode_top; + int8_t *intra4x4_pred_mode; +#if !OMPSS + int8_t intra4x4_pred_mode_left[4]; +#endif + int8_t *non_zero_count_top; + int8_t *non_zero_count; +// int8_t non_zero_count_left[8]; + + + unsigned 
int topleft_samples_available; + unsigned int topright_samples_available; + unsigned int top_samples_available; + unsigned int left_samples_available; + + int top_type; + int left_type; + + DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]); + DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2]; + DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8]; + DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8]; + DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2]; + + DECLARE_ALIGNED(8, int16_t, bS)[2][4][4]; + uint8_t edges[2]; + +}MBRecState ; + +typedef struct MBRecContext{ + DSPContext dsp; ///< pointers for accelerated dsp functions + H264DSPContext hdsp; + H264PredContext hpc; + + MBRecState *mrs; + RingLineEntry *rle; //debug + + uint8_t *scratchpad_y; ///implemented different on Cell + uint8_t *scratchpad_cb; ///implemented different on Cell + uint8_t *scratchpad_cr; ///implemented different on Cell + + int linesize; + int uvlinesize; + int mb_width; + int mb_height; + int mb_stride; + int b_stride; + int width; + int height; + +#if !OMPSS // not used in OMPSS + LeftBorder left; + TopBorder *top; + TopBorder *top_next; // next line top border +#endif + /* + .UU.YYYY + .UU.YYYY + .vv.YYYY + .VV.YYYY + */ + + // block_offset[ 0..23] for frame macroblocks + int block_offset[16+8]; + +} MBRecContext; + +#ifdef HAVE_LIBSDL2 +typedef struct SDLContext{ + int display; + int fullscreen; + pthread_t listen_thread; + + SDL_DisplayMode full; + SDL_DisplayMode wind; + + + SDL_Renderer *renderer; + SDL_Rect rect; + SDL_Rect win_rect; + SDL_Window *window; + double aspect; + int win_w; + int win_h; + int resized; + + SDL_Texture *sbmap_texture; + int showmap; + int updatemap; + int pause; + +} SDLContext; +#endif + +typedef struct OutputContext { + int bit_buffer_size; + uint8_t *bit_buffer; + uint64_t video_size; + int frame_number; + DecodedPicture *delayed_pic[DPB_SIZE]; + int dp_cnt; + +} OutputContext; + +typedef struct { + pthread_mutex_t lock; + pthread_cond_t cond; + 
SliceBufferEntry **queue; + int size; + int cnt; + int fi; + int fo; +} SliceBufferQueue; + +typedef struct { + pthread_mutex_t wslock; + pthread_cond_t wscond; + pthread_mutex_t swlock; + pthread_cond_t swcond; + RingLineEntry **queue; + int size; + int ready; + int free; + int fi; + int fo; +} RingLineQueue; + +#if HAVE_LIBSDL2 +typedef struct { + pthread_mutex_t sdl_lock; + pthread_cond_t sdl_cond; + SDL_Texture **queue; + int size; + int ready; + int fi; + int fo; + int exit; +} SDLTextureQueue; +#endif +/** +* H264Context +*/ +typedef struct H264Context{ + SliceBufferQueue sb_q[STAGES]; + RingLineQueue rl_q; + + pthread_mutex_t lock[STAGES]; + pthread_cond_t cond[STAGES]; + + pthread_mutex_t task_lock; + pthread_cond_t task_cond; + + pthread_attr_t ed_rec_attr[MAX_THREADS]; + pthread_t ed_rec_thr[MAX_THREADS]; + + int init_threads; + pthread_mutex_t ilock; + pthread_cond_t icond; + + const char *file_name; + int profile; + int start; + int touch_start; + int setaff; + int touch_done; + int rl_side_touch; + int statmbd; + pthread_mutex_t slock; + pthread_cond_t scond; + pthread_mutex_t tlock; + pthread_cond_t tcond; + pthread_mutex_t tdlock; + pthread_cond_t tdcond; + + int ed_ppe_threads; + int threads; + int smt; + + int acdpb_cnt; //debug + int reldpb_cnt; + + int sb_size; + SliceBufferEntry *sb; ///< Slice Syntax Buffer + int free_sb_cnt; + int slice_bufs; + + int max_dpb_cnt; + DecodedPicture *dpb; ///< Decoded Picture Buffer + int free_dpb_cnt; + + int ifile; + int ofile; + int frame_width; + int frame_height; + int num_frames; + int width; + int height; + int mb_width; + int mb_height; + int mb_stride; ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11 + int b4_stride; + int b_stride; + + int smb_height; + int smb_width; + pthread_mutex_t smb_lock; + pthread_cond_t sdl_cond; + pthread_mutex_t sdl_lock; + SuperMBContext *smbc; + + int wave_order; + int static_3d; + int pipe_bufs; + + //shared tables used in 
entropy decoding + uint8_t zigzag_scan[16]; + uint8_t zigzag_scan8x8[64]; + + int verbose; + int no_mbd; + int display; + int fullscreen; + int quit; +#ifdef HAVE_LIBSDL2 + SDLTextureQueue sdlq; + SDLContext *sdlc; +#endif + + struct timespec start_time[PROFILE_STAGES]; + struct timespec end_time[PROFILE_STAGES]; + double last_time[PROFILE_STAGES]; + double total_time[PROFILE_STAGES]; + +}H264Context; + +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/mathops.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/mathops.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,145 @@ +/* + * simple math operations + * Copyright (c) 2001, 2002 Fabrice Bellard + * Copyright (c) 2006 Michael Niedermayer et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef AVCODEC_MATHOPS_H +#define AVCODEC_MATHOPS_H + +#include "libavutil/common.h" +#include "libavutil/internal.h" + +#if ARCH_ARM +# include "arm/mathops.h" +#elif ARCH_PPC +# include "ppc/mathops.h" +#elif ARCH_X86 +# include "x86/mathops.h" +#endif + +/* generic implementation */ + +#ifndef MULL +# define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s)) +#endif + +#ifndef MULH +//gcc 3.4 creates an incredibly bloated mess out of this +//# define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32) + +static av_always_inline int MULH(int a, int b){ + return ((int64_t)(a) * (int64_t)(b))>>32; +} +#endif + +#ifndef UMULH +static av_always_inline unsigned UMULH(unsigned a, unsigned b){ + return ((uint64_t)(a) * (uint64_t)(b))>>32; +} +#endif + +#ifndef MUL64 +# define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) +#endif + +#ifndef MAC64 +# define MAC64(d, a, b) ((d) += MUL64(a, b)) +#endif + +#ifndef MLS64 +# define MLS64(d, a, b) ((d) -= MUL64(a, b)) +#endif + +/* signed 16x16 -> 32 multiply add accumulate */ +#ifndef MAC16 +# define MAC16(rt, ra, rb) rt += (ra) * (rb) +#endif + +/* signed 16x16 -> 32 multiply */ +#ifndef MUL16 +# define MUL16(ra, rb) ((ra) * (rb)) +#endif + +#ifndef MLS16 +# define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb)) +#endif + +/* median of 3 */ +#ifndef mid_pred +#define mid_pred mid_pred +static inline av_const int mid_pred(int a, int b, int c) +{ +#if 0 + int t= (a-b)&((a-b)>>31); + a-=t; + b+=t; + b-= (b-c)&((b-c)>>31); + b+= (a-b)&((a-b)>>31); + + return b; +#else + if(a>b){ + if(c>b){ + if(c>a) b=a; + else b=c; + } + }else{ + if(b>c){ + if(c>a) b=c; + else b=a; + } + } + return b; +#endif +} +#endif + +#ifndef sign_extend +static inline av_const int sign_extend(int val, unsigned bits) +{ + return (val << (INT_BIT - 
bits)) >> (INT_BIT - bits); +} +#endif + +#ifndef zero_extend +static inline av_const unsigned zero_extend(unsigned val, unsigned bits) +{ + return (val << (INT_BIT - bits)) >> (INT_BIT - bits); +} +#endif + +#ifndef COPY3_IF_LT +#define COPY3_IF_LT(x, y, a, b, c, d)\ +if ((y) < (x)) {\ + (x) = (y);\ + (a) = (b);\ + (c) = (d);\ +} +#endif + +#ifndef NEG_SSR32 +# define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s))) +#endif + +#ifndef NEG_USR32 +# define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s))) +#endif + +#endif /* AVCODEC_MATHOPS_H */ + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/dsputil_altivec.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/ppc/dsputil_altivec.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,619 @@ +/* + * Copyright (c) 2002 Brian Foley + * Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003-2004 Romain Dolbeau + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#if HAVE_ALTIVEC_H +#include +#endif +#include "libavcodec/dsputil.h" +#include "dsputil_ppc.h" +#include "util_altivec.h" +#include "types_altivec.h" +#include "dsputil_altivec.h" + + +static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) +{ + int i; + vector unsigned char perm, bytes, *pixv; + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + vector signed short shorts; + + for (i = 0; i < 8; i++) { + // Read potentially unaligned pixels. + // We're reading 16 pixels, and actually only want 8, + // but we simply ignore the extras. + perm = vec_lvsl(0, pixels); + pixv = (vector unsigned char *) pixels; + bytes = vec_perm(pixv[0], pixv[1], perm); + + // convert the bytes into shorts + shorts = (vector signed short)vec_mergeh(zero, bytes); + + // save the data to the block, we assume the block is 16-byte aligned + vec_st(shorts, i*16, (vector signed short*)block); + + pixels += line_size; + } +} + +static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, + const uint8_t *s2, int stride) +{ + int i; + vector unsigned char perm, bytes, *pixv; + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + vector signed short shorts1, shorts2; + + for (i = 0; i < 4; i++) { + // Read potentially unaligned pixels + // We're reading 16 pixels, and actually only want 8, + // but we simply ignore the extras. 
+ perm = vec_lvsl(0, s1); + pixv = (vector unsigned char *) s1; + bytes = vec_perm(pixv[0], pixv[1], perm); + + // convert the bytes into shorts + shorts1 = (vector signed short)vec_mergeh(zero, bytes); + + // Do the same for the second block of pixels + perm = vec_lvsl(0, s2); + pixv = (vector unsigned char *) s2; + bytes = vec_perm(pixv[0], pixv[1], perm); + + // convert the bytes into shorts + shorts2 = (vector signed short)vec_mergeh(zero, bytes); + + // Do the subtraction + shorts1 = vec_sub(shorts1, shorts2); + + // save the data to the block, we assume the block is 16-byte aligned + vec_st(shorts1, 0, (vector signed short*)block); + + s1 += stride; + s2 += stride; + block += 8; + + + // The code below is a copy of the code above... This is a manual + // unroll. + + // Read potentially unaligned pixels + // We're reading 16 pixels, and actually only want 8, + // but we simply ignore the extras. + perm = vec_lvsl(0, s1); + pixv = (vector unsigned char *) s1; + bytes = vec_perm(pixv[0], pixv[1], perm); + + // convert the bytes into shorts + shorts1 = (vector signed short)vec_mergeh(zero, bytes); + + // Do the same for the second block of pixels + perm = vec_lvsl(0, s2); + pixv = (vector unsigned char *) s2; + bytes = vec_perm(pixv[0], pixv[1], perm); + + // convert the bytes into shorts + shorts2 = (vector signed short)vec_mergeh(zero, bytes); + + // Do the subtraction + shorts1 = vec_sub(shorts1, shorts2); + + // save the data to the block, we assume the block is 16-byte aligned + vec_st(shorts1, 0, (vector signed short*)block); + + s1 += stride; + s2 += stride; + block += 8; + } +} + + +static void clear_block_altivec(DCTELEM *block) { + LOAD_ZERO; + vec_st(zero_s16v, 0, block); + vec_st(zero_s16v, 16, block); + vec_st(zero_s16v, 32, block); + vec_st(zero_s16v, 48, block); + vec_st(zero_s16v, 64, block); + vec_st(zero_s16v, 80, block); + vec_st(zero_s16v, 96, block); + vec_st(zero_s16v, 112, block); +} + + + +/* next one assumes that ((line_size % 16) == 0) 
Copies h rows of 16 bytes from pixels (any alignment) to block; the live loop handles four rows per pass. */
+void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
+    register vector unsigned char pixelsv1, pixelsv2;
+    register vector unsigned char pixelsv1B, pixelsv2B;
+    register vector unsigned char pixelsv1C, pixelsv2C;
+    register vector unsigned char pixelsv1D, pixelsv2D;
+
+    register vector unsigned char perm = vec_lvsl(0, pixels);
+    int i;
+    register int line_size_2 = line_size << 1;
+    register int line_size_3 = line_size + line_size_2;
+    register int line_size_4 = line_size << 2;
+
+POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
+// hand-unrolling the loop by 4 gains about 15%
+// minimum execution time goes from 74 to 60 cycles
+// it's faster than -funroll-loops, but using
+// -funroll-loops w/ this is bad - 74 cycles again.
+// all this is on a 7450, tuning for the 7450
+#if 0
+    for (i = 0; i < h; i++) {
+        pixelsv1 = vec_ld(0, pixels);
+        pixelsv2 = vec_ld(16, pixels);
+        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
+               0, block);
+        pixels+=line_size;
+        block +=line_size;
+    }
+#else
+    for (i = 0; i < h; i += 4) {
+        pixelsv1  = vec_ld( 0, pixels);
+        pixelsv2  = vec_ld(15, pixels);
+        pixelsv1B = vec_ld(line_size, pixels);
+        pixelsv2B = vec_ld(15 + line_size, pixels);
+        pixelsv1C = vec_ld(line_size_2, pixels);
+        pixelsv2C = vec_ld(15 + line_size_2, pixels);
+        pixelsv1D = vec_ld(line_size_3, pixels);
+        pixelsv2D = vec_ld(15 + line_size_3, pixels);
+        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
+               0, (unsigned char*)block);
+        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
+               line_size, (unsigned char*)block);
+        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
+               line_size_2, (unsigned char*)block);
+        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
+               line_size_3, (unsigned char*)block);
+        pixels+=line_size_4;
+        block +=line_size_4;
+    }
+#endif
+POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
+}
+
+/* next one assumes that ((line_size % 16) == 0); it replaces each 16-byte row of block by vec_avg(block, pixels) */
+#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
+void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
+    register vector unsigned char perm = vec_lvsl(0, pixels);
+    int i;
+
+POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
+
+    for (i = 0; i < h; i++) {
+        pixelsv1 = vec_ld( 0, pixels);
+        pixelsv2 = vec_ld(16,pixels);
+        blockv = vec_ld(0, block);
+        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
+        blockv = vec_avg(blockv,pixelsv);
+        vec_st(blockv, 0, (unsigned char*)block);
+        pixels+=line_size;
+        block +=line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
+}
+
+/* next one assumes that ((line_size % 8) == 0); 8-wide variant: only the half of the 16-byte vector that block addresses is averaged with pixels */
+static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
+    int i;
+
+POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
+
+    for (i = 0; i < h; i++) {
+        /* block is 8 bytes-aligned, so we're either in the
+           left block (16 bytes-aligned) or in the right block (not) */
+        int rightside = ((unsigned long)block & 0x0000000F);
+
+        blockv = vec_ld(0, block);
+        pixelsv1 = vec_ld( 0, pixels);
+        pixelsv2 = vec_ld(16, pixels);
+        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
+
+        if (rightside) {
+            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
+        } else {
+            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
+        }
+
+        blockv = vec_avg(blockv, pixelsv);
+
+        vec_st(blockv, 0, block);
+
+        pixels += line_size;
+        block += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
+}
+
+/* next one assumes that ((line_size % 8) == 0); 8-wide output = (p[x] + p[x+1] + q[x] + q[x+1] + 2) >> 2 with q the row below p */
+static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
+    register int i;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
+    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+    temp1 = vec_ld(0, pixels);
+    temp2 = vec_ld(16, pixels);
+    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
+    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
+        pixelsv2 = temp2;
+    } else {
+        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
+    }
+    pixelsv1 = vec_mergeh(vczero, pixelsv1);
+    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                         (vector unsigned short)pixelsv2);
+    pixelssum1 = vec_add(pixelssum1, vctwo);
+
+POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
+    for (i = 0; i < h ; i++) {
+        int rightside = ((unsigned long)block & 0x0000000F);
+        blockv = vec_ld(0, block);
+
+        temp1 = vec_ld(line_size, pixels);
+        temp2 = vec_ld(line_size + 16, pixels);
+        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
+            pixelsv2 = temp2;
+        } else {
+            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+        }
+
+        pixelsv1 = vec_mergeh(vczero, pixelsv1);
+        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                             (vector unsigned short)pixelsv2);
+        temp3 = vec_add(pixelssum1, pixelssum2);
+        temp3 = vec_sra(temp3, vctwo);
+        pixelssum1 = vec_add(pixelssum2, vctwo);
+        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
+
+        if (rightside) {
+            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
+        } else {
+            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
+        }
+
+        vec_st(blockv, 0, block);
+
+        block += line_size;
+        pixels += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
+}
+
+/* next one assumes that ((line_size % 8) == 0); same as put_pixels8_xy2_altivec but the added bias is 1 (vcone) instead of 2 */
+static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
+    register int i;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
+    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
+    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+    temp1 = vec_ld(0, pixels);
+    temp2 = vec_ld(16, pixels);
+    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
+    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
+        pixelsv2 = temp2;
+    } else {
+        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
+    }
+    pixelsv1 = vec_mergeh(vczero, pixelsv1);
+    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                         (vector unsigned short)pixelsv2);
+    pixelssum1 = vec_add(pixelssum1, vcone);
+
+POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
+    for (i = 0; i < h ; i++) {
+        int rightside = ((unsigned long)block & 0x0000000F);
+        blockv = vec_ld(0, block);
+
+        temp1 = vec_ld(line_size, pixels);
+        temp2 = vec_ld(line_size + 16, pixels);
+        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
+            pixelsv2 = temp2;
+        } else {
+            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+        }
+
+        pixelsv1 = vec_mergeh(vczero, pixelsv1);
+        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                             (vector unsigned short)pixelsv2);
+        temp3 = vec_add(pixelssum1, pixelssum2);
+        temp3 = vec_sra(temp3, vctwo);
+        pixelssum1 = vec_add(pixelssum2, vcone);
+        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
+
+        if (rightside) {
+            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
+        } else {
+            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
+        }
+
+        vec_st(blockv, 0, block);
+
+        block += line_size;
+        pixels += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
+}
+
+/* next one assumes that ((line_size % 16) == 0); 16-wide xy2: high and low 8 pixels are summed separately (mergeh/mergel) and packed together */
+static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
+    register int i;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short temp3, temp4,
+        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
+    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
+
+    temp1 = vec_ld(0, pixels);
+    temp2 = vec_ld(16, pixels);
+    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
+    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
+        pixelsv2 = temp2;
+    } else {
+        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
+    }
+    pixelsv3 = vec_mergel(vczero, pixelsv1);
+    pixelsv4 = vec_mergel(vczero, pixelsv2);
+    pixelsv1 = vec_mergeh(vczero, pixelsv1);
+    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
+                         (vector unsigned short)pixelsv4);
+    pixelssum3 = vec_add(pixelssum3, vctwo);
+    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                         (vector unsigned short)pixelsv2);
+    pixelssum1 = vec_add(pixelssum1, vctwo);
+
+    for (i = 0; i < h ; i++) {
+        blockv = vec_ld(0, block); /* NOTE(review): overwritten by vec_packsu below before any use */
+
+        temp1 = vec_ld(line_size, pixels);
+        temp2 = vec_ld(line_size + 16, pixels);
+        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
+            pixelsv2 = temp2;
+        } else {
+            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+        }
+
+        pixelsv3 = vec_mergel(vczero, pixelsv1);
+        pixelsv4 = vec_mergel(vczero, pixelsv2);
+        pixelsv1 = vec_mergeh(vczero, pixelsv1);
+        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+
+        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
+                             (vector unsigned short)pixelsv4);
+        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                             (vector unsigned short)pixelsv2);
+        temp4 = vec_add(pixelssum3, pixelssum4);
+        temp4 = vec_sra(temp4, vctwo);
+        temp3 = vec_add(pixelssum1, pixelssum2);
+        temp3 = vec_sra(temp3, vctwo);
+
+        pixelssum3 = vec_add(pixelssum4, vctwo);
+        pixelssum1 = vec_add(pixelssum2, vctwo);
+
+        blockv = vec_packsu(temp3, temp4);
+
+        vec_st(blockv, 0, block);
+
+        block += line_size;
+        pixels += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
+}
+
+/* next one assumes that ((line_size % 16) == 0); same as put_pixels16_xy2_altivec but the added bias is 1 (vcone) instead of 2 */
+static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
+    register int i;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
+    register vector unsigned char blockv, temp1, temp2;
+    register vector unsigned short temp3, temp4,
+        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
+    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
+    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
+
+POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
+
+    temp1 = vec_ld(0, pixels);
+    temp2 = vec_ld(16, pixels);
+    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
+    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
+        pixelsv2 = temp2;
+    } else {
+        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
+    }
+    pixelsv3 = vec_mergel(vczero, pixelsv1);
+    pixelsv4 = vec_mergel(vczero, pixelsv2);
+    pixelsv1 = vec_mergeh(vczero, pixelsv1);
+    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
+                         (vector unsigned short)pixelsv4);
+    pixelssum3 = vec_add(pixelssum3, vcone);
+    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                         (vector unsigned short)pixelsv2);
+    pixelssum1 = vec_add(pixelssum1, vcone);
+
+    for (i = 0; i < h ; i++) {
+        blockv = vec_ld(0, block); /* NOTE(review): overwritten by vec_packsu below before any use */
+
+        temp1 = vec_ld(line_size, pixels);
+        temp2 = vec_ld(line_size + 16, pixels);
+        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
+            pixelsv2 = temp2;
+        } else {
+            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+        }
+
+        pixelsv3 = vec_mergel(vczero, pixelsv1);
+        pixelsv4 = vec_mergel(vczero, pixelsv2);
+        pixelsv1 = vec_mergeh(vczero, pixelsv1);
+        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+
+        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
+                             (vector unsigned short)pixelsv4);
+        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                             (vector unsigned short)pixelsv2);
+        temp4 = vec_add(pixelssum3, pixelssum4);
+        temp4 = vec_sra(temp4, vctwo);
+        temp3 = vec_add(pixelssum1, pixelssum2);
+        temp3 = vec_sra(temp3, vctwo);
+
+        pixelssum3 = vec_add(pixelssum4, vcone);
+        pixelssum1 = vec_add(pixelssum2, vcone);
+
+        blockv = vec_packsu(temp3, temp4);
+
+        vec_st(blockv, 0, block);
+
+        block += line_size;
+        pixels += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
+}
+
+/* next one assumes that ((line_size % 8) == 0); computes the put_pixels8_xy2 result, then vec_avg()s it with the existing block contents */
+static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
+    register int i;
+    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
+    register vector unsigned char blockv, temp1, temp2, blocktemp;
+    register vector unsigned short pixelssum1, pixelssum2, temp3;
+
+    register const vector unsigned char vczero = (const vector unsigned char)
+                                        vec_splat_u8(0);
+    register const vector unsigned short vctwo = (const vector unsigned short)
+                                         vec_splat_u16(2);
+
+    temp1 = vec_ld(0, pixels);
+    temp2 = vec_ld(16, pixels);
+    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
+    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
+        pixelsv2 = temp2;
+    } else {
+        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
+    }
+    pixelsv1 = vec_mergeh(vczero, pixelsv1);
+    pixelsv2 = vec_mergeh(vczero, pixelsv2);
+    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
+                         (vector unsigned short)pixelsv2);
+    pixelssum1 = vec_add(pixelssum1, vctwo);
+
+POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
+    for (i = 0; i < h ; i++) {
+        int rightside = ((unsigned long)block & 0x0000000F);
+        blockv = vec_ld(0, block);
+
+        temp1 = vec_ld(line_size, pixels);
+        temp2 = vec_ld(line_size + 16, pixels);
+        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
+        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
+            pixelsv2 = temp2;
+        } else {
+            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
+        }
+
+        pixelsv1 = vec_mergeh(vczero, pixelsv1);
+        pixelsv2 = vec_mergeh(vczero, pixelsv2);
+        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
+                             (vector unsigned short)pixelsv2);
+        temp3 = vec_add(pixelssum1, pixelssum2);
+        temp3 = vec_sra(temp3, vctwo);
+        pixelssum1 = vec_add(pixelssum2, vctwo);
+        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
+
+        if (rightside) {
+            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
+        } else {
+            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
+        }
+
+        blockv = vec_avg(blocktemp, blockv);
+        vec_st(blockv, 0, block);
+
+        block += line_size;
+        pixels += line_size;
+    }
+
+POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
+}
+/* Install the AltiVec routines of this file into the function-pointer slots of DSPContext c. */
+void dsputil_init_altivec(DSPContext* c)
+{
+    c->diff_pixels = diff_pixels_altivec;
+    c->get_pixels = get_pixels_altivec;
+    c->clear_block = clear_block_altivec;
+
+    c->put_pixels_tab[0][0] = put_pixels16_altivec;
+    /* the two functions do the same thing, so use the same code */
+    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
+    c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
+    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
+    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
+    c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
+    c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
+
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/dsputil_altivec.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/dsputil_altivec.h Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2002 Brian Foley
+ * Copyright (c) 2002 Dieter Shirley
+ * Copyright (c) 2003-2004 Romain Dolbeau
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_PPC_DSPUTIL_ALTIVEC_H +#define AVCODEC_PPC_DSPUTIL_ALTIVEC_H + +#include +#include "libavcodec/dsputil.h" + +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); + +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); + +int has_altivec(void); + +void fdct_altivec(int16_t *block); +void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, + int x16, int y16, int rounder); +void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); +void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); + +void ff_vp3_idct_altivec(DCTELEM *block); +void ff_vp3_idct_put_altivec(uint8_t *dest, int line_size, DCTELEM *block); +void ff_vp3_idct_add_altivec(uint8_t *dest, int line_size, DCTELEM *block); + +void dsputil_h264_init_ppc(DSPContext* c); + +void dsputil_init_altivec(DSPContext* c); +//void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx); +//void float_init_altivec(DSPContext* c, AVCodecContext *avctx); +//void int_init_altivec(DSPContext* c, AVCodecContext *avctx); + +#endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/dsputil_ppc.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/ppc/dsputil_ppc.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2002 Brian Foley + * Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003-2004 Romain Dolbeau + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/dsputil.h" +#include "dsputil_ppc.h" +#include "dsputil_altivec.h" + +static void prefetch_ppc(void *mem, int stride, int h) +{ + register const uint8_t *p = mem; + do { + __asm__ volatile ("dcbt 0,%0" : : "r" (p)); + p+= stride; + } while(--h); +} + +void dsputil_init_ppc(DSPContext* c) +{ + c->prefetch = prefetch_ppc; + +#if HAVE_ALTIVEC + dsputil_h264_init_ppc(c); + dsputil_init_altivec(c); + + c->idct_put = idct_put_altivec; + c->idct_add = idct_add_altivec; + +#endif /* HAVE_ALTIVEC */ +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/dsputil_ppc.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/ppc/dsputil_ppc.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2003-2004 Romain Dolbeau + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_PPC_DSPUTIL_PPC_H
+#define AVCODEC_PPC_DSPUTIL_PPC_H
+
+#include "config.h"
+
+#if CONFIG_POWERPC_PERF
+void powerpc_display_perf_report(void);
+/* The 604* have 2 performance-monitor counters (PMCs), the G3* have 4,
+   the G4s have 6, and the G5s are completely different (they MUST use
+   ARCH_PPC64; let's hope all future 64-bit PPCs
+   will use the same PMCs...). */
+#define POWERPC_NUM_PMC_ENABLED 6
+/* If you add to the enum below, also add to the perfname array in dsputil_ppc.c.
+   NOTE(review): the dsputil_ppc.c added by this changeset has no perfname array or perfdata definition -- confirm. */
+enum powerpc_perf_index {
+    altivec_fft_num = 0,
+    altivec_gmc1_num,
+    altivec_dct_unquantize_h263_num,
+    altivec_fdct,
+    altivec_idct_add_num,
+    altivec_idct_put_num,
+    altivec_put_pixels16_num,
+    altivec_avg_pixels16_num,
+    altivec_avg_pixels8_num,
+    altivec_put_pixels8_xy2_num,
+    altivec_put_no_rnd_pixels8_xy2_num,
+    altivec_put_pixels16_xy2_num,
+    altivec_put_no_rnd_pixels16_xy2_num,
+    altivec_hadamard8_diff8x8_num,
+    altivec_hadamard8_diff16_num,
+    altivec_avg_pixels8_xy2_num,
+    powerpc_clear_blocks_dcbz32,
+    powerpc_clear_blocks_dcbz128,
+    altivec_put_h264_chroma_mc8_num,
+    altivec_avg_h264_chroma_mc8_num,
+    altivec_put_h264_qpel16_h_lowpass_num,
+    altivec_avg_h264_qpel16_h_lowpass_num,
+    altivec_put_h264_qpel16_v_lowpass_num,
+    altivec_avg_h264_qpel16_v_lowpass_num,
+    altivec_put_h264_qpel16_hv_lowpass_num,
+    altivec_avg_h264_qpel16_hv_lowpass_num,
+    powerpc_perf_total /* number of entries; sizes the middle dimension of perfdata */
+};
+enum powerpc_data_index {
+    powerpc_data_min = 0, /* smallest (stop - start) delta seen */
+    powerpc_data_max,     /* largest delta seen */
+    powerpc_data_sum,     /* running sum of deltas */
+    powerpc_data_num,     /* number of deltas accumulated */
+    powerpc_data_total    /* number of entries; sizes the last dimension of perfdata */
+};
+extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; /* [counter][function][statistic] */
+
+#if !ARCH_PPC64
+#define POWERP_PMC_DATATYPE unsigned long
+#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 937" : "=r" (a)) /* read special-purpose register 937 into a */
+#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 938" : "=r" (a))
+#if (POWERPC_NUM_PMC_ENABLED > 2)
+#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 941" : "=r" (a))
+#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 942" : "=r" (a))
+#else
+#define POWERPC_GET_PMC3(a) do {} while (0)
+#define POWERPC_GET_PMC4(a) do {} while (0)
+#endif
+#if (POWERPC_NUM_PMC_ENABLED > 4)
+#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 929" : "=r" (a))
+#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 930" : "=r" (a))
+#else
+#define POWERPC_GET_PMC5(a) do {} while (0)
+#define POWERPC_GET_PMC6(a) do {} while (0)
+#endif
+#else /* ARCH_PPC64 */
+#define POWERP_PMC_DATATYPE unsigned long long
+#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 771" : "=r" (a)) /* 64-bit build reads SPRs 771..776 instead */
+#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 772" : "=r" (a))
+#if (POWERPC_NUM_PMC_ENABLED > 2)
+#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 773" : "=r" (a))
+#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 774" : "=r" (a))
+#else
+#define POWERPC_GET_PMC3(a) do {} while (0)
+#define POWERPC_GET_PMC4(a) do {} while (0)
+#endif
+#if (POWERPC_NUM_PMC_ENABLED > 4)
+#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 775" : "=r" (a))
+#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 776" : "=r" (a))
+#else
+#define POWERPC_GET_PMC5(a) do {} while (0)
+#define POWERPC_GET_PMC6(a) do {} while (0)
+#endif
+#endif /* ARCH_PPC64 */
+#define POWERPC_PERF_DECLARE(a, cond) /* declares the locals used by START/STOP; a and cond are unused; expansion already ends in ';' */ \
+    POWERP_PMC_DATATYPE \
+        pmc_start[POWERPC_NUM_PMC_ENABLED], \
+        pmc_stop[POWERPC_NUM_PMC_ENABLED], \
+        pmc_loop_index;
+#define POWERPC_PERF_START_COUNT(a, cond) do { /* snapshot counters 6 down to 1 into pmc_start[]; a and cond are unused */ \
+    POWERPC_GET_PMC6(pmc_start[5]); \
+    POWERPC_GET_PMC5(pmc_start[4]); \
+    POWERPC_GET_PMC4(pmc_start[3]); \
+    POWERPC_GET_PMC3(pmc_start[2]); \
+    POWERPC_GET_PMC2(pmc_start[1]); \
+    POWERPC_GET_PMC1(pmc_start[0]); \
+    } while (0)
+#define POWERPC_PERF_STOP_COUNT(a, cond) do { /* snapshot counters 1 up to 6, then fold the deltas into perfdata[..][a][..] if cond */ \
+    POWERPC_GET_PMC1(pmc_stop[0]); \
+    POWERPC_GET_PMC2(pmc_stop[1]); \
+    POWERPC_GET_PMC3(pmc_stop[2]); \
+    POWERPC_GET_PMC4(pmc_stop[3]); \
+    POWERPC_GET_PMC5(pmc_stop[4]); \
+    POWERPC_GET_PMC6(pmc_stop[5]); \
+    if (cond) { \
+        for(pmc_loop_index = 0; \
+            pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
+            pmc_loop_index++) { \
+            if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) { /* samples where stop < start are dropped */ \
+                POWERP_PMC_DATATYPE diff = \
+                    pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \
+                if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
+                    perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \
+                if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \
+                    perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \
+                perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \
+                perfdata[pmc_loop_index][a][powerpc_data_num] ++; \
+            } \
+        } \
+    } \
+} while (0)
+#else /* CONFIG_POWERPC_PERF */
+// Profiling disabled: these stand-ins keep "MACRO(...);" from expanding to an empty statement.
+#define POWERPC_PERF_DECLARE(a, cond) int altivec_placeholder __attribute__ ((unused))
+#define POWERPC_PERF_START_COUNT(a, cond) do {} while (0)
+#define POWERPC_PERF_STOP_COUNT(a, cond) do {} while (0)
+#endif /* CONFIG_POWERPC_PERF */
+
+#endif /* AVCODEC_PPC_DSPUTIL_PPC_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/h264_altivec.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/ppc/h264_altivec.c Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1021 @@
+/*
+ * Copyright (c) 2004 Romain Dolbeau
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/dsputil.h" +#include "libavcodec/h264_data.h" +#include "libavcodec/h264_dsp.h" + +#include "dsputil_ppc.h" +#include "dsputil_altivec.h" +#include "util_altivec.h" +#include "types_altivec.h" + +#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s +#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) + +#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC +#define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec +#define PREFIX_no_rnd_vc1_chroma_mc8_altivec put_no_rnd_vc1_chroma_mc8_altivec +#define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num +#define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec +#define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num +#define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec +#define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num +#define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec +#define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num +#include "h264_template_altivec.c" +#undef OP_U8_ALTIVEC +#undef PREFIX_h264_chroma_mc8_altivec +#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec +#undef PREFIX_h264_chroma_mc8_num +#undef PREFIX_h264_qpel16_h_lowpass_altivec +#undef PREFIX_h264_qpel16_h_lowpass_num +#undef PREFIX_h264_qpel16_v_lowpass_altivec +#undef PREFIX_h264_qpel16_v_lowpass_num +#undef PREFIX_h264_qpel16_hv_lowpass_altivec +#undef PREFIX_h264_qpel16_hv_lowpass_num + +#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC +#define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec +#define PREFIX_no_rnd_vc1_chroma_mc8_altivec avg_no_rnd_vc1_chroma_mc8_altivec +#define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num +#define 
PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec +#define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num +#define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec +#define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num +#define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec +#define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num +#include "h264_template_altivec.c" +#undef OP_U8_ALTIVEC +#undef PREFIX_h264_chroma_mc8_altivec +#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec +#undef PREFIX_h264_chroma_mc8_num +#undef PREFIX_h264_qpel16_h_lowpass_altivec +#undef PREFIX_h264_qpel16_h_lowpass_num +#undef PREFIX_h264_qpel16_v_lowpass_altivec +#undef PREFIX_h264_qpel16_v_lowpass_num +#undef PREFIX_h264_qpel16_hv_lowpass_altivec +#undef PREFIX_h264_qpel16_hv_lowpass_num + +#define H264_MC(OPNAME, SIZE, CODETYPE) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \ + DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\ +}\ +\ 
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ + DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ + DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ + DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ + 
put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ + DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ + DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ + DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ + DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ + DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## 
CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
+}\
+/* mc12: v_lowpass output (halfV) and hv_lowpass output (halfHV) combined by the two-source _l2_ routine */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
+}\
+/* mc32: same as mc12, except that the v_lowpass pass reads from src+1 */\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
+    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
+    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
+}\
+/* For h rows: dst row = vec_avg(src1 row, src2 row), 16 bytes each; src1 rows are src_stride1 apart, src2 rows 16 apart. */
+static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
+                                    const uint8_t * src2, int dst_stride,
+                                    int src_stride1, int h)
+{
+    int i;
+    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+
+    mask_ = vec_lvsl(0, src2); /* computed once: src2 is always read at i * 16, so its misalignment never changes */
+
+    for (i = 0; i < h; i++) {
+        /* a = the 16 bytes of src1 row i, assembled from the two aligned loads that cover them */
+        tmp1 = vec_ld(i * src_stride1, src1);
+        mask = vec_lvsl(i * src_stride1, src1);
+        tmp2 = vec_ld(i * src_stride1 + 15, src1);
+
+        a = vec_perm(tmp1, tmp2, mask);
+        /* b = the 16 bytes of src2 row i */
+        tmp1 = vec_ld(i * 16, src2);
+        tmp2 = vec_ld(i * 16 + 15, src2);
+
+        b = vec_perm(tmp1, tmp2, mask_);
+        /* current contents of the two aligned vectors covering dst, needed to store at an unaligned dst */
+        tmp1 = vec_ld(0, dst);
+        mask = vec_lvsl(0, dst);
+        tmp2 = vec_ld(15, dst);
+
+        d = vec_avg(a, b);
+
+        edges = vec_perm(tmp2, tmp1, mask);
+
+        align = vec_lvsr(0, dst);
+        /* merge the result d with the surrounding bytes, then write both covering vectors back */
+        tmp2 = vec_perm(d, edges, align);
+        tmp1 = vec_perm(edges, d, align);
+
+        vec_st(tmp2, 15, dst);
+        vec_st(tmp1, 0 , dst);
+
+        dst += dst_stride;
+    }
+}
+
+static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
+                                    const uint8_t * src2, int dst_stride,
+                                    int src_stride1, int h)
+{
+    int i;
+    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+    /* For h rows: dst row = vec_avg(old dst row, vec_avg(src1 row, src2 row)); src2 rows are 16 bytes apart. */
+    mask_ = vec_lvsl(0, src2); /* computed once: src2 is always read at i * 16 */
+
+    for (i = 0; i < h; i++) {
+        /* a = the 16 bytes of src1 row i */
+        tmp1 = vec_ld(i * src_stride1, src1);
+        mask = vec_lvsl(i * src_stride1, src1);
+        tmp2 = vec_ld(i * src_stride1 + 15, src1);
+
+        a = vec_perm(tmp1, tmp2, mask);
+        /* b = the 16 bytes of src2 row i */
+        tmp1 = vec_ld(i * 16, src2);
+        tmp2 = vec_ld(i * 16 + 15, src2);
+
+        b = vec_perm(tmp1, tmp2, mask_);
+        /* the two aligned vectors covering dst: used both as the third averaging operand and for the store */
+        tmp1 = vec_ld(0, dst);
+        mask = vec_lvsl(0, dst);
+        tmp2 = vec_ld(15, dst);
+
+        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
+
+        edges = vec_perm(tmp2, tmp1, mask);
+
+        align = vec_lvsr(0, dst);
+        /* merge the result d with the surrounding bytes, then write both covering vectors back */
+        tmp2 = vec_perm(d, edges, align);
+        tmp1 = vec_perm(edges, d, align);
+
+        vec_st(tmp2, 15, dst);
+        vec_st(tmp1, 0 , dst);
+
+        dst += dst_stride;
+    }
+}
+
+/* Implemented but could be faster
+#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
+#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
+ */
+
+H264_MC(put_, 16, altivec)
+H264_MC(avg_, 16, altivec)
+
+
+/****************************************************************************
+ * IDCT transform:
+ ****************************************************************************/
+
+#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) /* uses vz0..vz3 from the enclosing scope as scratch */ \
+    /* 1st stage */ \
+    vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \
+    vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \
+    vz2 = vec_sra(vb1,vec_splat_u16(1)); \
+    vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \
+    vz3 = vec_sra(vb3,vec_splat_u16(1)); \
+    vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \
+    /* 2nd stage: output */ \
+    va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \
+    va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \
+    va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \
+    va3 = vec_sub(vz0,vz3) /*
x[3] = temp[0] - temp[3] */
+
+#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) /* results are left in b0..b3; a0..a3 are overwritten on the way */ \
+    b0 = vec_mergeh( a0, a0 ); \
+    b1 = vec_mergeh( a1, a0 ); \
+    b2 = vec_mergeh( a2, a0 ); \
+    b3 = vec_mergeh( a3, a0 ); \
+    a0 = vec_mergeh( b0, b2 ); \
+    a1 = vec_mergel( b0, b2 ); \
+    a2 = vec_mergeh( b1, b3 ); \
+    a3 = vec_mergel( b1, b3 ); \
+    b0 = vec_mergeh( a0, a2 ); \
+    b1 = vec_mergel( a0, a2 ); \
+    b2 = vec_mergeh( a1, a3 ); \
+    b3 = vec_mergel( a1, a3 )
+
+#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) /* relies on dst, vdst_mask, element and the zero vectors of the caller; expansion ends in ';' */ \
+    vdst_orig = vec_ld(0, dst); \
+    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
+    vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst); \
+    va = vec_add(va, vdst_ss); \
+    va_u8 = vec_packsu(va, zero_s16v); \
+    va_u32 = vec_splat((vec_u32)va_u8, 0); \
+    vec_ste(va_u32, element, (uint32_t*)dst);
+/* Two 1-D passes with a transpose between them, >>6, then the result is added to 4 rows of dst (one 32-bit store per row). */
+static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
+{
+    vec_s16 va0, va1, va2, va3;
+    vec_s16 vz0, vz1, vz2, vz3;
+    vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
+    vec_u8 va_u8;
+    vec_u32 va_u32;
+    vec_s16 vdst_ss;
+    const vec_u16 v6us = vec_splat_u16(6);
+    vec_u8 vdst, vdst_orig;
+    vec_u8 vdst_mask = vec_lvsl(0, dst); /* NOTE(review): taken from the first row only and reused after dst += stride -- presumably stride keeps the 16-byte phase; confirm */
+    int element = ((unsigned long)dst & 0xf) >> 2; /* likewise computed once, from the initial dst */
+    LOAD_ZERO;
+
+    block[0] += 32; /* add 32 as a DC-level for rounding (this modifies the caller's block) */
+
+    vtmp0 = vec_ld(0,block);
+    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
+    vtmp2 = vec_ld(16,block);
+    vtmp3 = vec_sld(vtmp2, vtmp2, 8);
+
+    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
+    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
+    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
+
+    va0 = vec_sra(va0,v6us);
+    va1 = vec_sra(va1,v6us);
+    va2 = vec_sra(va2,v6us);
+    va3 = vec_sra(va3,v6us);
+
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
+    dst += stride;
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
+    dst += stride;
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
+    dst += stride;
+    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
+}
+
+#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
+    /* a0 = SRC(0) + SRC(4); */ \
+
vec_s16 a0v = vec_add(s0, s4); \ + /* a2 = SRC(0) - SRC(4); */ \ + vec_s16 a2v = vec_sub(s0, s4); \ + /* a4 = (SRC(2)>>1) - SRC(6); */ \ + vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6); \ + /* a6 = (SRC(6)>>1) + SRC(2); */ \ + vec_s16 a6v = vec_add(vec_sra(s6, onev), s2); \ + /* b0 = a0 + a6; */ \ + vec_s16 b0v = vec_add(a0v, a6v); \ + /* b2 = a2 + a4; */ \ + vec_s16 b2v = vec_add(a2v, a4v); \ + /* b4 = a2 - a4; */ \ + vec_s16 b4v = vec_sub(a2v, a4v); \ + /* b6 = a0 - a6; */ \ + vec_s16 b6v = vec_sub(a0v, a6v); \ + /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ + /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ + vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \ + /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ + /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ + vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ + /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ + /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ + vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ + /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ + vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ + /* b1 = (a7>>2) + a1; */ \ + vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \ + /* b3 = a3 + (a5>>2); */ \ + vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \ + /* b5 = (a3>>2) - a5; */ \ + vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \ + /* b7 = a7 - (a1>>2); */ \ + vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ + /* DST(0, b0 + b7); */ \ + d0 = vec_add(b0v, b7v); \ + /* DST(1, b2 + b5); */ \ + d1 = vec_add(b2v, b5v); \ + /* DST(2, b4 + b3); */ \ + d2 = vec_add(b4v, b3v); \ + /* DST(3, b6 + b1); */ \ + d3 = vec_add(b6v, b1v); \ + /* DST(4, b6 - b1); */ \ + d4 = vec_sub(b6v, b1v); \ + /* DST(5, b4 - b3); */ \ + d5 = vec_sub(b4v, b3v); \ + /* DST(6, b2 - b5); */ \ + d6 = vec_sub(b2v, b5v); \ + /* DST(7, b0 - b7); */ \ + d7 = vec_sub(b0v, b7v); \ +} + +#define 
ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
+    /* unaligned load */ \
+    vec_u8 hv = vec_ld( 0, dest ); \
+    vec_u8 lv = vec_ld( 7, dest ); \
+    vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv ); \
+    vec_s16 idct_sh6 = vec_sra(idctv, sixv); \
+    vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv); \
+    vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \
+    vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \
+    vec_u8 edgehv; \
+    /* unaligned store */ \
+    vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\
+    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
+    lv = vec_sel( lv, bodyv, edgelv ); \
+    vec_st( lv, 7, dest ); \
+    hv = vec_ld( 0, dest ); \
+    edgehv = vec_perm( zero_u8v, sel, perm_stv ); \
+    hv = vec_sel( hv, bodyv, edgehv ); \
+    vec_st( hv, 0, dest ); \
+    }
+/* Two 8-point 1-D passes with TRANSPOSE8 between them; each of the 8 result rows is >>6, added to a dst row and clipped. */
+static void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
+    vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
+    vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
+    vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
+    /* NOTE(review): both permute vectors come from row 0 of dst and are reused for all 8 rows -- presumably stride keeps the 16-byte phase; confirm */
+    vec_u8 perm_ldv = vec_lvsl(0, dst);
+    vec_u8 perm_stv = vec_lvsr(8, dst);
+
+    const vec_u16 onev = vec_splat_u16(1);
+    const vec_u16 twov = vec_splat_u16(2);
+    const vec_u16 sixv = vec_splat_u16(6);
+
+    const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
+    LOAD_ZERO;
+
+    dct[0] += 32; // rounding for the >>6 at the end
+
+    s0 = vec_ld(0x00, (int16_t*)dct);
+    s1 = vec_ld(0x10, (int16_t*)dct);
+    s2 = vec_ld(0x20, (int16_t*)dct);
+    s3 = vec_ld(0x30, (int16_t*)dct);
+    s4 = vec_ld(0x40, (int16_t*)dct);
+    s5 = vec_ld(0x50, (int16_t*)dct);
+    s6 = vec_ld(0x60, (int16_t*)dct);
+    s7 = vec_ld(0x70, (int16_t*)dct);
+
+    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
+                     d0, d1, d2, d3, d4, d5, d6, d7);
+
+    TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 );
+
+    IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
+                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
+
+    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
+    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
+}
+/* DC-only case: adds dc = (block[0] + 32) >> 6, with saturation, to `size` rows of dst; size is 4 or 8 in the callers below. */
+static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size)
+{
+    vec_s16 dc16;
+    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
+    LOAD_ZERO;
+    DECLARE_ALIGNED(16, int, dc);
+    int i;
+
+    dc = (block[0] + 32) >> 6;
+    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); /* NOTE(review): halfword 1 of the int is its low half only on big-endian -- confirm target */
+
+    if (size == 4)
+        dc16 = vec_sld(dc16, zero_s16v, 8); /* keep dc in the first 4 halfwords only, zero in the rest */
+    dcplus = vec_packsu(dc16, zero_s16v); /* max(dc, 0) as unsigned bytes (packsu saturates negatives to 0) */
+    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v); /* max(-dc, 0) as unsigned bytes */
+
+    aligner = vec_lvsr(0, dst); /* rotate both vectors so the dc bytes line up with dst inside its aligned 16-byte line */
+    dcplus = vec_perm(dcplus, dcplus, aligner);
+    dcminus = vec_perm(dcminus, dcminus, aligner);
+
+    for (i = 0; i < size; i += 4) { /* four rows per iteration */
+        v0 = vec_ld(0, dst+0*stride);
+        v1 = vec_ld(0, dst+1*stride);
+        v2 = vec_ld(0, dst+2*stride);
+        v3 = vec_ld(0, dst+3*stride);
+        /* saturating add of the positive part ... */
+        v0 = vec_adds(v0, dcplus);
+        v1 = vec_adds(v1, dcplus);
+        v2 = vec_adds(v2, dcplus);
+        v3 = vec_adds(v3, dcplus);
+        /* ... then saturating subtract of the negative part: together a clamped "+ dc" */
+        v0 = vec_subs(v0, dcminus);
+        v1 = vec_subs(v1, dcminus);
+        v2 = vec_subs(v2, dcminus);
+        v3 = vec_subs(v3, dcminus);
+
+        vec_st(v0, 0, dst+0*stride);
+        vec_st(v1, 0, dst+1*stride);
+        vec_st(v2, 0, dst+2*stride);
+        vec_st(v3, 0, dst+3*stride);
+
+        dst += 4*stride;
+    }
+}
+/* DC-only add with size = 4. */
+static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
+{
+    h264_idct_dc_add_internal(dst, block, stride, 4);
+}
+/* DC-only add with size = 8. */
+static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
+{
+    h264_idct_dc_add_internal(dst, block, stride, 8);
+}
+
+static void
ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for(i=0; i<16; i++){ /* 16 blocks of 16 coefficients each */
+        int nnz = nnzc[ scan8[i] ]; /* scan8 is defined outside this chunk */
+        if(nnz){
+            if(nnz==1 && block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride); /* count of 1 and first coefficient non-zero: DC-only path */
+            else ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
+        }
+    }
+}
+/* Same 16-block walk, different test: a non-zero nnzc entry takes the full transform, otherwise a non-zero first coefficient takes the DC-only path. */
+static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for(i=0; i<16; i++){
+        if(nnzc[ scan8[i] ]) ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
+        else if(block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
+    }
+}
+/* 8x8 variant: visits i = 0, 4, 8, 12 and dispatches to the idct8 routines with the same nnz test as add16. */
+static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for(i=0; i<16; i+=4){
+        int nnz = nnzc[ scan8[i] ];
+        if(nnz){
+            if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
+            else ff_h264_idct8_add_altivec (dst + block_offset[i], block + i*16, stride);
+        }
+    }
+}
+/* Blocks 16..23: blocks 16..19 go to dest[0] and 20..23 to dest[1], selected by (i&4)>>2. */
+static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+    int i;
+    for(i=16; i<16+8; i++){
+        if(nnzc[ scan8[i] ])
+            ff_h264_idct_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+        else if(block[i*16])
+            h264_idct_dc_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+    }
+}
+
+#define transpose4x16(r0, r1, r2, r3) { /* operates in place on r0..r3 using two rounds of byte merges */ \
+    register vec_u8 r4; \
+    register vec_u8 r5; \
+    register vec_u8 r6; \
+    register vec_u8 r7; \
+    \
+    r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
+    r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
+    r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \
+    r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \
+    \
+    r0 = vec_mergeh(r4, r6); /*all set 0*/ \
+    r1 = vec_mergel(r4, r6); /*all set 1*/ \
+    r2 = vec_mergeh(r5, r7); /*all set 2*/ \
+
r3 = vec_mergel(r5, r7); /*all set 3*/ \
+}
+/* Spills r0..r3 (64 bytes) to a scratch buffer, then writes one 32-bit word to each of 16 consecutive dst rows. */
+static inline void write16x4(uint8_t *dst, int dst_stride,
+                             register vec_u8 r0, register vec_u8 r1,
+                             register vec_u8 r2, register vec_u8 r3) {
+    DECLARE_ALIGNED(16, unsigned char, result)[64];
+    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; /* NOTE(review): dst is accessed as uint32_t; nothing here checks its 4-byte alignment */
+    int int_dst_stride = dst_stride/4; /* row pitch in 32-bit words; truncates unless dst_stride is a multiple of 4 -- TODO confirm callers */
+
+    vec_st(r0, 0, result);
+    vec_st(r1, 16, result);
+    vec_st(r2, 32, result);
+    vec_st(r3, 48, result);
+    /* FIXME: there has to be a better way!!!! */
+    *dst_int = *src_int;
+    *(dst_int+ int_dst_stride) = *(src_int + 1);
+    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
+    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
+    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
+    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
+    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
+    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
+    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
+    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
+    *(dst_int+10*int_dst_stride) = *(src_int + 10);
+    *(dst_int+11*int_dst_stride) = *(src_int + 11);
+    *(dst_int+12*int_dst_stride) = *(src_int + 12);
+    *(dst_int+13*int_dst_stride) = *(src_int + 13);
+    *(dst_int+14*int_dst_stride) = *(src_int + 14);
+    *(dst_int+15*int_dst_stride) = *(src_int + 15);
+}
+
+/** \brief loads 16 rows from src and transposes them; the results are left in r8..r13 (the macro has no dst)
+    \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them
+    out of unaligned_load() */
+#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
+    register vec_u8 r0 = unaligned_load(0, src); \
+    register vec_u8 r1 = unaligned_load( src_stride, src); \
+    register vec_u8 r2 = unaligned_load(2* src_stride, src); \
+    register vec_u8 r3 = unaligned_load(3* src_stride, src); \
+    register vec_u8 r4 = unaligned_load(4* src_stride, src); \
+    register vec_u8 r5 = unaligned_load(5* src_stride, src); \
+    register vec_u8 r6 = unaligned_load(6* src_stride, src); \
+    register vec_u8 r7 = unaligned_load(7* src_stride, src); \
+
register vec_u8 r14 = unaligned_load(14*src_stride, src); \ + register vec_u8 r15 = unaligned_load(15*src_stride, src); \ + \ + r8 = unaligned_load( 8*src_stride, src); \ + r9 = unaligned_load( 9*src_stride, src); \ + r10 = unaligned_load(10*src_stride, src); \ + r11 = unaligned_load(11*src_stride, src); \ + r12 = unaligned_load(12*src_stride, src); \ + r13 = unaligned_load(13*src_stride, src); \ + \ + /*Merge first pairs*/ \ + r0 = vec_mergeh(r0, r8); /*0, 8*/ \ + r1 = vec_mergeh(r1, r9); /*1, 9*/ \ + r2 = vec_mergeh(r2, r10); /*2,10*/ \ + r3 = vec_mergeh(r3, r11); /*3,11*/ \ + r4 = vec_mergeh(r4, r12); /*4,12*/ \ + r5 = vec_mergeh(r5, r13); /*5,13*/ \ + r6 = vec_mergeh(r6, r14); /*6,14*/ \ + r7 = vec_mergeh(r7, r15); /*7,15*/ \ + \ + /*Merge second pairs*/ \ + r8 = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \ + r9 = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \ + r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \ + r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \ + r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \ + r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \ + r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \ + r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \ + \ + /*Third merge*/ \ + r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \ + r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \ + r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \ + r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \ + r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \ + r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \ + /* Don't need to compute 3 and 7*/ \ + \ + /*Final merge*/ \ + r8 = vec_mergeh(r0, r4); /*all set 0*/ \ + r9 = vec_mergel(r0, r4); /*all set 1*/ \ + r10 = vec_mergeh(r1, r5); /*all set 2*/ \ + r11 = vec_mergel(r1, r5); /*all set 3*/ \ + r12 = vec_mergeh(r2, r6); /*all set 4*/ \ + r13 = vec_mergel(r2, r6); /*all set 5*/ \ + /* Don't need to compute 14 and 15*/ \ + \ +} + +// out: o = |x-y| < a +static inline vec_u8 diff_lt_altivec ( register 
vec_u8 x,
+                                       register vec_u8 y,
+                                       register vec_u8 a) {
+    /* unsigned saturating subtraction: one of (x-y), (y-x) is 0 and the other is |x-y|, so OR-ing them gives |x-y| */
+    register vec_u8 diff = vec_subs(x, y);
+    register vec_u8 diffneg = vec_subs(y, x);
+    register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */
+    o = (vec_u8)vec_cmplt(o, a); /* per byte: all ones where |x-y| < a, else zero */
+    return o;
+}
+/* Per-byte mask, all ones where |p0-q0| < alpha and |p1-p0| < beta and |q1-q0| < beta. */
+static inline vec_u8 h264_deblock_mask ( register vec_u8 p0,
+                                         register vec_u8 p1,
+                                         register vec_u8 q0,
+                                         register vec_u8 q1,
+                                         register vec_u8 alpha,
+                                         register vec_u8 beta) {
+
+    register vec_u8 mask;
+    register vec_u8 tempmask;
+
+    mask = diff_lt_altivec(p0, q0, alpha);
+    tempmask = diff_lt_altivec(p1, p0, beta);
+    mask = vec_and(mask, tempmask);
+    tempmask = diff_lt_altivec(q1, q0, beta);
+    mask = vec_and(mask, tempmask);
+
+    return mask;
+}
+
+// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
+static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
+                                     register vec_u8 p1,
+                                     register vec_u8 p2,
+                                     register vec_u8 q0,
+                                     register vec_u8 tc0) {
+
+    register vec_u8 average = vec_avg(p0, q0);
+    register vec_u8 temp;
+    register vec_u8 uncliped;
+    register vec_u8 ones;
+    register vec_u8 max;
+    register vec_u8 min;
+    register vec_u8 newp1;
+
+    temp = vec_xor(average, p2);
+    average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */
+    ones = vec_splat_u8(1);
+    temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */
+    uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 -- removes the round-up that vec_avg applied */
+    max = vec_adds(p1, tc0); /* saturating, so the upper clip bound stays <= 255 */
+    min = vec_subs(p1, tc0); /* saturating, so the lower clip bound stays >= 0 */
+    newp1 = vec_max(min, uncliped);
+    newp1 = vec_min(max, newp1);
+    return newp1;
+}
+
+#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
+    \
+    const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); /* 10 << 4 = 160 in every byte */ \
+    \
+    register vec_u8 pq0bit = vec_xor(p0,q0); \
+    register vec_u8 q1minus; \
+    register vec_u8 p0minus; \
+    register vec_u8 stage1; \
+    register vec_u8 stage2; \
+    register vec_u8 vec160; \
+    register vec_u8 delta; \
+    register vec_u8 deltaneg; \
+    \
+    q1minus = vec_nor(q1, q1); /* 255 - q1 */ \
+    stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
+    stage2 =
vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ + p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ + stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \ + pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \ + stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \ + stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \ + vec160 = vec_ld(0, &A0v); \ + deltaneg = vec_subs(vec160, stage2); /* -d */ \ + delta = vec_subs(stage2, vec160); /* d */ \ + deltaneg = vec_min(tc0masked, deltaneg); \ + delta = vec_min(tc0masked, delta); \ + p0 = vec_subs(p0, deltaneg); \ + q0 = vec_subs(q0, delta); \ + p0 = vec_adds(p0, delta); \ + q0 = vec_adds(q0, deltaneg); \ +} + +#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ + DECLARE_ALIGNED(16, unsigned char, temp)[16]; \ + register vec_u8 alphavec; \ + register vec_u8 betavec; \ + register vec_u8 mask; \ + register vec_u8 p1mask; \ + register vec_u8 q1mask; \ + register vector signed char tc0vec; \ + register vec_u8 finaltc0; \ + register vec_u8 tc0masked; \ + register vec_u8 newp1; \ + register vec_u8 newq1; \ + \ + temp[0] = alpha; \ + temp[1] = beta; \ + alphavec = vec_ld(0, temp); \ + betavec = vec_splat(alphavec, 0x1); \ + alphavec = vec_splat(alphavec, 0x0); \ + mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \ + \ + *((int *)temp) = *((int *)tc0); \ + tc0vec = vec_ld(0, (signed char*)temp); \ + tc0vec = vec_mergeh(tc0vec, tc0vec); \ + tc0vec = vec_mergeh(tc0vec, tc0vec); \ + mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ + finaltc0 = vec_and((vec_u8)tc0vec, mask); /* tc = tc0 */ \ + \ + p1mask = diff_lt_altivec(p2, p0, betavec); \ + p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \ + tc0masked = vec_and(p1mask, (vec_u8)tc0vec); \ + finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ + newp1 = h264_deblock_q1(p0, p1, p2, q0, 
tc0masked); \ + /*end if*/ \ + \ + q1mask = diff_lt_altivec(q2, q0, betavec); \ + q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ + tc0masked = vec_and(q1mask, (vec_u8)tc0vec); \ + finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ + newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ + /*end if*/ \ + \ + h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ + p1 = newp1; \ + q1 = newq1; \ +} + +static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { + + if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { + register vec_u8 p2 = vec_ld(-3*stride, pix); + register vec_u8 p1 = vec_ld(-2*stride, pix); + register vec_u8 p0 = vec_ld(-1*stride, pix); + register vec_u8 q0 = vec_ld(0, pix); + register vec_u8 q1 = vec_ld(stride, pix); + register vec_u8 q2 = vec_ld(2*stride, pix); + h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); + vec_st(p1, -2*stride, pix); + vec_st(p0, -1*stride, pix); + vec_st(q0, 0, pix); + vec_st(q1, stride, pix); + } +} + +static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { + + register vec_u8 line0, line1, line2, line3, line4, line5; + if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) + return; + readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); + h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); + transpose4x16(line1, line2, line3, line4); + write16x4(pix-2, stride, line1, line2, line3, line4); +} + +static av_always_inline +void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h) +{ + int y, aligned; + vec_u8 vblock; + vec_s16 vtemp, vweight, voffset, v0, v1; + vec_u16 vlog2_denom; + DECLARE_ALIGNED(16, int32_t, temp)[4]; + LOAD_ZERO; + + offset <<= log2_denom; + if(log2_denom) offset += 1<<(log2_denom-1); + temp[0] = log2_denom; + temp[1] = weight; + temp[2] = offset; + + vtemp = (vec_s16)vec_ld(0, 
temp); + vlog2_denom = (vec_u16)vec_splat(vtemp, 1); + vweight = vec_splat(vtemp, 3); + voffset = vec_splat(vtemp, 5); + aligned = !((unsigned long)block & 0xf); + + for (y=0; yput_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; + c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; + +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ + c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \ + c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \ + c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \ + c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \ + c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \ + c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \ + c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \ + c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \ + c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \ + c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \ + c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \ + c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \ + c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \ + c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \ + c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec + + dspfunc(put_h264_qpel, 0, 16); + dspfunc(avg_h264_qpel, 0, 16); +#undef dspfunc +} + +void ff_h264dsp_init_ppc(H264DSPContext *c){ + c->h264_idct_dc_add= h264_idct_dc_add_altivec; + c->h264_idct_add = ff_h264_idct_add_altivec; + c->h264_idct_add8 = ff_h264_idct_add8_altivec; + c->h264_idct_add16 = ff_h264_idct_add16_altivec; + c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec; + + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec; + c->h264_idct8_add = ff_h264_idct8_add_altivec; + c->h264_idct8_add4 = ff_h264_idct8_add4_altivec; + c->h264_v_loop_filter_luma= 
h264_v_loop_filter_luma_altivec; + c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; + + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec; + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec; + c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec; + c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec; + c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec; + c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec; + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec; + c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec; + c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec; + c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec; +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/h264_template_altivec.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/ppc/h264_template_altivec.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,783 @@ +/* + * Copyright (c) 2004 Romain Dolbeau + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +//#define DEBUG_ALIGNMENT +#ifdef DEBUG_ALIGNMENT +#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr)&0x0000000F)); /* passes only when the low four address bits are zero, i.e. ptr is 16-byte aligned */ +#else +#define ASSERT_ALIGNED(ptr) ; +#endif + +/* this code assumes that stride % 16 == 0 */ + +#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \ + vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\ + vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\ +\ + psum = vec_mladd(vA, vsrc0ssH, BIAS1);\ + psum = vec_mladd(vB, vsrc1ssH, psum);\ + psum = vec_mladd(vC, vsrc2ssH, psum);\ + psum = vec_mladd(vD, vsrc3ssH, psum);\ + psum = BIAS2(psum);\ + psum = vec_sr(psum, v6us);\ +\ + vdst = vec_ld(0, dst);\ + ppsum = (vec_u8)vec_pack(psum, psum);\ + vfdst = vec_perm(vdst, ppsum, fperm);\ +\ + OP_U8_ALTIVEC(fsum, vfdst, vdst);\ +\ + vec_st(fsum, 0, dst);\ +\ + vsrc0ssH = vsrc2ssH;\ + vsrc1ssH = vsrc3ssH;\ +\ + dst += stride;\ + src += stride; + +#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ +\ + vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\ + vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\ +\ + psum = vec_mladd(vA, vsrc0ssH, v32ss);\ + psum = vec_mladd(vE, vsrc1ssH, psum);\ + psum = vec_sr(psum, v6us);\ +\ + vdst = vec_ld(0, dst);\ + ppsum = (vec_u8)vec_pack(psum, psum);\ + vfdst = vec_perm(vdst, ppsum, fperm);\ +\ + OP_U8_ALTIVEC(fsum, vfdst, vdst);\ +\ + vec_st(fsum, 0, dst);\ +\ + dst += stride;\ + src += stride; + +#define noop(a) a +#define add28(a) vec_add(v28ss, a) + +static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, + int stride, int h, int x, int y) { + POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); + DECLARE_ALIGNED(16, signed int, ABCD)[4] = + {((8 - x) * (8 - y)), + (( x) * (8 - y)), + ((8 - x) * ( y)), + (( x) * ( y))}; + register int i; + vec_u8 fperm; + const 
vec_s32 vABCD = vec_ld(0, ABCD); + const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); + const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); + const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); + const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); + LOAD_ZERO; + const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); + const vec_u16 v6us = vec_splat_u16(6); + register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; + register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; + + vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; + vec_u8 vsrc0uc, vsrc1uc; + vec_s16 vsrc0ssH, vsrc1ssH; + vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; + vec_s16 vsrc2ssH, vsrc3ssH, psum; + vec_u8 vdst, ppsum, vfdst, fsum; + + POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); + + if (((unsigned long)dst) % 16 == 0) { + fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F}; + } else { + fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, + 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F}; + } + + vsrcAuc = vec_ld(0, src); + + if (loadSecond) + vsrcBuc = vec_ld(16, src); + vsrcperm0 = vec_lvsl(0, src); + vsrcperm1 = vec_lvsl(1, src); + + vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); + if (reallyBadAlign) + vsrc1uc = vsrcBuc; + else + vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); + + vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc); + vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc); + + if (ABCD[3]) { + if (!loadSecond) {// -> !reallyBadAlign + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(stride + 0, src); + vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); + vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); + + CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) + } + } else { + vec_u8 vsrcDuc; + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(stride + 0, src); + vsrcDuc = vec_ld(stride + 16, src); + vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); + if (reallyBadAlign) + 
vsrc3uc = vsrcDuc; + else + vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); + + CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) + } + } + } else { + const vec_s16 vE = vec_add(vB, vC); + if (ABCD[2]) { // x == 0 B == 0 + if (!loadSecond) {// -> !reallyBadAlign + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(stride + 0, src); + vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); + CHROMA_MC8_ALTIVEC_CORE_SIMPLE + + vsrc0uc = vsrc1uc; + } + } else { + vec_u8 vsrcDuc; + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(stride + 0, src); + vsrcDuc = vec_ld(stride + 15, src); + vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); + CHROMA_MC8_ALTIVEC_CORE_SIMPLE + + vsrc0uc = vsrc1uc; + } + } + } else { // y == 0 C == 0 + if (!loadSecond) {// -> !reallyBadAlign + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(0, src); + vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); + vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); + + CHROMA_MC8_ALTIVEC_CORE_SIMPLE + } + } else { + vec_u8 vsrcDuc; + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(0, src); + vsrcDuc = vec_ld(15, src); + vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); + if (reallyBadAlign) + vsrc1uc = vsrcDuc; + else + vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); + + CHROMA_MC8_ALTIVEC_CORE_SIMPLE + } + } + } + } + POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); +} + +/* this code assume that stride % 16 == 0 */ +static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { + DECLARE_ALIGNED(16, signed int, ABCD)[4] = + {((8 - x) * (8 - y)), + (( x) * (8 - y)), + ((8 - x) * ( y)), + (( x) * ( y))}; + register int i; + vec_u8 fperm; + const vec_s32 vABCD = vec_ld(0, ABCD); + const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); + const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); + const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); + const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); + LOAD_ZERO; + const vec_s16 v28ss = 
vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); + const vec_u16 v6us = vec_splat_u16(6); + register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; + register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; + + vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; + vec_u8 vsrc0uc, vsrc1uc; + vec_s16 vsrc0ssH, vsrc1ssH; + vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; + vec_s16 vsrc2ssH, vsrc3ssH, psum; + vec_u8 vdst, ppsum, vfdst, fsum; + + if (((unsigned long)dst) % 16 == 0) { + fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F}; + } else { + fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, + 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F}; + } + + vsrcAuc = vec_ld(0, src); + + if (loadSecond) + vsrcBuc = vec_ld(16, src); + vsrcperm0 = vec_lvsl(0, src); + vsrcperm1 = vec_lvsl(1, src); + + vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); + if (reallyBadAlign) + vsrc1uc = vsrcBuc; + else + vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); + + vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc); + vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc); + + if (!loadSecond) {// -> !reallyBadAlign + for (i = 0 ; i < h ; i++) { + + + vsrcCuc = vec_ld(stride + 0, src); + + vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); + vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); + + CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) + } + } else { + vec_u8 vsrcDuc; + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(stride + 0, src); + vsrcDuc = vec_ld(stride + 16, src); + + vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); + if (reallyBadAlign) + vsrc3uc = vsrcDuc; + else + vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); + + CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) + } + } +} + +#undef noop +#undef add28 +#undef CHROMA_MC8_ALTIVEC_CORE + +/* this code assume stride % 16 == 0 */ +static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, 
uint8_t * src, int dstStride, int srcStride) { + POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); + register int i; + + LOAD_ZERO; + const vec_u8 permM2 = vec_lvsl(-2, src); + const vec_u8 permM1 = vec_lvsl(-1, src); + const vec_u8 permP0 = vec_lvsl(+0, src); + const vec_u8 permP1 = vec_lvsl(+1, src); + const vec_u8 permP2 = vec_lvsl(+2, src); + const vec_u8 permP3 = vec_lvsl(+3, src); + const vec_s16 v5ss = vec_splat_s16(5); + const vec_u16 v5us = vec_splat_u16(5); + const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); + const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); + + vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + + register int align = ((((unsigned long)src) - 2) % 16); + + vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, + srcP2A, srcP2B, srcP3A, srcP3B, + srcM1A, srcM1B, srcM2A, srcM2B, + sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, + pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, + psumA, psumB, sumA, sumB; + + vec_u8 sum, vdst, fsum; + + POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); + + for (i = 0 ; i < 16 ; i ++) { + vec_u8 srcR1 = vec_ld(-2, src); + vec_u8 srcR2 = vec_ld(14, src); + + switch (align) { + default: { + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = vec_perm(srcR1, srcR2, permP2); + srcP3 = vec_perm(srcR1, srcR2, permP3); + } break; + case 11: { + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = vec_perm(srcR1, srcR2, permP2); + srcP3 = srcR2; + } break; + case 12: { + vec_u8 srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = srcR2; + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 13: { + 
vec_u8 srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = srcR2; + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 14: { + vec_u8 srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = srcR2; + srcP1 = vec_perm(srcR2, srcR3, permP1); + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 15: { + vec_u8 srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = srcR2; + srcP0 = vec_perm(srcR2, srcR3, permP0); + srcP1 = vec_perm(srcR2, srcR3, permP1); + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + } + + srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); + srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); + srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); + srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); + + srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); + srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); + srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); + srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); + + srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); + srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); + srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); + srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); + + sum1A = vec_adds(srcP0A, srcP1A); + sum1B = vec_adds(srcP0B, srcP1B); + sum2A = vec_adds(srcM1A, srcP2A); + sum2B = vec_adds(srcM1B, srcP2B); + sum3A = vec_adds(srcM2A, srcP3A); + sum3B = vec_adds(srcM2B, srcP3B); + + pp1A = vec_mladd(sum1A, v20ss, v16ss); + pp1B = vec_mladd(sum1B, v20ss, v16ss); + + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); + + pp3A = vec_add(sum3A, pp1A); + pp3B = vec_add(sum3B, pp1B); + + psumA = vec_sub(pp3A, pp2A); + psumB = vec_sub(pp3B, pp2B); + + sumA = vec_sra(psumA, 
v5us); + sumB = vec_sra(psumB, v5us); + + sum = vec_packsu(sumA, sumB); + + ASSERT_ALIGNED(dst); + vdst = vec_ld(0, dst); + + OP_U8_ALTIVEC(fsum, sum, vdst); + + vec_st(fsum, 0, dst); + + src += srcStride; + dst += dstStride; + } + POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); +} + +/* this code assume stride % 16 == 0 */ +static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { + POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); + + register int i; + + LOAD_ZERO; + const vec_u8 perm = vec_lvsl(0, src); + const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); + const vec_u16 v5us = vec_splat_u16(5); + const vec_s16 v5ss = vec_splat_s16(5); + const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); + + uint8_t *srcbis = src - (srcStride * 2); + + const vec_u8 srcM2a = vec_ld(0, srcbis); + const vec_u8 srcM2b = vec_ld(16, srcbis); + const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm); + //srcbis += srcStride; + const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride); + const vec_u8 srcM1b = vec_ld(16, srcbis); + const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm); + //srcbis += srcStride; + const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride); + const vec_u8 srcP0b = vec_ld(16, srcbis); + const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm); + //srcbis += srcStride; + const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride); + const vec_u8 srcP1b = vec_ld(16, srcbis); + const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm); + //srcbis += srcStride; + const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride); + const vec_u8 srcP2b = vec_ld(16, srcbis); + const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm); + //srcbis += srcStride; + + vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2); + vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2); + vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1); + vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1); + vec_s16 srcP0ssA 
= (vec_s16) vec_mergeh(zero_u8v, srcP0); + vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0); + vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1); + vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1); + vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2); + vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2); + + vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, + psumA, psumB, sumA, sumB, + srcP3ssA, srcP3ssB, + sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; + + vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3; + + POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); + + for (i = 0 ; i < 16 ; i++) { + srcP3a = vec_ld(0, srcbis += srcStride); + srcP3b = vec_ld(16, srcbis); + srcP3 = vec_perm(srcP3a, srcP3b, perm); + srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3); + srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3); + //srcbis += srcStride; + + sum1A = vec_adds(srcP0ssA, srcP1ssA); + sum1B = vec_adds(srcP0ssB, srcP1ssB); + sum2A = vec_adds(srcM1ssA, srcP2ssA); + sum2B = vec_adds(srcM1ssB, srcP2ssB); + sum3A = vec_adds(srcM2ssA, srcP3ssA); + sum3B = vec_adds(srcM2ssB, srcP3ssB); + + srcM2ssA = srcM1ssA; + srcM2ssB = srcM1ssB; + srcM1ssA = srcP0ssA; + srcM1ssB = srcP0ssB; + srcP0ssA = srcP1ssA; + srcP0ssB = srcP1ssB; + srcP1ssA = srcP2ssA; + srcP1ssB = srcP2ssB; + srcP2ssA = srcP3ssA; + srcP2ssB = srcP3ssB; + + pp1A = vec_mladd(sum1A, v20ss, v16ss); + pp1B = vec_mladd(sum1B, v20ss, v16ss); + + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); + + pp3A = vec_add(sum3A, pp1A); + pp3B = vec_add(sum3B, pp1B); + + psumA = vec_sub(pp3A, pp2A); + psumB = vec_sub(pp3B, pp2B); + + sumA = vec_sra(psumA, v5us); + sumB = vec_sra(psumB, v5us); + + sum = vec_packsu(sumA, sumB); + + ASSERT_ALIGNED(dst); + vdst = vec_ld(0, dst); + + OP_U8_ALTIVEC(fsum, sum, vdst); + + vec_st(fsum, 0, dst); + + dst += dstStride; + } + POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); +} + +/* this code assume stride % 16 == 0 
*and* tmp is properly aligned */ +static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { + POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); + register int i; + LOAD_ZERO; + const vec_u8 permM2 = vec_lvsl(-2, src); + const vec_u8 permM1 = vec_lvsl(-1, src); + const vec_u8 permP0 = vec_lvsl(+0, src); + const vec_u8 permP1 = vec_lvsl(+1, src); + const vec_u8 permP2 = vec_lvsl(+2, src); + const vec_u8 permP3 = vec_lvsl(+3, src); + const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); + const vec_u32 v10ui = vec_splat_u32(10); + const vec_s16 v5ss = vec_splat_s16(5); + const vec_s16 v1ss = vec_splat_s16(1); + const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); + const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); + + register int align = ((((unsigned long)src) - 2) % 16); + + vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, + srcP2A, srcP2B, srcP3A, srcP3B, + srcM1A, srcM1B, srcM2A, srcM2B, + sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, + pp1A, pp1B, pp2A, pp2B, psumA, psumB; + + const vec_u8 mperm = (const vec_u8) + {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, + 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F}; + int16_t *tmpbis = tmp; + + vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, + tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, + tmpP2ssA, tmpP2ssB; + + vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, + pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, + pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, + ssumAe, ssumAo, ssumBe, ssumBo; + vec_u8 fsum, sumv, sum, vdst; + vec_s16 ssume, ssumo; + + POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); + src -= (2 * srcStride); + for (i = 0 ; i < 21 ; i ++) { + vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + vec_u8 srcR1 = vec_ld(-2, src); + vec_u8 srcR2 = vec_ld(14, src); + + switch (align) { + default: { + srcM2 = vec_perm(srcR1, srcR2, permM2); + 
srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = vec_perm(srcR1, srcR2, permP2); + srcP3 = vec_perm(srcR1, srcR2, permP3); + } break; + case 11: { + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = vec_perm(srcR1, srcR2, permP2); + srcP3 = srcR2; + } break; + case 12: { + vec_u8 srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = vec_perm(srcR1, srcR2, permP1); + srcP2 = srcR2; + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 13: { + vec_u8 srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = vec_perm(srcR1, srcR2, permP0); + srcP1 = srcR2; + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 14: { + vec_u8 srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = vec_perm(srcR1, srcR2, permM1); + srcP0 = srcR2; + srcP1 = vec_perm(srcR2, srcR3, permP1); + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + case 15: { + vec_u8 srcR3 = vec_ld(30, src); + srcM2 = vec_perm(srcR1, srcR2, permM2); + srcM1 = srcR2; + srcP0 = vec_perm(srcR2, srcR3, permP0); + srcP1 = vec_perm(srcR2, srcR3, permP1); + srcP2 = vec_perm(srcR2, srcR3, permP2); + srcP3 = vec_perm(srcR2, srcR3, permP3); + } break; + } + + srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); + srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); + srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); + srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); + + srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); + srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); + srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); + srcP3B = (vec_s16) 
vec_mergel(zero_u8v, srcP3); + + srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); + srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); + srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); + srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); + + sum1A = vec_adds(srcP0A, srcP1A); + sum1B = vec_adds(srcP0B, srcP1B); + sum2A = vec_adds(srcM1A, srcP2A); + sum2B = vec_adds(srcM1B, srcP2B); + sum3A = vec_adds(srcM2A, srcP3A); + sum3B = vec_adds(srcM2B, srcP3B); + + pp1A = vec_mladd(sum1A, v20ss, sum3A); + pp1B = vec_mladd(sum1B, v20ss, sum3B); + + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); + + psumA = vec_sub(pp1A, pp2A); + psumB = vec_sub(pp1B, pp2B); + + vec_st(psumA, 0, tmp); + vec_st(psumB, 16, tmp); + + src += srcStride; + tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ + } + + tmpM2ssA = vec_ld(0, tmpbis); + tmpM2ssB = vec_ld(16, tmpbis); + tmpbis += tmpStride; + tmpM1ssA = vec_ld(0, tmpbis); + tmpM1ssB = vec_ld(16, tmpbis); + tmpbis += tmpStride; + tmpP0ssA = vec_ld(0, tmpbis); + tmpP0ssB = vec_ld(16, tmpbis); + tmpbis += tmpStride; + tmpP1ssA = vec_ld(0, tmpbis); + tmpP1ssB = vec_ld(16, tmpbis); + tmpbis += tmpStride; + tmpP2ssA = vec_ld(0, tmpbis); + tmpP2ssB = vec_ld(16, tmpbis); + tmpbis += tmpStride; + + for (i = 0 ; i < 16 ; i++) { + const vec_s16 tmpP3ssA = vec_ld(0, tmpbis); + const vec_s16 tmpP3ssB = vec_ld(16, tmpbis); + + const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA); + const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB); + const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA); + const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB); + const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA); + const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB); + + tmpbis += tmpStride; + + tmpM2ssA = tmpM1ssA; + tmpM2ssB = tmpM1ssB; + tmpM1ssA = tmpP0ssA; + tmpM1ssB = tmpP0ssB; + tmpP0ssA = tmpP1ssA; + tmpP0ssB = tmpP1ssB; + tmpP1ssA = tmpP2ssA; + tmpP1ssB = tmpP2ssB; + tmpP2ssA = tmpP3ssA; + tmpP2ssB = tmpP3ssB; + + pp1Ae = 
vec_mule(sum1A, v20ss); + pp1Ao = vec_mulo(sum1A, v20ss); + pp1Be = vec_mule(sum1B, v20ss); + pp1Bo = vec_mulo(sum1B, v20ss); + + pp2Ae = vec_mule(sum2A, v5ss); + pp2Ao = vec_mulo(sum2A, v5ss); + pp2Be = vec_mule(sum2B, v5ss); + pp2Bo = vec_mulo(sum2B, v5ss); + + pp3Ae = vec_sra((vec_s32)sum3A, v16ui); + pp3Ao = vec_mulo(sum3A, v1ss); + pp3Be = vec_sra((vec_s32)sum3B, v16ui); + pp3Bo = vec_mulo(sum3B, v1ss); + + pp1cAe = vec_add(pp1Ae, v512si); + pp1cAo = vec_add(pp1Ao, v512si); + pp1cBe = vec_add(pp1Be, v512si); + pp1cBo = vec_add(pp1Bo, v512si); + + pp32Ae = vec_sub(pp3Ae, pp2Ae); + pp32Ao = vec_sub(pp3Ao, pp2Ao); + pp32Be = vec_sub(pp3Be, pp2Be); + pp32Bo = vec_sub(pp3Bo, pp2Bo); + + sumAe = vec_add(pp1cAe, pp32Ae); + sumAo = vec_add(pp1cAo, pp32Ao); + sumBe = vec_add(pp1cBe, pp32Be); + sumBo = vec_add(pp1cBo, pp32Bo); + + ssumAe = vec_sra(sumAe, v10ui); + ssumAo = vec_sra(sumAo, v10ui); + ssumBe = vec_sra(sumBe, v10ui); + ssumBo = vec_sra(sumBo, v10ui); + + ssume = vec_packs(ssumAe, ssumBe); + ssumo = vec_packs(ssumAo, ssumBo); + + sumv = vec_packsu(ssume, ssumo); + sum = vec_perm(sumv, sumv, mperm); + + ASSERT_ALIGNED(dst); + vdst = vec_ld(0, dst); + + OP_U8_ALTIVEC(fsum, sum, vdst); + + vec_st(fsum, 0, dst); + + dst += dstStride; + } + POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/idct_altivec.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/ppc/idct_altivec.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2001 Michel Lespinasse + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * NOTE: This code is based on GPL code from the libmpeg2 project. The + * author, Michel Lespinasse, has given explicit permission to release + * under LGPL as part of FFmpeg. + */ + +/* + * FFmpeg integration by Dieter Shirley + * + * This file is a direct copy of the AltiVec IDCT module from the libmpeg2 + * project. I've deleted all of the libmpeg2-specific code, renamed the + * functions and reordered the function parameters. The only change to the + * IDCT function itself was to factor out the partial transposition, and to + * perform a full transpose at the end of the function. 
+ */ + + +#include /* malloc(), free() */ +#include +#include "config.h" +#if HAVE_ALTIVEC_H +#include +#endif +#include "libavcodec/dsputil.h" +#include "types_altivec.h" +#include "dsputil_ppc.h" +#include "dsputil_altivec.h" + +#define IDCT_HALF \ + /* 1st stage */ \ + t1 = vec_mradds (a1, vx7, vx1 ); \ + t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \ + t7 = vec_mradds (a2, vx5, vx3); \ + t3 = vec_mradds (ma2, vx3, vx5); \ + \ + /* 2nd stage */ \ + t5 = vec_adds (vx0, vx4); \ + t0 = vec_subs (vx0, vx4); \ + t2 = vec_mradds (a0, vx6, vx2); \ + t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \ + t6 = vec_adds (t8, t3); \ + t3 = vec_subs (t8, t3); \ + t8 = vec_subs (t1, t7); \ + t1 = vec_adds (t1, t7); \ + \ + /* 3rd stage */ \ + t7 = vec_adds (t5, t2); \ + t2 = vec_subs (t5, t2); \ + t5 = vec_adds (t0, t4); \ + t0 = vec_subs (t0, t4); \ + t4 = vec_subs (t8, t3); \ + t3 = vec_adds (t8, t3); \ + \ + /* 4th stage */ \ + vy0 = vec_adds (t7, t1); \ + vy7 = vec_subs (t7, t1); \ + vy1 = vec_mradds (c4, t3, t5); \ + vy6 = vec_mradds (mc4, t3, t5); \ + vy2 = vec_mradds (c4, t4, t0); \ + vy5 = vec_mradds (mc4, t4, t0); \ + vy3 = vec_adds (t2, t6); \ + vy4 = vec_subs (t2, t6); + + +#define IDCT \ + vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ + vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ + vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \ + vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \ + vec_u16 shift; \ + \ + c4 = vec_splat (constants[0], 0); \ + a0 = vec_splat (constants[0], 1); \ + a1 = vec_splat (constants[0], 2); \ + a2 = vec_splat (constants[0], 3); \ + mc4 = vec_splat (constants[0], 4); \ + ma2 = vec_splat (constants[0], 5); \ + bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); \ + \ + zero = vec_splat_s16 (0); \ + shift = vec_splat_u16 (4); \ + \ + vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ + vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ + vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ 
+ vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ + vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ + vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ + vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ + vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ + \ + IDCT_HALF \ + \ + vx0 = vec_mergeh (vy0, vy4); \ + vx1 = vec_mergel (vy0, vy4); \ + vx2 = vec_mergeh (vy1, vy5); \ + vx3 = vec_mergel (vy1, vy5); \ + vx4 = vec_mergeh (vy2, vy6); \ + vx5 = vec_mergel (vy2, vy6); \ + vx6 = vec_mergeh (vy3, vy7); \ + vx7 = vec_mergel (vy3, vy7); \ + \ + vy0 = vec_mergeh (vx0, vx4); \ + vy1 = vec_mergel (vx0, vx4); \ + vy2 = vec_mergeh (vx1, vx5); \ + vy3 = vec_mergel (vx1, vx5); \ + vy4 = vec_mergeh (vx2, vx6); \ + vy5 = vec_mergel (vx2, vx6); \ + vy6 = vec_mergeh (vx3, vx7); \ + vy7 = vec_mergel (vx3, vx7); \ + \ + vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \ + vx1 = vec_mergel (vy0, vy4); \ + vx2 = vec_mergeh (vy1, vy5); \ + vx3 = vec_mergel (vy1, vy5); \ + vx4 = vec_mergeh (vy2, vy6); \ + vx5 = vec_mergel (vy2, vy6); \ + vx6 = vec_mergeh (vy3, vy7); \ + vx7 = vec_mergel (vy3, vy7); \ + \ + IDCT_HALF \ + \ + shift = vec_splat_u16 (6); \ + vx0 = vec_sra (vy0, shift); \ + vx1 = vec_sra (vy1, shift); \ + vx2 = vec_sra (vy2, shift); \ + vx3 = vec_sra (vy3, shift); \ + vx4 = vec_sra (vy4, shift); \ + vx5 = vec_sra (vy5, shift); \ + vx6 = vec_sra (vy6, shift); \ + vx7 = vec_sra (vy7, shift); + + +static const vec_s16 constants[5] = { + {23170, 13573, 6518, 21895, -23170, -21895, 32, 31}, + {16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725}, + {22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521}, + {21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692}, + {19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722} +}; + +void idct_put_altivec(uint8_t* dest, int stride, int16_t *blk) +{ +POWERPC_PERF_DECLARE(altivec_idct_put_num, 1); + vec_s16 *block = (vec_s16*)blk; + vec_u8 tmp; + 
+#if CONFIG_POWERPC_PERF +POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); +#endif + IDCT + +#define COPY(dest,src) \ + tmp = vec_packsu (src, src); \ + vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ + vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); + + COPY (dest, vx0) dest += stride; + COPY (dest, vx1) dest += stride; + COPY (dest, vx2) dest += stride; + COPY (dest, vx3) dest += stride; + COPY (dest, vx4) dest += stride; + COPY (dest, vx5) dest += stride; + COPY (dest, vx6) dest += stride; + COPY (dest, vx7) + +POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); +} + +void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) +{ +POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); + vec_s16 *block = (vec_s16*)blk; + vec_u8 tmp; + vec_s16 tmp2, tmp3; + vec_u8 perm0; + vec_u8 perm1; + vec_u8 p0, p1, p; + +#if CONFIG_POWERPC_PERF +POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); +#endif + + IDCT + + p0 = vec_lvsl (0, dest); + p1 = vec_lvsl (stride, dest); + p = vec_splat_u8 (-1); + perm0 = vec_mergeh (p, p0); + perm1 = vec_mergeh (p, p1); + +#define ADD(dest,src,perm) \ + /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ + tmp = vec_ld (0, dest); \ + tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \ + tmp3 = vec_adds (tmp2, src); \ + tmp = vec_packsu (tmp3, tmp3); \ + vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ + vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); + + ADD (dest, vx0, perm0) dest += stride; + ADD (dest, vx1, perm1) dest += stride; + ADD (dest, vx2, perm0) dest += stride; + ADD (dest, vx3, perm1) dest += stride; + ADD (dest, vx4, perm0) dest += stride; + ADD (dest, vx5, perm1) dest += stride; + ADD (dest, vx6, perm0) dest += stride; + ADD (dest, vx7, perm1) + +POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); +} + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/mathops.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/ppc/mathops.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,79 @@ +/* + * simple math operations + * Copyright 
(c) 2001, 2002 Fabrice Bellard + * Copyright (c) 2006 Michael Niedermayer et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_PPC_MATHOPS_H +#define AVCODEC_PPC_MATHOPS_H + +#include +#include "config.h" +#include "libavutil/common.h" + +#if HAVE_PPC4XX +/* signed 16x16 -> 32 multiply add accumulate */ +#define MAC16(rt, ra, rb) \ + __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); + +/* signed 16x16 -> 32 multiply */ +#define MUL16(ra, rb) \ + ({ int __rt; \ + __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ + __rt; }) +#endif + +#define MULH MULH +static inline av_const int MULH(int a, int b){ + int r; + __asm__ ("mulhw %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); + return r; +} + +#if !ARCH_PPC64 +static inline av_const int64_t MAC64(int64_t d, int a, int b) +{ + union { uint64_t x; unsigned hl[2]; } x = { d }; + int h, l; + __asm__ ("mullw %3, %4, %5 \n\t" + "mulhw %2, %4, %5 \n\t" + "addc %1, %1, %3 \n\t" + "adde %0, %0, %2 \n\t" + : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) + : "r"(a), "r"(b)); + return x.x; +} +#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) + +static inline av_const int64_t MLS64(int64_t d, int a, int b) +{ + union { uint64_t x; unsigned hl[2]; } x = { d }; + 
int h, l; + __asm__ ("mullw %3, %4, %5 \n\t" + "mulhw %2, %4, %5 \n\t" + "subfc %1, %3, %1 \n\t" + "subfe %0, %2, %0 \n\t" + : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) + : "r"(a), "r"(b)); + return x.x; +} +#define MLS64(d, a, b) ((d) = MLS64(d, a, b)) +#endif + +#endif /* AVCODEC_PPC_MATHOPS_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/types_altivec.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/ppc/types_altivec.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2006 Guillaume Poirier + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_PPC_TYPES_ALTIVEC_H +#define AVCODEC_PPC_TYPES_ALTIVEC_H + +/*********************************************************************** + * Vector types + **********************************************************************/ +#define vec_u8 vector unsigned char +#define vec_s8 vector signed char +#define vec_u16 vector unsigned short +#define vec_s16 vector signed short +#define vec_u32 vector unsigned int +#define vec_s32 vector signed int + +/*********************************************************************** + * Null vector + **********************************************************************/ +#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 ) + +#define zero_u8v (vec_u8) zerov +#define zero_s8v (vec_s8) zerov +#define zero_u16v (vec_u16) zerov +#define zero_s16v (vec_s16) zerov +#define zero_u32v (vec_u32) zerov +#define zero_s32v (vec_s32) zerov + +#endif /* AVCODEC_PPC_TYPES_ALTIVEC_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/ppc/util_altivec.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/ppc/util_altivec.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,105 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Contains misc utility macros and inline functions + */ + +#ifndef AVCODEC_PPC_UTIL_ALTIVEC_H +#define AVCODEC_PPC_UTIL_ALTIVEC_H + +#include + +#include "config.h" + +#if HAVE_ALTIVEC_H +#include +#endif + +// used to build registers permutation vectors (vcprm) +// the 's' are for words in the _s_econd vector +#define WORD_0 0x00,0x01,0x02,0x03 +#define WORD_1 0x04,0x05,0x06,0x07 +#define WORD_2 0x08,0x09,0x0a,0x0b +#define WORD_3 0x0c,0x0d,0x0e,0x0f +#define WORD_s0 0x10,0x11,0x12,0x13 +#define WORD_s1 0x14,0x15,0x16,0x17 +#define WORD_s2 0x18,0x19,0x1a,0x1b +#define WORD_s3 0x1c,0x1d,0x1e,0x1f + +#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} +#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} + +// vcprmle is used to keep the same index as in the SSE version. +// it's the same as vcprm, with the index inversed +// ('le' is Little Endian) +#define vcprmle(a,b,c,d) vcprm(d,c,b,a) + +// used to build inverse/identity vectors (vcii) +// n is _n_egative, p is _p_ositive +#define FLOAT_n -1. +#define FLOAT_p 1. 
+ + +// Transpose 8x8 matrix of 16-bit elements (in-place) +#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ +do { \ + vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \ + vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \ + \ + A1 = vec_mergeh (a, e); \ + B1 = vec_mergel (a, e); \ + C1 = vec_mergeh (b, f); \ + D1 = vec_mergel (b, f); \ + E1 = vec_mergeh (c, g); \ + F1 = vec_mergel (c, g); \ + G1 = vec_mergeh (d, h); \ + H1 = vec_mergel (d, h); \ + \ + A2 = vec_mergeh (A1, E1); \ + B2 = vec_mergel (A1, E1); \ + C2 = vec_mergeh (B1, F1); \ + D2 = vec_mergel (B1, F1); \ + E2 = vec_mergeh (C1, G1); \ + F2 = vec_mergel (C1, G1); \ + G2 = vec_mergeh (D1, H1); \ + H2 = vec_mergel (D1, H1); \ + \ + a = vec_mergeh (A2, E2); \ + b = vec_mergel (A2, E2); \ + c = vec_mergeh (B2, F2); \ + d = vec_mergel (B2, F2); \ + e = vec_mergeh (C2, G2); \ + f = vec_mergel (C2, G2); \ + g = vec_mergeh (D2, H2); \ + h = vec_mergel (D2, H2); \ +} while (0) + + +/** \brief loads unaligned vector \a *src with offset \a offset + and returns it */ +static inline vector unsigned char unaligned_load(int offset, uint8_t *src) +{ + register vector unsigned char first = vec_ld(offset, src); + register vector unsigned char second = vec_ld(offset+15, src); + register vector unsigned char mask = vec_lvsl(offset, src); + return vec_perm(first, second, mask); +} + +#endif /* AVCODEC_PPC_UTIL_ALTIVEC_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/raw.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/raw.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,39 @@ +/* + * Raw Video Codec + * Copyright (c) 2001 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Raw Video Codec + */ + +#ifndef AVCODEC_RAW_H +#define AVCODEC_RAW_H + +#include "avcodec.h" + +typedef struct PixelFormatTag { + enum PixelFormat pix_fmt; + unsigned int fourcc; +} PixelFormatTag; + +extern const PixelFormatTag ff_raw_pixelFormatTags[]; +int raw_init_encoder(AVCodecContext *avctx); +#endif /* AVCODEC_RAW_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/rectangle.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/rectangle.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,92 @@ +/* + * rectangle filling function + * Copyright (c) 2003 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * useful rectangle filling function + * @author Michael Niedermayer + */ + +#ifndef AVCODEC_RECTANGLE_H +#define AVCODEC_RECTANGLE_H + +#include +//#include "config.h" +#include "libavutil/common.h" +#include "dsputil.h" + +/** + * fill a rectangle. + * @param h height of the rectangle, should be a constant + * @param w width of the rectangle, should be a constant + * @param size the size of val (1, 2 or 4), should be a constant + */ +static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ + uint8_t *p= (uint8_t*)vp; + assert(size==1 || size==2 || size==4); + assert(w<=4); + + w *= size; + stride *= size; + + assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0); + assert((stride&(w-1))==0); + if(w==2){ + const uint16_t v= size==4 ? val : val*0x0101; + *(uint16_t*)(p + 0*stride)= v; + if(h==1) return; + *(uint16_t*)(p + 1*stride)= v; + if(h==2) return; + *(uint16_t*)(p + 2*stride)= v; + *(uint16_t*)(p + 3*stride)= v; + }else if(w==4){ + const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101; + *(uint32_t*)(p + 0*stride)= v; + if(h==1) return; + *(uint32_t*)(p + 1*stride)= v; + if(h==2) return; + *(uint32_t*)(p + 2*stride)= v; + *(uint32_t*)(p + 3*stride)= v; + }else if(w==8){ + const uint64_t v= size==2 ? 
val*0x0001000100010001ULL : val*0x0100000001ULL; + *(uint64_t*)(p + 0*stride)= v; + if(h==1) return; + *(uint64_t*)(p + 1*stride)= v; + if(h==2) return; + *(uint64_t*)(p + 2*stride)= v; + *(uint64_t*)(p + 3*stride)= v; + }else if(w==16){ + const uint64_t v= val*0x0100000001ULL; + *(uint64_t*)(p + 0+0*stride)= v; + *(uint64_t*)(p + 8+0*stride)= v; + *(uint64_t*)(p + 0+1*stride)= v; + *(uint64_t*)(p + 8+1*stride)= v; + if(h==2) return; + *(uint64_t*)(p + 0+2*stride)= v; + *(uint64_t*)(p + 8+2*stride)= v; + *(uint64_t*)(p + 0+3*stride)= v; + *(uint64_t*)(p + 8+3*stride)= v; + }else + assert(0); + assert(h==4); +} + +#endif /* AVCODEC_RECTANGLE_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/scratch.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/scratch.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,295 @@ +static void *entropy_thread(void *arg){ + H264Context *h = (H264Context *) arg; + EDSlice *s; + + H264Cabac hcabac; + CABACContext cabac; + + ff_init_cabac_states(); + + if (init_cabac(h, &hcabac)<0) + return NULL; + + for(;;){ + { + pthread_mutex_lock(&h->lock[ENTROPY]); + while (h->ed_cnt<=0) + pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]); + s= &h->ed_q[h->ed_fo]; + pthread_mutex_unlock(&h->lock[ENTROPY]); + h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT; + } + if (s->state<0) + break; + + decode_slice_entropy(&hcabac, &cabac, s); + + { + pthread_mutex_lock(&h->lock[MBDEC]); + while (h->mbdec_cnt >= MAX_SLICE_COUNT) + pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); + h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s); + h->mbdec_cnt++; + h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->cond[MBDEC]); + pthread_mutex_unlock(&h->lock[MBDEC]); + } + { + pthread_mutex_lock(&h->lock[ENTROPY]); + h->ed_cnt--; + pthread_cond_signal(&h->cond[ENTROPY]); + pthread_mutex_unlock(&h->lock[ENTROPY]); + } + } + + { + pthread_mutex_lock(&h->lock[MBDEC]); + while (h->mbdec_cnt >= MAX_SLICE_COUNT) + pthread_cond_wait(&h->cond[MBDEC], 
&h->lock[MBDEC]); + h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s); + h->mbdec_cnt++; + h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT; + pthread_cond_signal(&h->cond[MBDEC]); + pthread_mutex_unlock(&h->lock[MBDEC]); + + } + + free_cabac(&hcabac); + + pthread_exit(NULL); + return NULL; + +} +/* +* The following code is the main loop of the file converter +*/ +int av_transcode_1ed(int ifile, int ofile, int frame_width, int frame_height) { + H264Context *h; + pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr; + + h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height); + + timer_start = av_gettime(); + + // pthread_create(&read_thr, NULL, read_thread, h); + // pthread_create(&parsenal_thr, NULL, parsenal_thread, h); + pthread_create(&entropy_thr, NULL, entropy_mbd_thread, h); + + // pthread_create(&mbdec_thr, NULL, mbdec_thread, h); + + // pthread_create(&write_thr, NULL, write_thread, h); + + // pthread_join(read_thr, NULL); + // pthread_join(parsenal_thr, NULL); + pthread_join(entropy_thr, NULL); + // pthread_join(mbdec_thr, NULL); + // printf("before write_thr\n"); + // pthread_join(write_thr, NULL); + + /* finished ! 
*/ + ff_h264_decode_end(h); + + return 0; +} + +static void reset_h264mb(EDSlice *s, int mb_width, int mb_height){ + for (int i=0; imbs[i*mb_width + j]; + + m->left_mb_xy=0; + m->top_mb_xy = 0; + } + } +} + +static void *entropy_mbd_thread(void *arg){ + H264Context *h = (H264Context *) arg; + + EDSlice slice, *s=&slice; + MBSlice mbslice, *s2=&mbslice; + H264Cabac hcabac; + CABACContext cabac; + int frames =0; + MBDecContext mbdec, *d=&mbdec; + int size=h->width*h->height; + WriteContext write, *w=&write; + AVCodecParserContext parser, *pc= &parser; + NalContext nal, *n=&nal; + + + memset(pc, 0, sizeof(AVCodecParserContext)); + pc->buffer_size = 2048; + pc->final_frame = 0; + pc->cur_len= 0; + pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE); + pc->size = 2048; + pc->eof_reached =0; + pc->ifile = h->ifile; + + //init parse + memset(n, 0, sizeof(NalContext)); + n->width = h->width; + n->height = h->height; + n->mb_height = h->mb_height; + n->mb_width = h->mb_width; + n->b4_stride = n->mb_width*4 + 1; + n->mb_stride = n->mb_width + 1; + n->outputed_poc = INT_MIN; +// memset(s, 0, sizeof(EDSlice)); +// ff_init_slice(n, s); +// + + memset(w, 0, sizeof(WriteContext)); + w->bit_buffer_size= FFMAX(1024*256, 6*size + 200); + w->bit_buffer= av_mallocz(w->bit_buffer_size); + + + + ff_h264dsp_init(&d->hdsp); + ff_h264_pred_init(&d->hpc); + dsputil_init(&d->dsp); + d->hdsp.qpel_put= d->dsp.put_h264_qpel_pixels_tab; + d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab; + d->mb_height = (h->height + 15) / 16; + d->mb_width = (h->width + 15) / 16; + d->linesize = h->width + EDGE_WIDTH*2; + d->uvlinesize = d->linesize>>1; + + for(int i=0; i<16; i++){ + d->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*d->linesize*((scan8[i] - scan8[0])>>3); + } + for(int i=0; i<4; i++){ + d->block_offset[16+i]= + d->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*d->uvlinesize*((scan8[i] - scan8[0])>>3); + } + + d->scratchpad= av_mallocz((h->width+64)*4*16*2*sizeof(uint8_t)); + + 
ff_init_cabac_states(); + + if (init_cabac(h, &hcabac)<0) + return NULL; + + while(!pc->final_frame && frames_max++ < 1000){ + Picture *out; + + RawFrame *frm; + Picture *pic=NULL; + + RawFrame frm_read; + frm_read.state =0; + av_read_frame_internal(pc, &frm_read); + frm = &frm_read; + + if (frm->state < 0) + break; +/* + { + pthread_mutex_lock(&h->lock[PARSE2]); + while (h->slice_cnt<=0) + pthread_cond_wait(&h->cond[PARSE2], &h->lock[PARSE2]); + h->slice_cnt--; + s= &h->slices[h->slice_next++]; + h->slice_next %= MAX_SLICE_COUNT; + pthread_mutex_unlock(&h->lock[PARSE2]); + }*/ + ff_init_slice(n, s); + reset_h264mb(s, n->mb_width, n->mb_height); + for(int i=0; ipicture[i].reference==0){ + pic= &h->picture[i]; + break; + } + } +// { +// pthread_mutex_lock(&h->lock[PARSE3]); +// while (h->free_pic_cnt<=0) +// pthread_cond_wait(&h->cond[PARSE3], &h->lock[PARSE3]); +// h->free_pic_cnt--; +// /* use first free picture */ +// for(int i=0; ipicture[i].reference==0){ +// pic= &h->picture[i]; +// break; +// } +// } +// pthread_mutex_unlock(&h->lock[PARSE3]); +// } + ff_alloc_picture(n, s, pic); + + decode_nal_units(n, s, frm, pic); + + + decode_slice_entropy(&hcabac, &cabac, s); + memcpy( s2, s, sizeof(MBSlice)); //this only copys the COMMON_SLICE part + av_freep(&s->gb.raw); + decode_slice_mb_seq(d, s2); + +// if (s2->release_cnt>0) { +// int i; +// for (i=0; irelease_cnt; i++){ +// if ((s2->release_ref[i]->reference & ~2) == 0) +// default_release_buffer(h, s2->release_ref[i]); +// else +// s2->release_ref[i]->reference &= ~2; +// } +// s->release_cnt=0; +// } + +if (s->release_cnt>0) { + int i; + for (i=0; irelease_cnt; i++){ + s->release_ref[i]->reference &= ~2; + } + s->release_cnt=0; +} + + + { + pthread_mutex_lock(&h->lock[PARSE2]); + h->slice_cnt++; + pthread_cond_signal(&h->cond[PARSE2]); + pthread_mutex_unlock(&h->lock[PARSE2]); + } + + out =output_frame(w, s2->current_picture, h->ofile, h->width, h->height); + print_report(w->frame_number, w->video_size, 0); + + 
if (out){ +// if ((out->reference & ~1) == 0) +// default_release_buffer(h, out); +// else + out->reference &= ~1; + } + + { + pthread_mutex_lock(&h->lock[ENTROPY]); + h->ed_cnt--; + pthread_cond_signal(&h->cond[ENTROPY]); + pthread_mutex_unlock(&h->lock[ENTROPY]); + } + } + while (output_frame(w, NULL, h->ofile, h->width, h->height)); + print_report(w->frame_number, w->video_size, 1); + + av_free(w->bit_buffer); + + {//propagate exit + pthread_mutex_lock(&h->lock[WRITE]); + while (h->write_cnt>= MAX_DELAYED_PIC_COUNT) + pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]); + last_pic.reference = -1; + h->write_q[h->write_fi] = &last_pic; + h->write_cnt++; + h->write_fi++; h->write_fi %= MAX_DELAYED_PIC_COUNT; + pthread_cond_signal(&h->cond[WRITE]); + pthread_mutex_unlock(&h->lock[WRITE]); + + } + free_cabac(&hcabac); + + pthread_exit(NULL); + return NULL; + +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/simple_idct.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/simple_idct.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,372 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * simpleidct in C. 
+ */ + +/* + based upon some outcommented c code from mpeg2dec (idct_mmx.c + written by Aaron Holtzman ) + */ +#include "avcodec.h" +#include "dsputil.h" +#include "mathops.h" +#include "simple_idct.h" + +#if 0 +#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ +#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ +#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */ +#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */ +#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */ +#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */ +#define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */ +#define ROW_SHIFT 8 +#define COL_SHIFT 17 +#else +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define ROW_SHIFT 11 +#define COL_SHIFT 20 // 6 +#endif + +static inline void idctRowCondDC (DCTELEM * row) +{ + int a0, a1, a2, a3, b0, b1, b2, b3; + uint64_t temp; + +#if HAVE_BIGENDIAN +#define ROW0_MASK 0xffff000000000000LL +#else +#define ROW0_MASK 0xffffLL +#endif + if(sizeof(DCTELEM)==2){ + if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) | + ((uint64_t *)row)[1]) == 0) { + temp = (row[0] << 3) & 0xffff; + temp += temp << 16; + temp += temp << 32; + ((uint64_t *)row)[0] = temp; + ((uint64_t *)row)[1] = temp; + return; + } + }else{ + if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) { + row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3; + return; + } + } + + a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); + a1 = a0; + a2 = a0; + a3 = a0; + + /* no need to optimize : gcc does it */ + a0 += W2 * row[2]; + a1 += W6 * row[2]; + a2 -= W6 * row[2]; + a3 -= W2 * row[2]; + + b0 = MUL16(W1, row[1]); + MAC16(b0, W3, row[3]); + b1 = MUL16(W3, row[1]); + 
MAC16(b1, -W7, row[3]); + b2 = MUL16(W5, row[1]); + MAC16(b2, -W1, row[3]); + b3 = MUL16(W7, row[1]); + MAC16(b3, -W5, row[3]); + + temp = ((uint64_t*)row)[1]; + + if (temp != 0) { + a0 += W4*row[4] + W6*row[6]; + a1 += - W4*row[4] - W2*row[6]; + a2 += - W4*row[4] + W2*row[6]; + a3 += W4*row[4] - W6*row[6]; + + MAC16(b0, W5, row[5]); + MAC16(b0, W7, row[7]); + + MAC16(b1, -W1, row[5]); + MAC16(b1, -W5, row[7]); + + MAC16(b2, W7, row[5]); + MAC16(b2, W3, row[7]); + + MAC16(b3, W3, row[5]); + MAC16(b3, -W1, row[7]); + } + + row[0] = (a0 + b0) >> ROW_SHIFT; + row[7] = (a0 - b0) >> ROW_SHIFT; + row[1] = (a1 + b1) >> ROW_SHIFT; + row[6] = (a1 - b1) >> ROW_SHIFT; + row[2] = (a2 + b2) >> ROW_SHIFT; + row[5] = (a2 - b2) >> ROW_SHIFT; + row[3] = (a3 + b3) >> ROW_SHIFT; + row[4] = (a3 - b3) >> ROW_SHIFT; +} + +static inline void idctSparseColPut (uint8_t *dest, int line_size, + DCTELEM * col) +{ + int a0, a1, a2, a3, b0, b1, b2, b3; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + + /* XXX: I did that only to give same values as previous code */ + a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); + a1 = a0; + a2 = a0; + a3 = a0; + + a0 += + W2*col[8*2]; + a1 += + W6*col[8*2]; + a2 += - W6*col[8*2]; + a3 += - W2*col[8*2]; + + b0 = MUL16(W1, col[8*1]); + b1 = MUL16(W3, col[8*1]); + b2 = MUL16(W5, col[8*1]); + b3 = MUL16(W7, col[8*1]); + + MAC16(b0, + W3, col[8*3]); + MAC16(b1, - W7, col[8*3]); + MAC16(b2, - W1, col[8*3]); + MAC16(b3, - W5, col[8*3]); + + if(col[8*4]){ + a0 += + W4*col[8*4]; + a1 += - W4*col[8*4]; + a2 += - W4*col[8*4]; + a3 += + W4*col[8*4]; + } + + if (col[8*5]) { + MAC16(b0, + W5, col[8*5]); + MAC16(b1, - W1, col[8*5]); + MAC16(b2, + W7, col[8*5]); + MAC16(b3, + W3, col[8*5]); + } + + if(col[8*6]){ + a0 += + W6*col[8*6]; + a1 += - W2*col[8*6]; + a2 += + W2*col[8*6]; + a3 += - W6*col[8*6]; + } + + if (col[8*7]) { + MAC16(b0, + W7, col[8*7]); + MAC16(b1, - W5, col[8*7]); + MAC16(b2, + W3, col[8*7]); + MAC16(b3, - W1, col[8*7]); + } + + dest[0] = cm[(a0 + b0) >> 
COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a1 + b1) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a2 + b2) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a3 + b3) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a3 - b3) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a2 - b2) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a1 - b1) >> COL_SHIFT]; + dest += line_size; + dest[0] = cm[(a0 - b0) >> COL_SHIFT]; +} + +static inline void idctSparseColAdd (uint8_t *dest, int line_size, + DCTELEM * col) +{ + int a0, a1, a2, a3, b0, b1, b2, b3; + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; + + /* XXX: I did that only to give same values as previous code */ + a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); + a1 = a0; + a2 = a0; + a3 = a0; + + a0 += + W2*col[8*2]; + a1 += + W6*col[8*2]; + a2 += - W6*col[8*2]; + a3 += - W2*col[8*2]; + + b0 = MUL16(W1, col[8*1]); + b1 = MUL16(W3, col[8*1]); + b2 = MUL16(W5, col[8*1]); + b3 = MUL16(W7, col[8*1]); + + MAC16(b0, + W3, col[8*3]); + MAC16(b1, - W7, col[8*3]); + MAC16(b2, - W1, col[8*3]); + MAC16(b3, - W5, col[8*3]); + + if(col[8*4]){ + a0 += + W4*col[8*4]; + a1 += - W4*col[8*4]; + a2 += - W4*col[8*4]; + a3 += + W4*col[8*4]; + } + + if (col[8*5]) { + MAC16(b0, + W5, col[8*5]); + MAC16(b1, - W1, col[8*5]); + MAC16(b2, + W7, col[8*5]); + MAC16(b3, + W3, col[8*5]); + } + + if(col[8*6]){ + a0 += + W6*col[8*6]; + a1 += - W2*col[8*6]; + a2 += + W2*col[8*6]; + a3 += - W6*col[8*6]; + } + + if (col[8*7]) { + MAC16(b0, + W7, col[8*7]); + MAC16(b1, - W5, col[8*7]); + MAC16(b2, + W3, col[8*7]); + MAC16(b3, - W1, col[8*7]); + } + + dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a2 - 
b2) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)]; + dest += line_size; + dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)]; +} + +static inline void idctSparseCol (DCTELEM * col) +{ + int a0, a1, a2, a3, b0, b1, b2, b3; + + /* XXX: I did that only to give same values as previous code */ + a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); + a1 = a0; + a2 = a0; + a3 = a0; + + a0 += + W2*col[8*2]; + a1 += + W6*col[8*2]; + a2 += - W6*col[8*2]; + a3 += - W2*col[8*2]; + + b0 = MUL16(W1, col[8*1]); + b1 = MUL16(W3, col[8*1]); + b2 = MUL16(W5, col[8*1]); + b3 = MUL16(W7, col[8*1]); + + MAC16(b0, + W3, col[8*3]); + MAC16(b1, - W7, col[8*3]); + MAC16(b2, - W1, col[8*3]); + MAC16(b3, - W5, col[8*3]); + + if(col[8*4]){ + a0 += + W4*col[8*4]; + a1 += - W4*col[8*4]; + a2 += - W4*col[8*4]; + a3 += + W4*col[8*4]; + } + + if (col[8*5]) { + MAC16(b0, + W5, col[8*5]); + MAC16(b1, - W1, col[8*5]); + MAC16(b2, + W7, col[8*5]); + MAC16(b3, + W3, col[8*5]); + } + + if(col[8*6]){ + a0 += + W6*col[8*6]; + a1 += - W2*col[8*6]; + a2 += + W2*col[8*6]; + a3 += - W6*col[8*6]; + } + + if (col[8*7]) { + MAC16(b0, + W7, col[8*7]); + MAC16(b1, - W5, col[8*7]); + MAC16(b2, + W3, col[8*7]); + MAC16(b3, - W1, col[8*7]); + } + + col[0 ] = ((a0 + b0) >> COL_SHIFT); + col[8 ] = ((a1 + b1) >> COL_SHIFT); + col[16] = ((a2 + b2) >> COL_SHIFT); + col[24] = ((a3 + b3) >> COL_SHIFT); + col[32] = ((a3 - b3) >> COL_SHIFT); + col[40] = ((a2 - b2) >> COL_SHIFT); + col[48] = ((a1 - b1) >> COL_SHIFT); + col[56] = ((a0 - b0) >> COL_SHIFT); +} + +void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + int i; + for(i=0; i<8; i++) + idctRowCondDC(block + i*8); + + for(i=0; i<8; i++) + idctSparseColPut(dest + i, line_size, block + i); +} + +void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + int i; + for(i=0; i<8; i++) + idctRowCondDC(block + i*8); + + for(i=0; i<8; i++) + idctSparseColAdd(dest + i, line_size, block + i); +} + +void 
ff_simple_idct(DCTELEM *block) +{ + int i; + for(i=0; i<8; i++) + idctRowCondDC(block + i*8); + + for(i=0; i<8; i++) + idctSparseCol(block + i); +} diff -r 11d15c47beaf -r 897f711a7157 libavcodec/simple_idct.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/simple_idct.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,47 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * simple idct header. 
+ */ + +#ifndef AVCODEC_SIMPLE_IDCT_H +#define AVCODEC_SIMPLE_IDCT_H + +#include <stdint.h> /* uint8_t / int16_t used by the prototypes below */ +#include "dsputil.h" +/* C entry points: each runs a row pass and then a column pass over the 8x8 coefficient block. _put stores the result into dest, _add adds it to the pixels already in dest, and the plain variant writes the result back into block. */ +void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block); +void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block); +void ff_simple_idct_mmx(int16_t *block); +void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block); +void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block); +void ff_simple_idct(DCTELEM *block); + +void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block); + +void ff_simple_idct84_add(uint8_t *dest, int line_size, DCTELEM *block); +void ff_simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block); +void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block); + +#endif /* AVCODEC_SIMPLE_IDCT_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/utils.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/utils.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,68 @@ +/* + * utils for libavcodec + * Copyright (c) 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * utils. 
+ */ + +/* needed for mkstemp() */ +#define _XOPEN_SOURCE 600 + +#include "avcodec.h" +#include "dsputil.h" + +#include +#include +#include +#include +//#undef NDEBUG +#include + +#include +/* Grow a buffer only when needed: if min_size < *size the pointer is returned untouched; otherwise *size becomes the larger of 17*min_size/16 + 32 and min_size and the buffer is passed to av_realloc. On allocation failure NULL is returned and *size is set to 0. */ +void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size) +{ + if(min_size < *size) + return ptr; + + *size= FFMAX(17*min_size/16 + 32, min_size); + + ptr= av_realloc(ptr, *size); + if(!ptr) //we could set this to the unmodified min_size but this is safer if the user lost the ptr and uses NULL now + *size= 0; + + return ptr; +} +/* Like av_fast_realloc but ptr is the address of the buffer pointer (read as void **): when min_size >= *size the old buffer is released with av_free and a new one of the padded size is obtained from av_malloc, so previous contents are not carried over. On failure *ptr is NULL and *size is 0. */ +void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size) +{ + void **p = ptr; + if (min_size < *size) + return; + *size= FFMAX(17*min_size/16 + 32, min_size); + av_free(*p); + *p = av_malloc(*size); + if (!*p) *size = 0; +} + + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/cpuid.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/x86/cpuid.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,135 @@ +/* + * CPU detection code, extracted from mmx.h + * (c)1997-99 by H. Dietz and R. Fisher + * Converted to C and improved by Fabrice Bellard. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" + +#undef printf + +/* ebx saving is necessary for PIC. gcc seems unable to see it alone */ +#define cpuid(index,eax,ebx,ecx,edx)\ + __asm__ volatile\ + ("mov %%"REG_b", %%"REG_S"\n\t"\ + "cpuid\n\t"\ + "xchg %%"REG_b", %%"REG_S\ + : "=a" (eax), "=S" (ebx),\ + "=c" (ecx), "=d" (edx)\ + : "0" (index)); + +/* Function to test if multimedia instructions are supported: queries CPUID leaves 0/1 and 0x80000000/0x80000001 and returns the OR of the FF_MM_* flags found. On x86-32 it returns 0 when the EFLAGS ID bit cannot be toggled (no CPUID). */ +int mm_support(void) +{ + int rval = 0; + int eax, ebx, ecx, edx; + int max_std_level, max_ext_level, std_caps=0, ext_caps=0; + +#if ARCH_X86_32 + x86_reg a, c; + __asm__ volatile ( + /* See if CPUID instruction is supported ... */ + /* ... Get copies of EFLAGS into eax and ecx */ + "pushfl\n\t" + "pop %0\n\t" + "mov %0, %1\n\t" + + /* ... Toggle the ID bit in one copy and store */ + /* to the EFLAGS reg */ + "xor $0x200000, %0\n\t" + "push %0\n\t" + "popfl\n\t" + + /* ... 
Get the (hopefully modified) EFLAGS */ + "pushfl\n\t" + "pop %0\n\t" + : "=a" (a), "=c" (c) + : + : "cc" + ); + + if (a == c) + return 0; /* CPUID not supported */ +#endif + + cpuid(0, max_std_level, ebx, ecx, edx); + + if(max_std_level >= 1){ + cpuid(1, eax, ebx, ecx, std_caps); + if (std_caps & (1<<23)) + rval |= FF_MM_MMX; + if (std_caps & (1<<25)) + rval |= FF_MM_MMX2 +#if HAVE_SSE + | FF_MM_SSE; + if (std_caps & (1<<26)) + rval |= FF_MM_SSE2; + if (ecx & 1) + rval |= FF_MM_SSE3; + if (ecx & 0x00000200 ) + rval |= FF_MM_SSSE3; + if (ecx & 0x00080000 ) + rval |= FF_MM_SSE4; + if (ecx & 0x00100000 ) + rval |= FF_MM_SSE42; +#endif + ; + } + + cpuid(0x80000000, max_ext_level, ebx, ecx, edx); + + if(max_ext_level >= 0x80000001){ + cpuid(0x80000001, eax, ebx, ecx, ext_caps); + if (ext_caps & (1U<<31)) /* 1U: shifting a signed 1 into bit 31 is undefined behaviour */ + rval |= FF_MM_3DNOW; + if (ext_caps & (1<<30)) + rval |= FF_MM_3DNOWEXT; + if (ext_caps & (1<<23)) + rval |= FF_MM_MMX; + if (ext_caps & (1<<22)) + rval |= FF_MM_MMX2; + } + +#if 0 + av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s\n", + (rval&FF_MM_MMX) ? "MMX ":"", + (rval&FF_MM_MMX2) ? "MMX2 ":"", + (rval&FF_MM_SSE) ? "SSE ":"", + (rval&FF_MM_SSE2) ? "SSE2 ":"", + (rval&FF_MM_SSE3) ? "SSE3 ":"", + (rval&FF_MM_SSSE3) ? "SSSE3 ":"", + (rval&FF_MM_SSE4) ? "SSE4.1 ":"", + (rval&FF_MM_SSE42) ? "SSE4.2 ":"", + (rval&FF_MM_3DNOW) ? "3DNow ":"", + (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":""); +#endif + return rval; +} + +#ifdef TEST +int main ( void ) +{ + int mm_flags; + mm_flags = mm_support(); + printf("mm_support = 0x%08X\n",mm_flags); + return 0; +} +#endif diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/dsputil_h264_template_mmx.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/x86/dsputil_h264_template_mmx.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2005 Zoltan Hidvegi , + * Loren Merritt + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * MMX optimized version of (put|avg)_h264_chroma_mc8. + * H264_CHROMA_MC8_TMPL must be defined to the desired function name + * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg + * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function + */ +static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg) +{ + DECLARE_ALIGNED(8, uint64_t, AA); + DECLARE_ALIGNED(8, uint64_t, DD); + int i; + + if(y==0 && x==0) { + /* no filter needed */ + H264_CHROMA_MC8_MV0(dst, src, stride, h); + return; + } + + assert(x<8 && y<8 && x>=0 && y>=0); + + if(y==0 || x==0) + { + /* 1 dimensional filter only */ + const int dxy = x ? 
1 : stride; + + __asm__ volatile( + "movd %0, %%mm5\n\t" + "movq %1, %%mm4\n\t" + "movq %2, %%mm6\n\t" /* mm6 = rnd >> 3 */ + "punpcklwd %%mm5, %%mm5\n\t" + "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */ + "pxor %%mm7, %%mm7\n\t" + "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */ + :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1))); + + for(i=0; i> 3)) >> 3 */ + "paddw %%mm6, %%mm0\n\t" + "paddw %%mm6, %%mm1\n\t" + "paddw %%mm2, %%mm0\n\t" + "paddw %%mm3, %%mm1\n\t" + "psrlw $3, %%mm0\n\t" + "psrlw $3, %%mm1\n\t" + "packuswb %%mm1, %%mm0\n\t" + H264_CHROMA_OP(%0, %%mm0) + "movq %%mm0, %0\n\t" + : "=m" (dst[0])); + + src += stride; + dst += stride; + } + return; + } + + /* general case, bilinear */ + __asm__ volatile("movd %2, %%mm4\n\t" + "movd %3, %%mm6\n\t" + "punpcklwd %%mm4, %%mm4\n\t" + "punpcklwd %%mm6, %%mm6\n\t" + "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ + "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ + "movq %%mm4, %%mm5\n\t" + "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ + "psllw $3, %%mm5\n\t" + "psllw $3, %%mm6\n\t" + "movq %%mm5, %%mm7\n\t" + "paddw %%mm6, %%mm7\n\t" + "movq %%mm4, %1\n\t" /* DD = x * y */ + "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */ + "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */ + "paddw %4, %%mm4\n\t" + "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */ + "pxor %%mm7, %%mm7\n\t" + "movq %%mm4, %0\n\t" + : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); + + __asm__ volatile( + /* mm0 = src[0..7], mm1 = src[1..8] */ + "movq %0, %%mm0\n\t" + "movq %1, %%mm1\n\t" + : : "m" (src[0]), "m" (src[1])); + + for(i=0; i> 6 */ + "paddw %1, %%mm2\n\t" + "paddw %1, %%mm3\n\t" + "psrlw $6, %%mm2\n\t" + "psrlw $6, %%mm3\n\t" + "packuswb %%mm3, %%mm2\n\t" + H264_CHROMA_OP(%0, %%mm2) + "movq %%mm2, %0\n\t" + : "=m" (dst[0]) : "m" (*rnd_reg)); + dst+= stride; + } +} + +static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg) +{ + __asm__ 
volatile( + "pxor %%mm7, %%mm7 \n\t" + "movd %5, %%mm2 \n\t" + "movd %6, %%mm3 \n\t" + "movq "MANGLE(ff_pw_8)", %%mm4\n\t" + "movq "MANGLE(ff_pw_8)", %%mm5\n\t" + "punpcklwd %%mm2, %%mm2 \n\t" + "punpcklwd %%mm3, %%mm3 \n\t" + "punpcklwd %%mm2, %%mm2 \n\t" + "punpcklwd %%mm3, %%mm3 \n\t" + "psubw %%mm2, %%mm4 \n\t" + "psubw %%mm3, %%mm5 \n\t" + + "movd (%1), %%mm0 \n\t" + "movd 1(%1), %%mm6 \n\t" + "add %3, %1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm6 \n\t" + "pmullw %%mm4, %%mm0 \n\t" + "pmullw %%mm2, %%mm6 \n\t" + "paddw %%mm0, %%mm6 \n\t" + + "1: \n\t" + "movd (%1), %%mm0 \n\t" + "movd 1(%1), %%mm1 \n\t" + "add %3, %1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "pmullw %%mm4, %%mm0 \n\t" + "pmullw %%mm2, %%mm1 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "movq %%mm1, %%mm0 \n\t" + "pmullw %%mm5, %%mm6 \n\t" + "pmullw %%mm3, %%mm1 \n\t" + "paddw %4, %%mm6 \n\t" + "paddw %%mm6, %%mm1 \n\t" + "psrlw $6, %%mm1 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + H264_CHROMA_OP4((%0), %%mm1, %%mm6) + "movd %%mm1, (%0) \n\t" + "add %3, %0 \n\t" + "movd (%1), %%mm6 \n\t" + "movd 1(%1), %%mm1 \n\t" + "add %3, %1 \n\t" + "punpcklbw %%mm7, %%mm6 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "pmullw %%mm4, %%mm6 \n\t" + "pmullw %%mm2, %%mm1 \n\t" + "paddw %%mm6, %%mm1 \n\t" + "movq %%mm1, %%mm6 \n\t" + "pmullw %%mm5, %%mm0 \n\t" + "pmullw %%mm3, %%mm1 \n\t" + "paddw %4, %%mm0 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "psrlw $6, %%mm1 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + H264_CHROMA_OP4((%0), %%mm1, %%mm0) + "movd %%mm1, (%0) \n\t" + "add %3, %0 \n\t" + "sub $2, %2 \n\t" + "jnz 1b \n\t" + : "+r"(dst), "+r"(src), "+r"(h) + : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y) + ); +} + +#ifdef H264_CHROMA_MC2_TMPL +static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + int tmp = ((1<<16)-1)*x + 8; + int CD= tmp*y; + int AB= (tmp<<3) - CD; + __asm__ volatile( + /* mm5 = {A,B,A,B} */ + /* mm6 = 
{C,D,C,D} */ + "movd %0, %%mm5\n\t" + "movd %1, %%mm6\n\t" + "punpckldq %%mm5, %%mm5\n\t" + "punpckldq %%mm6, %%mm6\n\t" + "pxor %%mm7, %%mm7\n\t" + /* mm0 = src[0,1,1,2] */ + "movd %2, %%mm2\n\t" + "punpcklbw %%mm7, %%mm2\n\t" + "pshufw $0x94, %%mm2, %%mm2\n\t" + :: "r"(AB), "r"(CD), "m"(src[0])); + + + __asm__ volatile( + "1:\n\t" + "add %4, %1\n\t" + /* mm1 = A * src[0,1] + B * src[1,2] */ + "movq %%mm2, %%mm1\n\t" + "pmaddwd %%mm5, %%mm1\n\t" + /* mm0 = src[0,1,1,2] */ + "movd (%1), %%mm0\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "pshufw $0x94, %%mm0, %%mm0\n\t" + /* mm1 += C * src[0,1] + D * src[1,2] */ + "movq %%mm0, %%mm2\n\t" + "pmaddwd %%mm6, %%mm0\n\t" + "paddw %3, %%mm1\n\t" + "paddw %%mm0, %%mm1\n\t" + /* dst[0,1] = pack((mm1 + 32) >> 6) */ + "psrlw $6, %%mm1\n\t" + "packssdw %%mm7, %%mm1\n\t" + "packuswb %%mm7, %%mm1\n\t" + H264_CHROMA_OP4((%0), %%mm1, %%mm3) + "movd %%mm1, %%esi\n\t" + "movw %%si, (%0)\n\t" + "add %4, %0\n\t" + "sub $1, %2\n\t" + "jnz 1b\n\t" + : "+r" (dst), "+r"(src), "+r"(h) + : "m" (ff_pw_32), "r"((x86_reg)stride) + : "%esi"); + +} +#endif + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/dsputil_h264_template_ssse3.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/x86/dsputil_h264_template_ssse3.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2008 Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * SSSE3 optimized version of (put|avg)_h264_chroma_mc8. + * H264_CHROMA_MC8_TMPL must be defined to the desired function name + * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function + * AVG_OP must be defined to empty for put and the identity for avg; rnd selects the rounding constant (ff_pw_4 / ff_pw_32 when non-zero, ff_pw_3 / ff_pw_28 when zero) + */ +static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) +{ + if(y==0 && x==0) { + /* no filter needed */ + H264_CHROMA_MC8_MV0(dst, src, stride, h); + return; + } + + assert(x<8 && y<8 && x>=0 && y>=0); + + if(y==0 || x==0) + { + /* 1 dimensional filter only */ + __asm__ volatile( + "movd %0, %%xmm7 \n\t" + "movq %1, %%xmm6 \n\t" + "pshuflw $0, %%xmm7, %%xmm7 \n\t" + "movlhps %%xmm6, %%xmm6 \n\t" + "movlhps %%xmm7, %%xmm7 \n\t" + :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3)) + ); + + if(x) { + __asm__ volatile( + "1: \n\t" + "movq (%1), %%xmm0 \n\t" + "movq 1(%1), %%xmm1 \n\t" + "movq (%1,%3), %%xmm2 \n\t" + "movq 1(%1,%3), %%xmm3 \n\t" + "punpcklbw %%xmm1, %%xmm0 \n\t" + "punpcklbw %%xmm3, %%xmm2 \n\t" + "pmaddubsw %%xmm7, %%xmm0 \n\t" + "pmaddubsw %%xmm7, %%xmm2 \n\t" + AVG_OP("movq (%0), %%xmm4 \n\t") + AVG_OP("movhps (%0,%3), %%xmm4 \n\t") + "paddw %%xmm6, %%xmm0 \n\t" + "paddw %%xmm6, %%xmm2 \n\t" + "psrlw $3, %%xmm0 \n\t" + "psrlw $3, %%xmm2 \n\t" + "packuswb %%xmm2, %%xmm0 \n\t" + AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") + "movq %%xmm0, (%0) \n\t" + "movhps %%xmm0, (%0,%3) \n\t" + "sub $2, %2 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%0,%3,2), %0 \n\t" + "jg 1b \n\t" + :"+r"(dst), "+r"(src), "+r"(h) + :"r"((x86_reg)stride) + ); + } else { + __asm__ volatile( + "1: \n\t" + "movq (%1), %%xmm0 \n\t" + "movq (%1,%3), %%xmm1 \n\t" + "movdqa %%xmm1, %%xmm2 \n\t" + "movq (%1,%3,2), %%xmm3 \n\t" + 
"punpcklbw %%xmm1, %%xmm0 \n\t" + "punpcklbw %%xmm3, %%xmm2 \n\t" + "pmaddubsw %%xmm7, %%xmm0 \n\t" + "pmaddubsw %%xmm7, %%xmm2 \n\t" + AVG_OP("movq (%0), %%xmm4 \n\t") + AVG_OP("movhps (%0,%3), %%xmm4 \n\t") + "paddw %%xmm6, %%xmm0 \n\t" + "paddw %%xmm6, %%xmm2 \n\t" + "psrlw $3, %%xmm0 \n\t" + "psrlw $3, %%xmm2 \n\t" + "packuswb %%xmm2, %%xmm0 \n\t" + AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") + "movq %%xmm0, (%0) \n\t" + "movhps %%xmm0, (%0,%3) \n\t" + "sub $2, %2 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%0,%3,2), %0 \n\t" + "jg 1b \n\t" + :"+r"(dst), "+r"(src), "+r"(h) + :"r"((x86_reg)stride) + ); + } + return; + } + + /* general case, bilinear */ + __asm__ volatile( + "movd %0, %%xmm7 \n\t" + "movd %1, %%xmm6 \n\t" + "movdqa %2, %%xmm5 \n\t" + "pshuflw $0, %%xmm7, %%xmm7 \n\t" + "pshuflw $0, %%xmm6, %%xmm6 \n\t" + "movlhps %%xmm7, %%xmm7 \n\t" + "movlhps %%xmm6, %%xmm6 \n\t" + :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28)) + ); + + __asm__ volatile( + "movq (%1), %%xmm0 \n\t" + "movq 1(%1), %%xmm1 \n\t" + "punpcklbw %%xmm1, %%xmm0 \n\t" + "add %3, %1 \n\t" + "1: \n\t" + "movq (%1), %%xmm1 \n\t" + "movq 1(%1), %%xmm2 \n\t" + "movq (%1,%3), %%xmm3 \n\t" + "movq 1(%1,%3), %%xmm4 \n\t" + "lea (%1,%3,2), %1 \n\t" + "punpcklbw %%xmm2, %%xmm1 \n\t" + "punpcklbw %%xmm4, %%xmm3 \n\t" + "movdqa %%xmm1, %%xmm2 \n\t" + "movdqa %%xmm3, %%xmm4 \n\t" + "pmaddubsw %%xmm7, %%xmm0 \n\t" + "pmaddubsw %%xmm6, %%xmm1 \n\t" + "pmaddubsw %%xmm7, %%xmm2 \n\t" + "pmaddubsw %%xmm6, %%xmm3 \n\t" + "paddw %%xmm5, %%xmm0 \n\t" + "paddw %%xmm5, %%xmm2 \n\t" + "paddw %%xmm0, %%xmm1 \n\t" + "paddw %%xmm2, %%xmm3 \n\t" + "movdqa %%xmm4, %%xmm0 \n\t" + "psrlw $6, %%xmm1 \n\t" + "psrlw $6, %%xmm3 \n\t" + AVG_OP("movq (%0), %%xmm2 \n\t") + AVG_OP("movhps (%0,%3), %%xmm2 \n\t") + "packuswb %%xmm3, %%xmm1 \n\t" + AVG_OP("pavgb %%xmm2, %%xmm1 \n\t") + "movq %%xmm1, (%0)\n\t" + "movhps %%xmm1, (%0,%3)\n\t" + "sub $2, %2 \n\t" + "lea (%0,%3,2), %0 \n\t" + "jg 1b \n\t" + 
:"+r"(dst), "+r"(src), "+r"(h) + :"r"((x86_reg)stride) + ); +} +/* 4-pixel-wide counterpart working in 64-bit MMX registers: same (x*255+8)*(8-y) / (x*255+8)*y coefficient setup as the general case above, two rows written per loop iteration, rounding constant fixed to ff_pw_32 (there is no rnd parameter and no x==0/y==0 shortcut here). */ +static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + __asm__ volatile( + "movd %0, %%mm7 \n\t" + "movd %1, %%mm6 \n\t" + "movq %2, %%mm5 \n\t" + "pshufw $0, %%mm7, %%mm7 \n\t" + "pshufw $0, %%mm6, %%mm6 \n\t" + :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32) + ); + + __asm__ volatile( + "movd (%1), %%mm0 \n\t" + "punpcklbw 1(%1), %%mm0 \n\t" + "add %3, %1 \n\t" + "1: \n\t" + "movd (%1), %%mm1 \n\t" + "movd (%1,%3), %%mm3 \n\t" + "punpcklbw 1(%1), %%mm1 \n\t" + "punpcklbw 1(%1,%3), %%mm3 \n\t" + "lea (%1,%3,2), %1 \n\t" + "movq %%mm1, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pmaddubsw %%mm7, %%mm0 \n\t" + "pmaddubsw %%mm6, %%mm1 \n\t" + "pmaddubsw %%mm7, %%mm2 \n\t" + "pmaddubsw %%mm6, %%mm3 \n\t" + "paddw %%mm5, %%mm0 \n\t" + "paddw %%mm5, %%mm2 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "paddw %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm0 \n\t" + "psrlw $6, %%mm1 \n\t" + "psrlw $6, %%mm3 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + "packuswb %%mm3, %%mm3 \n\t" + AVG_OP("pavgb (%0), %%mm1 \n\t") + AVG_OP("pavgb (%0,%3), %%mm3 \n\t") + "movd %%mm1, (%0)\n\t" + "movd %%mm3, (%0,%3)\n\t" + "sub $2, %2 \n\t" + "lea (%0,%3,2), %0 \n\t" + "jg 1b \n\t" + :"+r"(dst), "+r"(src), "+r"(h) + :"r"((x86_reg)stride) + ); +} + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/dsputil_mmx.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/x86/dsputil_mmx.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,821 @@ +/* + * MMX optimized DSP utils + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * MMX optimization by Nick Kurshev + */ + +#include "libavutil/x86_cpu.h" +#include "libavutil/internal.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/h264_dsp.h" +#include "dsputil_mmx.h" + + +//#undef NDEBUG +//#include + +int mm_flags; /* multimedia extension flags */ + +/* pixel operations */ +DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; + +DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = +{0x8000000080000000ULL, 0x8000000080000000ULL}; + +DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; +DECLARE_ALIGNED(8, 
const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; + +DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; +DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; + +DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; +DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; + +#define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t" +#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) +#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) + +#define MOVQ_BFE(regd) \ + __asm__ volatile ( \ + "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ + "paddb %%" #regd ", %%" #regd " \n\t" ::) + +#ifndef PIC +#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) +#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) +#else +// for shared library it's better to use this way for accessing constants +// pcmpeqd -> -1 +#define MOVQ_BONE(regd) \ + __asm__ volatile ( \ + "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ + "psrlw $15, %%" #regd " \n\t" \ + "packuswb %%" #regd ", %%" #regd " \n\t" ::) + +#define MOVQ_WTWO(regd) \ + __asm__ volatile ( \ + "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ + "psrlw $15, %%" #regd " \n\t" \ + "psllw $1, %%" #regd " \n\t"::) + +#endif + +// using regr as temporary and for the output result +// first argument is unmodifed and second is trashed +// regfe is supposed 
to contain 0xfefefefefefefefe +#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ + "movq " #rega ", " #regr " \n\t"\ + "pand " #regb ", " #regr " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pand " #regfe "," #regb " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "paddb " #regb ", " #regr " \n\t" + +#define PAVGB_MMX(rega, regb, regr, regfe) \ + "movq " #rega ", " #regr " \n\t"\ + "por " #regb ", " #regr " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pand " #regfe "," #regb " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "psubb " #regb ", " #regr " \n\t" + +// mm6 is supposed to contain 0xfefefefefefefefe +#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ + "movq " #rega ", " #regr " \n\t"\ + "movq " #regc ", " #regp " \n\t"\ + "pand " #regb ", " #regr " \n\t"\ + "pand " #regd ", " #regp " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pxor " #regc ", " #regd " \n\t"\ + "pand %%mm6, " #regb " \n\t"\ + "pand %%mm6, " #regd " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "psrlq $1, " #regd " \n\t"\ + "paddb " #regb ", " #regr " \n\t"\ + "paddb " #regd ", " #regp " \n\t" + +#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ + "movq " #rega ", " #regr " \n\t"\ + "movq " #regc ", " #regp " \n\t"\ + "por " #regb ", " #regr " \n\t"\ + "por " #regd ", " #regp " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pxor " #regc ", " #regd " \n\t"\ + "pand %%mm6, " #regb " \n\t"\ + "pand %%mm6, " #regd " \n\t"\ + "psrlq $1, " #regd " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "psubb " #regb ", " #regr " \n\t"\ + "psubb " #regd ", " #regp " \n\t" + +/***********************************/ +/* MMX2 specific */ + +#define DEF(x) x ## _mmx2 + +/* Introduced only in MMX2 set */ +#define PAVGB "pavgb" +#define OP_AVG PAVGB + +#include "dsputil_mmx_avg_template.c" + +#undef DEF +#undef PAVGB +#undef OP_AVG + +#define put_no_rnd_pixels16_mmx put_pixels16_mmx +#define put_no_rnd_pixels8_mmx put_pixels8_mmx +#define put_pixels16_mmx2 put_pixels16_mmx +#define put_pixels8_mmx2 put_pixels8_mmx 
+#define put_pixels4_mmx2 put_pixels4_mmx +#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx +#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx +#define put_pixels16_3dnow put_pixels16_mmx +#define put_pixels8_3dnow put_pixels8_mmx +#define put_pixels4_3dnow put_pixels4_mmx +#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx +#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx + +/***********************************/ +/* standard MMX */ + +void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +{ + const DCTELEM *p; + uint8_t *pix; + + /* read the pixels */ + p = block; + pix = pixels; + /* unrolled loop */ + __asm__ volatile( + "movq %3, %%mm0 \n\t" + "movq 8%3, %%mm1 \n\t" + "movq 16%3, %%mm2 \n\t" + "movq 24%3, %%mm3 \n\t" + "movq 32%3, %%mm4 \n\t" + "movq 40%3, %%mm5 \n\t" + "movq 48%3, %%mm6 \n\t" + "movq 56%3, %%mm7 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "packuswb %%mm7, %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, (%0, %1) \n\t" + "movq %%mm4, (%0, %1, 2) \n\t" + "movq %%mm6, (%0, %2) \n\t" + ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) + :"memory"); + pix += line_size*4; + p += 32; + + // if here would be an exact copy of the code above + // compiler would generate some very strange code + // thus using "r" + __asm__ volatile( + "movq (%3), %%mm0 \n\t" + "movq 8(%3), %%mm1 \n\t" + "movq 16(%3), %%mm2 \n\t" + "movq 24(%3), %%mm3 \n\t" + "movq 32(%3), %%mm4 \n\t" + "movq 40(%3), %%mm5 \n\t" + "movq 48(%3), %%mm6 \n\t" + "movq 56(%3), %%mm7 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "packuswb %%mm7, %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, (%0, %1) \n\t" + "movq %%mm4, (%0, %1, 2) \n\t" + "movq %%mm6, (%0, %2) \n\t" + ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) + :"memory"); +} + +DECLARE_ASM_CONST(8, uint8_t, 
ff_vector128)[8] = + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; + +#define put_signed_pixels_clamped_mmx_half(off) \ + "movq "#off"(%2), %%mm1 \n\t"\ + "movq 16+"#off"(%2), %%mm2 \n\t"\ + "movq 32+"#off"(%2), %%mm3 \n\t"\ + "movq 48+"#off"(%2), %%mm4 \n\t"\ + "packsswb 8+"#off"(%2), %%mm1 \n\t"\ + "packsswb 24+"#off"(%2), %%mm2 \n\t"\ + "packsswb 40+"#off"(%2), %%mm3 \n\t"\ + "packsswb 56+"#off"(%2), %%mm4 \n\t"\ + "paddb %%mm0, %%mm1 \n\t"\ + "paddb %%mm0, %%mm2 \n\t"\ + "paddb %%mm0, %%mm3 \n\t"\ + "paddb %%mm0, %%mm4 \n\t"\ + "movq %%mm1, (%0) \n\t"\ + "movq %%mm2, (%0, %3) \n\t"\ + "movq %%mm3, (%0, %3, 2) \n\t"\ + "movq %%mm4, (%0, %1) \n\t" + +void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +{ + x86_reg line_skip = line_size; + x86_reg line_skip3; + + __asm__ volatile ( + "movq "MANGLE(ff_vector128)", %%mm0 \n\t" + "lea (%3, %3, 2), %1 \n\t" + put_signed_pixels_clamped_mmx_half(0) + "lea (%0, %3, 4), %0 \n\t" + put_signed_pixels_clamped_mmx_half(64) + :"+&r" (pixels), "=&r" (line_skip3) + :"r" (block), "r"(line_skip) + :"memory"); +} + +void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +{ + const DCTELEM *p; + uint8_t *pix; + int i; + + /* read the pixels */ + p = block; + pix = pixels; + MOVQ_ZERO(mm7); + i = 4; + do { + __asm__ volatile( + "movq (%2), %%mm0 \n\t" + "movq 8(%2), %%mm1 \n\t" + "movq 16(%2), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "movq %0, %%mm4 \n\t" + "movq %1, %%mm6 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddsw %%mm4, %%mm0 \n\t" + "paddsw %%mm5, %%mm1 \n\t" + "movq %%mm6, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm6 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddsw %%mm6, %%mm2 \n\t" + "paddsw %%mm5, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "movq %%mm0, %0 \n\t" + "movq %%mm2, %1 \n\t" + :"+m"(*pix), "+m"(*(pix+line_size)) + :"r"(p) + :"memory"); + pix += line_size*2; + 
p += 16; + } while (--i); +} + +static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r" (pixels), "+r" (block) + : "r"((x86_reg)line_size) + : "%"REG_a, "memory" + ); +} + +static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "1: \n\t" + "movdqu (%1), %%xmm0 \n\t" + "movdqu (%1,%3), %%xmm1 \n\t" + "movdqu (%1,%3,2), %%xmm2 \n\t" + "movdqu (%1,%4), %%xmm3 \n\t" + "movdqa %%xmm0, (%2) \n\t" + "movdqa %%xmm1, (%2,%3) \n\t" + "movdqa %%xmm2, (%2,%3,2) \n\t" + "movdqa %%xmm3, (%2,%4) \n\t" + "subl $4, %0 \n\t" + "lea (%1,%3,4), %1 \n\t" + "lea (%2,%3,4), %2 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r" (pixels), "+r" (block) + : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) + : "memory" + ); +} + +static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "1: \n\t" + "movdqu (%1), %%xmm0 \n\t" + "movdqu (%1,%3), %%xmm1 \n\t" + "movdqu (%1,%3,2), %%xmm2 \n\t" + "movdqu (%1,%4), %%xmm3 \n\t" + "pavgb (%2), %%xmm0 \n\t" + "pavgb (%2,%3), %%xmm1 \n\t" + "pavgb (%2,%3,2), %%xmm2 \n\t" + "pavgb (%2,%4), %%xmm3 \n\t" + "movdqa %%xmm0, (%2) \n\t" + "movdqa %%xmm1, (%2,%3) \n\t" + "movdqa %%xmm2, (%2,%3,2) \n\t" + "movdqa %%xmm3, (%2,%4) \n\t" + "subl $4, %0 \n\t" + "lea (%1,%3,4), %1 \n\t" + "lea (%2,%3,4), %2 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r" (pixels), "+r" (block) + : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) + : "memory" + ); +} + +static void 
clear_block_sse(DCTELEM *block) +{ + __asm__ volatile( + "xorps %%xmm0, %%xmm0 \n" + "movaps %%xmm0, (%0) \n" + "movaps %%xmm0, 16(%0) \n" + "movaps %%xmm0, 32(%0) \n" + "movaps %%xmm0, 48(%0) \n" + "movaps %%xmm0, 64(%0) \n" + "movaps %%xmm0, 80(%0) \n" + "movaps %%xmm0, 96(%0) \n" + "movaps %%xmm0, 112(%0) \n" + :: "r"(block) + : "memory" + ); +} + +static void clear_blocks_sse(DCTELEM *blocks) +{\ + __asm__ volatile( + "xorps %%xmm0, %%xmm0 \n" + "mov %1, %%"REG_a" \n" + "1: \n" + "movaps %%xmm0, (%0, %%"REG_a") \n" + "movaps %%xmm0, 16(%0, %%"REG_a") \n" + "movaps %%xmm0, 32(%0, %%"REG_a") \n" + "movaps %%xmm0, 48(%0, %%"REG_a") \n" + "movaps %%xmm0, 64(%0, %%"REG_a") \n" + "movaps %%xmm0, 80(%0, %%"REG_a") \n" + "movaps %%xmm0, 96(%0, %%"REG_a") \n" + "movaps %%xmm0, 112(%0, %%"REG_a") \n" + "add $128, %%"REG_a" \n" + " js 1b \n" + : : "r" (((uint8_t *)blocks)+128*6), + "i" (-128*6) + : "%"REG_a + ); +} + +static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ + __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... 
+ "movd %4, %%mm0 \n\t" + "movd %5, %%mm1 \n\t" + "movd %6, %%mm2 \n\t" + "movd %7, %%mm3 \n\t" + "punpcklbw %%mm1, %%mm0 \n\t" + "punpcklbw %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "punpcklwd %%mm2, %%mm0 \n\t" + "punpckhwd %%mm2, %%mm1 \n\t" + "movd %%mm0, %0 \n\t" + "punpckhdq %%mm0, %%mm0 \n\t" + "movd %%mm0, %1 \n\t" + "movd %%mm1, %2 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movd %%mm1, %3 \n\t" + + : "=m" (*(uint32_t*)(dst + 0*dst_stride)), + "=m" (*(uint32_t*)(dst + 1*dst_stride)), + "=m" (*(uint32_t*)(dst + 2*dst_stride)), + "=m" (*(uint32_t*)(dst + 3*dst_stride)) + : "m" (*(uint32_t*)(src + 0*src_stride)), + "m" (*(uint32_t*)(src + 1*src_stride)), + "m" (*(uint32_t*)(src + 2*src_stride)), + "m" (*(uint32_t*)(src + 3*src_stride)) + ); +} + +#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ +\ +static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ +}\ +\ +static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[8];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ +}\ +\ +static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ +}\ +\ +static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[8];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ +}\ +\ +static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[8];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ + OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ 
+}\ +\ +static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[8];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ + OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ +}\ +static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, 
int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ +}\ +static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ +}\ 
+static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[9];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ +}\ +static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\ +}\ +\ +static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[32];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ +}\ +\ +static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ +}\ +\ +static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[32];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ +}\ +\ +static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[32];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ + OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ +}\ +\ +static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[32];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ + OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ +}\ +static void OPNAME ## 
qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, 
stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[17*2];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ +}\ +static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[17*2];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ +}\ +static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[17*2];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ +} + +#define PUT_OP(a,b,temp, size) "mov" #size " " 
#a ", " #b " \n\t" +#define AVG_3DNOW_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgusb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" +#define AVG_MMX2_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" + +#define PREFETCH(name, op) \ +static void name(void *mem, int stride, int h){\ + const uint8_t *p= mem;\ + do{\ + __asm__ volatile(#op" %0" :: "m"(*p));\ + p+= stride;\ + }while(--h);\ +} +PREFETCH(prefetch_mmx2, prefetcht0) +#undef PREFETCH + +#include "h264dsp_mmx.c" + +void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); +void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); +void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); +void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); +void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); + +void dsputil_init_mmx(DSPContext* c) +{ + mm_flags = mm_support(); + + if (mm_flags & FF_MM_MMX) { + c->clear_block = clear_block_sse; + c->clear_blocks = clear_blocks_sse; + c->prefetch = prefetch_mmx2; + + +#define H264_QPEL_FUNCS(x, y, CPU)\ + c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\ + c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\ + c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\ + c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; + + if((mm_flags & FF_MM_SSE2)){ + c->put_pixels_tab[0][0] = put_pixels16_sse2; + c->avg_pixels_tab[0][0] = avg_pixels16_sse2; + + } + if(mm_flags & FF_MM_SSE2){ + H264_QPEL_FUNCS(0, 1, sse2); + H264_QPEL_FUNCS(0, 2, sse2); + H264_QPEL_FUNCS(0, 3, sse2); + H264_QPEL_FUNCS(1, 1, sse2); + H264_QPEL_FUNCS(1, 2, sse2); + H264_QPEL_FUNCS(1, 3, sse2); + H264_QPEL_FUNCS(2, 1, sse2); + 
H264_QPEL_FUNCS(2, 2, sse2); + H264_QPEL_FUNCS(2, 3, sse2); + H264_QPEL_FUNCS(3, 1, sse2); + H264_QPEL_FUNCS(3, 2, sse2); + H264_QPEL_FUNCS(3, 3, sse2); + } +#if HAVE_SSSE3 + if(mm_flags & FF_MM_SSSE3){ + H264_QPEL_FUNCS(1, 0, ssse3); + H264_QPEL_FUNCS(1, 1, ssse3); + H264_QPEL_FUNCS(1, 2, ssse3); + H264_QPEL_FUNCS(1, 3, ssse3); + H264_QPEL_FUNCS(2, 0, ssse3); + H264_QPEL_FUNCS(2, 1, ssse3); + H264_QPEL_FUNCS(2, 2, ssse3); + H264_QPEL_FUNCS(2, 3, ssse3); + H264_QPEL_FUNCS(3, 0, ssse3); + H264_QPEL_FUNCS(3, 1, ssse3); + H264_QPEL_FUNCS(3, 2, ssse3); + H264_QPEL_FUNCS(3, 3, ssse3); + + c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd; + c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd; + c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; + c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; + } +#endif + + + } +} + +void ff_h264dsp_init_x86(H264DSPContext *c) +{ + mm_flags = mm_support(); + + if (mm_flags & FF_MM_MMX) { + c->h264_idct_dc_add= + c->h264_idct_add= ff_h264_idct_add_mmx; + c->h264_idct8_dc_add= + c->h264_idct8_add= ff_h264_idct8_add_mmx; + + if (mm_flags & FF_MM_MMX2) { + c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; + c->h264_idct_add8 = ff_h264_idct_add8_mmx2; + c->h264_idct_add16 = ff_h264_idct_add16_mmx2; + c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; + + c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; + c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; + + c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; + c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; + c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; + c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; + c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; + c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; + c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; + + c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; + 
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; + c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; + c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; + c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; + c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; + c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; + c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; + + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; + c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; + c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; + c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; + c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; + } + if(mm_flags & FF_MM_SSE2){ + c->h264_idct8_add = ff_h264_idct8_add_sse2; + c->h264_idct8_add4= ff_h264_idct8_add4_sse2; + } + + } +} + diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/dsputil_mmx.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/x86/dsputil_mmx.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,170 @@ +/* + * MMX optimized DSP utils + * Copyright (c) 2007 Aurelien Jacobs + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DSPUTIL_MMX_H
+#define AVCODEC_X86_DSPUTIL_MMX_H
+
+#include <stdint.h> /* uint8_t, int32_t, uint64_t are used by the declarations below */
+#include "libavcodec/dsputil.h"
+
+typedef struct { uint64_t a, b; } xmm_reg; /* one 128-bit constant stored as two 64-bit halves */
+
+extern const uint64_t ff_bone;
+extern const uint64_t ff_wtwo;
+
+extern const uint64_t ff_pdw_80000000[2];
+
+extern const uint64_t ff_pw_3;
+extern const uint64_t ff_pw_4;
+extern const xmm_reg ff_pw_5;
+extern const xmm_reg ff_pw_8;
+extern const uint64_t ff_pw_15;
+extern const xmm_reg ff_pw_16;
+extern const uint64_t ff_pw_20;
+extern const xmm_reg ff_pw_28;
+extern const xmm_reg ff_pw_32;
+extern const uint64_t ff_pw_42;
+extern const xmm_reg ff_pw_64;
+extern const uint64_t ff_pw_96;
+extern const uint64_t ff_pw_128;
+extern const uint64_t ff_pw_255;
+
+extern const uint64_t ff_pb_1;
+extern const uint64_t ff_pb_3;
+extern const uint64_t ff_pb_7;
+extern const uint64_t ff_pb_1F;
+extern const uint64_t ff_pb_3F;
+extern const uint64_t ff_pb_81;
+extern const uint64_t ff_pb_A1;
+extern const uint64_t ff_pb_FC;
+
+extern const double ff_pd_1[2];
+extern const double ff_pd_2[2];
+/* LOAD4 / STORE4: asm text that movq's four 8-byte values between a..d and the operands k*stride+in (or +out), k = 0..3. */
+#define LOAD4(stride,in,a,b,c,d)\
+    "movq 0*"#stride"+"#in", "#a"\n\t"\
+    "movq 1*"#stride"+"#in", "#b"\n\t"\
+    "movq 2*"#stride"+"#in", "#c"\n\t"\
+    "movq 3*"#stride"+"#in", "#d"\n\t"
+
+#define STORE4(stride,out,a,b,c,d)\
+    "movq "#a", 0*"#stride"+"#out"\n\t"\
+    "movq "#b", 1*"#stride"+"#out"\n\t"\
+    "movq "#c", 2*"#stride"+"#out"\n\t"\
+    "movq "#d", 3*"#stride"+"#out"\n\t"
+
+/* in/out: mma=mma+mmb, mmb=mmb-mma */
+#define SUMSUB_BA( a, b ) \
+    "paddw "#b", "#a" \n\t"\
+    "paddw "#b", "#b" \n\t"\
+    "psubw "#a", "#b" \n\t"
+/* SBUTTERFLY: t = copy of a (mov<m>); a = punpckl<n>(a, b); t = punpckh<n>(t, b). Outputs are a (low interleave) and t (high interleave); b is only read. */
+#define SBUTTERFLY(a,b,t,n,m)\
+    "mov" #m " " #a ", " #t " \n\t" /* abcd */\
+    "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
+    "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */
+/* TRANSPOSE4: 4x4 transpose built from four SBUTTERFLY steps; per the step comments the result rows end up in a, d, t, c (b is scratch). */
+#define TRANSPOSE4(a,b,c,d,t)\
+    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
+    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
+    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
+    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
+
+// e,f,g,h can be memory
+// out: a,d,t,c
+#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\
+    "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
+    "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
+    "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 c3 g3 */\
+    "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
+    SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
+    /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\
+    SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
+    /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
+    SBUTTERFLY(a, c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
+    /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
+    SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\
+    /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
+/* TRANSPOSE8: the x86-64 variant uses %xmm8 as scratch and never touches t; the other variant spills through t and 16+t, so there t must be a memory operand with room for two movdqa stores. */
+#if ARCH_X86_64
+// permutes 01234567 -> 05736421
+#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
+    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
+    SBUTTERFLY(c,d,b,wd,dqa)\
+    SBUTTERFLY(e,f,d,wd,dqa)\
+    SBUTTERFLY(g,h,f,wd,dqa)\
+    SBUTTERFLY(a,c,h,dq,dqa)\
+    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
+    SBUTTERFLY(e,g,b,dq,dqa)\
+    SBUTTERFLY(d,f,g,dq,dqa)\
+    SBUTTERFLY(a,e,f,qdq,dqa)\
+    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
+    SBUTTERFLY(h,b,d,qdq,dqa)\
+    SBUTTERFLY(c,g,b,qdq,dqa)\
+    "movdqa %%xmm8, "#g" \n\t"
+#else
+#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
+    "movdqa "#h", "#t" \n\t"\
+    SBUTTERFLY(a,b,h,wd,dqa)\
+    "movdqa "#h", 16"#t" \n\t"\
+    "movdqa "#t", "#h" \n\t"\
+    SBUTTERFLY(c,d,b,wd,dqa)\
+    SBUTTERFLY(e,f,d,wd,dqa)\
+    SBUTTERFLY(g,h,f,wd,dqa)\
+    SBUTTERFLY(a,c,h,dq,dqa)\
+    "movdqa "#h", "#t" \n\t"\
+    "movdqa 16"#t", "#h" \n\t"\
+    SBUTTERFLY(h,b,c,dq,dqa)\
+    SBUTTERFLY(e,g,b,dq,dqa)\
+    SBUTTERFLY(d,f,g,dq,dqa)\
+    SBUTTERFLY(a,e,f,qdq,dqa)\
+    SBUTTERFLY(h,d,e,qdq,dqa)\
+    "movdqa "#h", 16"#t" \n\t"\
+    "movdqa "#t", "#h" \n\t"\
+    SBUTTERFLY(h,b,d,qdq,dqa)\
+    SBUTTERFLY(c,g,b,qdq,dqa)\
+    "movdqa 16"#t", "#g" \n\t"
+#endif
+/* MOVQ_WONE: set every 16-bit word of regd to 1 (pcmpeqd reg,reg gives all ones, psrlw $15 keeps only the top bit of each word). */
+#define MOVQ_WONE(regd) \
+    __asm__ volatile ( \
+    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+    "psrlw $15, %%" #regd ::)
+
+void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
+void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
+void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
+
+void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+
+void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
+void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
+
+void ff_lpc_compute_autocorr_sse2(const int32_t *data, int len, int lag,
+                                  double *autoc);
+
+void ff_mmx_idct(DCTELEM *block);
+void ff_mmxext_idct(DCTELEM *block);
+
+#endif /* AVCODEC_X86_DSPUTIL_MMX_H */
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/dsputil_mmx_avg_template.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/x86/dsputil_mmx_avg_template.c Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,250 @@
+/*
+ * DSP utils : average functions are compiled twice for 3dnow/mmx2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer
+ *
+ * MMX optimization by Nick Kurshev
+ * mostly rewritten by Michael Niedermayer
+ * and improved by Zdenek Kabelac
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/* put_pixels8_l2: for each of h rows, dst row = PAVGB(8 bytes at src1, 8 bytes at src2). src1 steps by src1Stride, dst by dstStride, src2 is read as consecutive 8-byte rows. NOTE(review): PAVGB and DEF are not defined in this file; presumably the including file supplies the per-CPU byte-average mnemonic and name suffix - confirm. */
+static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+    __asm__ volatile(
+        "testl $1, %0 \n\t" /* odd h: do a single row first so the main loop sees an even count */
+        " jz 1f \n\t"
+        "movq (%1), %%mm0 \n\t"
+        "movq (%2), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        "add $8, %2 \n\t"
+        PAVGB" %%mm1, %%mm0 \n\t"
+        "movq %%mm0, (%3) \n\t"
+        "add %5, %3 \n\t"
+        "decl %0 \n\t"
+        "1: \n\t" /* main loop: 4 rows per pass and entered unconditionally, so the rows left here must be a non-zero multiple of 4 */
+        "movq (%1), %%mm0 \n\t"
+        "add %4, %1 \n\t"
+        "movq (%1), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        PAVGB" (%2), %%mm0 \n\t"
+        PAVGB" 8(%2), %%mm1 \n\t"
+        "movq %%mm0, (%3) \n\t"
+        "add %5, %3 \n\t"
+        "movq %%mm1, (%3) \n\t"
+        "add %5, %3 \n\t"
+        "movq (%1), %%mm0 \n\t"
+        "add %4, %1 \n\t"
+        "movq (%1), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        PAVGB" 16(%2), %%mm0 \n\t"
+        PAVGB" 24(%2), %%mm1 \n\t"
+        "movq %%mm0, (%3) \n\t"
+        "add %5, %3 \n\t"
+        "movq %%mm1, (%3) \n\t"
+        "add %5, %3 \n\t"
+        "add $32, %2 \n\t"
+        "subl $4, %0 \n\t"
+        "jnz 1b \n\t"
+        /* operands are pinned to fixed registers: h=b, src1=a, src2=c, dst=d, src1Stride=S, dstStride=D */
+        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+        :"memory");
+//the following should be used, though better not with gcc ...
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+/* avg_pixels8_l2: same row walk as put_pixels8_l2 (8-byte rows, src2 packed, odd row peeled, then 4 rows per pass), but each PAVGB(src1, src2) result is PAVGB'ed once more with the bytes already at dst before being stored. PAVGB/DEF are defined outside this file. */
+static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+    __asm__ volatile(
+        "testl $1, %0 \n\t" /* odd h: do a single row first */
+        " jz 1f \n\t"
+        "movq (%1), %%mm0 \n\t"
+        "movq (%2), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        "add $8, %2 \n\t"
+        PAVGB" %%mm1, %%mm0 \n\t"
+        PAVGB" (%3), %%mm0 \n\t" /* blend with the existing destination bytes */
+        "movq %%mm0, (%3) \n\t"
+        "add %5, %3 \n\t"
+        "decl %0 \n\t"
+        "1: \n\t" /* main loop: 4 rows per pass, entered unconditionally (remaining rows must be a non-zero multiple of 4) */
+        "movq (%1), %%mm0 \n\t"
+        "add %4, %1 \n\t"
+        "movq (%1), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        PAVGB" (%2), %%mm0 \n\t"
+        PAVGB" 8(%2), %%mm1 \n\t"
+        PAVGB" (%3), %%mm0 \n\t"
+        "movq %%mm0, (%3) \n\t"
+        "add %5, %3 \n\t"
+        PAVGB" (%3), %%mm1 \n\t"
+        "movq %%mm1, (%3) \n\t"
+        "add %5, %3 \n\t"
+        "movq (%1), %%mm0 \n\t"
+        "add %4, %1 \n\t"
+        "movq (%1), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        PAVGB" 16(%2), %%mm0 \n\t"
+        PAVGB" 24(%2), %%mm1 \n\t"
+        PAVGB" (%3), %%mm0 \n\t"
+        "movq %%mm0, (%3) \n\t"
+        "add %5, %3 \n\t"
+        PAVGB" (%3), %%mm1 \n\t"
+        "movq %%mm1, (%3) \n\t"
+        "add %5, %3 \n\t"
+        "add $32, %2 \n\t"
+        "subl $4, %0 \n\t"
+        "jnz 1b \n\t"
+        /* operands are pinned to fixed registers: h=b, src1=a, src2=c, dst=d, src1Stride=S, dstStride=D */
+        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+        :"memory");
+//the following should be used, though better not with gcc ...
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+/* put_pixels16_l2: 16-byte-wide variant: dst row = PAVGB(16 bytes at src1, 16 bytes at src2), handled as two 8-byte halves. src2 is read as consecutive 16-byte rows; src1/dst step by their strides. PAVGB/DEF are defined outside this file. */
+static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+    __asm__ volatile(
+        "testl $1, %0 \n\t" /* odd h: do a single row first */
+        " jz 1f \n\t"
+        "movq (%1), %%mm0 \n\t"
+        "movq 8(%1), %%mm1 \n\t"
+        PAVGB" (%2), %%mm0 \n\t"
+        PAVGB" 8(%2), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        "add $16, %2 \n\t"
+        "movq %%mm0, (%3) \n\t"
+        "movq %%mm1, 8(%3) \n\t"
+        "add %5, %3 \n\t"
+        "decl %0 \n\t"
+        "1: \n\t" /* main loop: 2 rows per pass, entered unconditionally (remaining rows must be a non-zero multiple of 2) */
+        "movq (%1), %%mm0 \n\t"
+        "movq 8(%1), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        PAVGB" (%2), %%mm0 \n\t"
+        PAVGB" 8(%2), %%mm1 \n\t"
+        "movq %%mm0, (%3) \n\t"
+        "movq %%mm1, 8(%3) \n\t"
+        "add %5, %3 \n\t"
+        "movq (%1), %%mm0 \n\t"
+        "movq 8(%1), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        PAVGB" 16(%2), %%mm0 \n\t"
+        PAVGB" 24(%2), %%mm1 \n\t"
+        "movq %%mm0, (%3) \n\t"
+        "movq %%mm1, 8(%3) \n\t"
+        "add %5, %3 \n\t"
+        "add $32, %2 \n\t"
+        "subl $2, %0 \n\t"
+        "jnz 1b \n\t"
+        /* operands are pinned to fixed registers: h=b, src1=a, src2=c, dst=d, src1Stride=S, dstStride=D */
+        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+
+        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+        :"memory");
+//the following should be used, though better not with gcc ...
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+/* avg_pixels16_l2: as put_pixels16_l2 (16-byte rows, src2 packed, odd row peeled, then 2 rows per pass), but every result half is additionally PAVGB'ed with the bytes already at dst before the store. PAVGB/DEF are defined outside this file. */
+static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+    __asm__ volatile(
+        "testl $1, %0 \n\t" /* odd h: do a single row first */
+        " jz 1f \n\t"
+        "movq (%1), %%mm0 \n\t"
+        "movq 8(%1), %%mm1 \n\t"
+        PAVGB" (%2), %%mm0 \n\t"
+        PAVGB" 8(%2), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        "add $16, %2 \n\t"
+        PAVGB" (%3), %%mm0 \n\t"
+        PAVGB" 8(%3), %%mm1 \n\t"
+        "movq %%mm0, (%3) \n\t"
+        "movq %%mm1, 8(%3) \n\t"
+        "add %5, %3 \n\t"
+        "decl %0 \n\t"
+        "1: \n\t" /* main loop: 2 rows per pass, entered unconditionally (remaining rows must be a non-zero multiple of 2) */
+        "movq (%1), %%mm0 \n\t"
+        "movq 8(%1), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        PAVGB" (%2), %%mm0 \n\t"
+        PAVGB" 8(%2), %%mm1 \n\t"
+        PAVGB" (%3), %%mm0 \n\t"
+        PAVGB" 8(%3), %%mm1 \n\t"
+        "movq %%mm0, (%3) \n\t"
+        "movq %%mm1, 8(%3) \n\t"
+        "add %5, %3 \n\t"
+        "movq (%1), %%mm0 \n\t"
+        "movq 8(%1), %%mm1 \n\t"
+        "add %4, %1 \n\t"
+        PAVGB" 16(%2), %%mm0 \n\t"
+        PAVGB" 24(%2), %%mm1 \n\t"
+        PAVGB" (%3), %%mm0 \n\t"
+        PAVGB" 8(%3), %%mm1 \n\t"
+        "movq %%mm0, (%3) \n\t"
+        "movq %%mm1, 8(%3) \n\t"
+        "add %5, %3 \n\t"
+        "add $32, %2 \n\t"
+        "subl $2, %0 \n\t"
+        "jnz 1b \n\t"
+        /* operands are pinned to fixed registers: h=b, src1=a, src2=c, dst=d, src1Stride=S, dstStride=D */
+        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+        :"memory");
+//the following should be used, though better not with gcc ...
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+/* avg_pixels8: block row = PAVGB(block row, pixels row) for 8-byte rows; both pointers step by line_size. Four rows per pass with no odd-row handling and an unconditional first pass, so h must be a non-zero multiple of 4. PAVGB/DEF/REG_a are defined outside this file. */
+static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    __asm__ volatile(
+        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2 * line_size, the step for two rows */
+        "1: \n\t"
+        "movq (%2), %%mm0 \n\t"
+        "movq (%2, %3), %%mm1 \n\t"
+        PAVGB" (%1), %%mm0 \n\t"
+        PAVGB" (%1, %3), %%mm1 \n\t"
+        "movq %%mm0, (%2) \n\t"
+        "movq %%mm1, (%2, %3) \n\t"
+        "add %%"REG_a", %1 \n\t"
+        "add %%"REG_a", %2 \n\t"
+        "movq (%2), %%mm0 \n\t"
+        "movq (%2, %3), %%mm1 \n\t"
+        PAVGB" (%1), %%mm0 \n\t"
+        PAVGB" (%1, %3), %%mm1 \n\t"
+        "add %%"REG_a", %1 \n\t"
+        "movq %%mm0, (%2) \n\t"
+        "movq %%mm1, (%2, %3) \n\t"
+        "add %%"REG_a", %2 \n\t"
+        "subl $4, %0 \n\t"
+        "jnz 1b \n\t"
+        :"+g"(h), "+S"(pixels), "+D"(block)
+        :"r" ((x86_reg)line_size)
+        :"%"REG_a, "memory");
+}
diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/h264dsp_mmx.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/libavcodec/x86/h264dsp_mmx.c Tue Sep 25 15:55:33 2012 +0200
@@ -0,0 +1,1741 @@
+/*
+ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "dsputil_mmx.h" + +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; + +/***********************************/ +/* IDCT */ + +#define SUMSUB_BADC( a, b, c, d ) \ + "paddw "#b", "#a" \n\t"\ + "paddw "#d", "#c" \n\t"\ + "paddw "#b", "#b" \n\t"\ + "paddw "#d", "#d" \n\t"\ + "psubw "#a", "#b" \n\t"\ + "psubw "#c", "#d" \n\t" + +#define SUMSUBD2_AB( a, b, t ) \ + "movq "#b", "#t" \n\t"\ + "psraw $1 , "#b" \n\t"\ + "paddw "#a", "#b" \n\t"\ + "psraw $1 , "#a" \n\t"\ + "psubw "#t", "#a" \n\t" + +#define IDCT4_1D( s02, s13, d02, d13, t ) \ + SUMSUB_BA ( s02, d02 )\ + SUMSUBD2_AB( s13, d13, t )\ + SUMSUB_BADC( d13, s02, s13, d02 ) + +#define STORE_DIFF_4P( p, t, z ) \ + "psraw $6, "#p" \n\t"\ + "movd (%0), "#t" \n\t"\ + "punpcklbw "#z", "#t" \n\t"\ + "paddsw "#t", "#p" \n\t"\ + "packuswb "#z", "#p" \n\t"\ + "movd "#p", (%0) \n\t" + +static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) +{ + /* Load dct coeffs */ + __asm__ volatile( + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm1 \n\t" + "movq 16(%0), %%mm2 \n\t" + "movq 24(%0), %%mm3 \n\t" + :: "r"(block) ); + + __asm__ volatile( + /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ + IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) + + "movq %0, %%mm6 \n\t" + /* in: 1,4,0,2 out: 1,2,3,0 */ + TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) + + "paddw %%mm6, %%mm3 \n\t" + + /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ + IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) + + "pxor %%mm7, %%mm7 \n\t" + :: "m"(ff_pw_32)); + + __asm__ volatile( + STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) + "add %1, %0 \n\t" + STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) + "add %1, %0 \n\t" + 
STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) + "add %1, %0 \n\t" + STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) + : "+r"(dst) + : "r" ((x86_reg)stride) + ); +} + +static inline void h264_idct8_1d(int16_t *block) +{ + __asm__ volatile( + "movq 112(%0), %%mm7 \n\t" + "movq 80(%0), %%mm0 \n\t" + "movq 48(%0), %%mm3 \n\t" + "movq 16(%0), %%mm5 \n\t" + + "movq %%mm0, %%mm4 \n\t" + "movq %%mm5, %%mm1 \n\t" + "psraw $1, %%mm4 \n\t" + "psraw $1, %%mm1 \n\t" + "paddw %%mm0, %%mm4 \n\t" + "paddw %%mm5, %%mm1 \n\t" + "paddw %%mm7, %%mm4 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "psubw %%mm5, %%mm4 \n\t" + "paddw %%mm3, %%mm1 \n\t" + + "psubw %%mm3, %%mm5 \n\t" + "psubw %%mm3, %%mm0 \n\t" + "paddw %%mm7, %%mm5 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "psraw $1, %%mm3 \n\t" + "psraw $1, %%mm7 \n\t" + "psubw %%mm3, %%mm5 \n\t" + "psubw %%mm7, %%mm0 \n\t" + + "movq %%mm4, %%mm3 \n\t" + "movq %%mm1, %%mm7 \n\t" + "psraw $2, %%mm1 \n\t" + "psraw $2, %%mm3 \n\t" + "paddw %%mm5, %%mm3 \n\t" + "psraw $2, %%mm5 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "psraw $2, %%mm0 \n\t" + "psubw %%mm4, %%mm5 \n\t" + "psubw %%mm0, %%mm7 \n\t" + + "movq 32(%0), %%mm2 \n\t" + "movq 96(%0), %%mm6 \n\t" + "movq %%mm2, %%mm4 \n\t" + "movq %%mm6, %%mm0 \n\t" + "psraw $1, %%mm4 \n\t" + "psraw $1, %%mm6 \n\t" + "psubw %%mm0, %%mm4 \n\t" + "paddw %%mm2, %%mm6 \n\t" + + "movq (%0), %%mm2 \n\t" + "movq 64(%0), %%mm0 \n\t" + SUMSUB_BA( %%mm0, %%mm2 ) + SUMSUB_BA( %%mm6, %%mm0 ) + SUMSUB_BA( %%mm4, %%mm2 ) + SUMSUB_BA( %%mm7, %%mm6 ) + SUMSUB_BA( %%mm5, %%mm4 ) + SUMSUB_BA( %%mm3, %%mm2 ) + SUMSUB_BA( %%mm1, %%mm0 ) + :: "r"(block) + ); +} + +static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) +{ + int i; + DECLARE_ALIGNED(8, int16_t, b2)[64]; + + block[0] += 32; + + for(i=0; i<2; i++){ + DECLARE_ALIGNED(8, uint64_t, tmp); + + h264_idct8_1d(block+4*i); + + __asm__ volatile( + "movq %%mm7, %0 \n\t" + TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) + "movq %%mm0, 8(%1) \n\t" + "movq %%mm6, 24(%1) \n\t" + "movq %%mm7, 40(%1) 
\n\t" + "movq %%mm4, 56(%1) \n\t" + "movq %0, %%mm7 \n\t" + TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) + "movq %%mm7, (%1) \n\t" + "movq %%mm1, 16(%1) \n\t" + "movq %%mm0, 32(%1) \n\t" + "movq %%mm3, 48(%1) \n\t" + : "=m"(tmp) + : "r"(b2+32*i) + : "memory" + ); + } + + for(i=0; i<2; i++){ + h264_idct8_1d(b2+4*i); + + __asm__ volatile( + "psraw $6, %%mm7 \n\t" + "psraw $6, %%mm6 \n\t" + "psraw $6, %%mm5 \n\t" + "psraw $6, %%mm4 \n\t" + "psraw $6, %%mm3 \n\t" + "psraw $6, %%mm2 \n\t" + "psraw $6, %%mm1 \n\t" + "psraw $6, %%mm0 \n\t" + + "movq %%mm7, (%0) \n\t" + "movq %%mm5, 16(%0) \n\t" + "movq %%mm3, 32(%0) \n\t" + "movq %%mm1, 48(%0) \n\t" + "movq %%mm0, 64(%0) \n\t" + "movq %%mm2, 80(%0) \n\t" + "movq %%mm4, 96(%0) \n\t" + "movq %%mm6, 112(%0) \n\t" + :: "r"(b2+4*i) + : "memory" + ); + } + + add_pixels_clamped_mmx(b2, dst, stride); +} + +#define STORE_DIFF_8P( p, d, t, z )\ + "movq "#d", "#t" \n"\ + "psraw $6, "#p" \n"\ + "punpcklbw "#z", "#t" \n"\ + "paddsw "#t", "#p" \n"\ + "packuswb "#p", "#p" \n"\ + "movq "#p", "#d" \n" + +#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\ + "movdqa "#c", "#a" \n"\ + "movdqa "#g", "#e" \n"\ + "psraw $1, "#c" \n"\ + "psraw $1, "#g" \n"\ + "psubw "#e", "#c" \n"\ + "paddw "#a", "#g" \n"\ + "movdqa "#b", "#e" \n"\ + "psraw $1, "#e" \n"\ + "paddw "#b", "#e" \n"\ + "paddw "#d", "#e" \n"\ + "paddw "#f", "#e" \n"\ + "movdqa "#f", "#a" \n"\ + "psraw $1, "#a" \n"\ + "paddw "#f", "#a" \n"\ + "paddw "#h", "#a" \n"\ + "psubw "#b", "#a" \n"\ + "psubw "#d", "#b" \n"\ + "psubw "#d", "#f" \n"\ + "paddw "#h", "#b" \n"\ + "psubw "#h", "#f" \n"\ + "psraw $1, "#d" \n"\ + "psraw $1, "#h" \n"\ + "psubw "#d", "#b" \n"\ + "psubw "#h", "#f" \n"\ + "movdqa "#e", "#d" \n"\ + "movdqa "#a", "#h" \n"\ + "psraw $2, "#d" \n"\ + "psraw $2, "#h" \n"\ + "paddw "#f", "#d" \n"\ + "paddw "#b", "#h" \n"\ + "psraw $2, "#f" \n"\ + "psraw $2, "#b" \n"\ + "psubw "#f", "#e" \n"\ + "psubw "#a", "#b" \n"\ + "movdqa 0x00(%1), "#a" \n"\ + "movdqa 0x40(%1), "#f" \n"\ + 
SUMSUB_BA(f, a)\ + SUMSUB_BA(g, f)\ + SUMSUB_BA(c, a)\ + SUMSUB_BA(e, g)\ + SUMSUB_BA(b, c)\ + SUMSUB_BA(h, a)\ + SUMSUB_BA(d, f) + +static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) +{ + __asm__ volatile( + "movdqa 0x10(%1), %%xmm1 \n" + "movdqa 0x20(%1), %%xmm2 \n" + "movdqa 0x30(%1), %%xmm3 \n" + "movdqa 0x50(%1), %%xmm5 \n" + "movdqa 0x60(%1), %%xmm6 \n" + "movdqa 0x70(%1), %%xmm7 \n" + H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) + TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1)) + "paddw %4, %%xmm4 \n" + "movdqa %%xmm4, 0x00(%1) \n" + "movdqa %%xmm2, 0x40(%1) \n" + H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1) + "movdqa %%xmm6, 0x60(%1) \n" + "movdqa %%xmm7, 0x70(%1) \n" + "pxor %%xmm7, %%xmm7 \n" + STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7) + STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7) + STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7) + STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7) + "lea (%0,%2,4), %0 \n" + STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7) + STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7) + "movdqa 0x60(%1), %%xmm0 \n" + "movdqa 0x70(%1), %%xmm1 \n" + STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7) + STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7) + :"+r"(dst) + :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32) + ); +} + +static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + __asm__ volatile( + "movd %0, %%mm0 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + __asm__ volatile( + "movd %0, %%mm2 \n\t" + "movd %1, %%mm3 \n\t" + "movd %2, %%mm4 \n\t" + "movd %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb 
%%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movd %%mm2, %0 \n\t" + "movd %%mm3, %1 \n\t" + "movd %%mm4, %2 \n\t" + "movd %%mm5, %3 \n\t" + :"+m"(*(uint32_t*)(dst+0*stride)), + "+m"(*(uint32_t*)(dst+1*stride)), + "+m"(*(uint32_t*)(dst+2*stride)), + "+m"(*(uint32_t*)(dst+3*stride)) + ); +} + +static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + int y; + __asm__ volatile( + "movd %0, %%mm0 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + for(y=2; y--; dst += 4*stride){ + __asm__ volatile( + "movq %0, %%mm2 \n\t" + "movq %1, %%mm3 \n\t" + "movq %2, %%mm4 \n\t" + "movq %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movq %%mm2, %0 \n\t" + "movq %%mm3, %1 \n\t" + "movq %%mm4, %2 \n\t" + "movq %%mm5, %3 \n\t" + :"+m"(*(uint64_t*)(dst+0*stride)), + "+m"(*(uint64_t*)(dst+1*stride)), + "+m"(*(uint64_t*)(dst+2*stride)), + "+m"(*(uint64_t*)(dst+3*stride)) + ); + } +} + +//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split +static const uint8_t scan8[16 + 2*4]={ + 4+1*8, 5+1*8, 4+2*8, 5+2*8, + 6+1*8, 7+1*8, 6+2*8, 7+2*8, + 4+3*8, 5+3*8, 4+4*8, 5+4*8, + 6+3*8, 7+3*8, 6+4*8, 7+4*8, + 1+1*8, 2+1*8, + 1+2*8, 2+2*8, + 1+4*8, 2+4*8, + 1+5*8, 2+5*8, +}; + +static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i++){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); + else 
ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); + } + } +} + +static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i++){ + if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); + else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); + } +} + +static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i+=4){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); + else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride); + } + } +} + +static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i+=4){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); + else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride); + } + } +} + +static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=16; i<16+8; i++){ + if(nnzc[ scan8[i] ]) + ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); + else if(block[i*16]) + ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); + } +} + +/***********************************/ +/* deblocking */ + +// out: o = |x-y|>a +// clobbers: t +#define DIFF_GT_MMX(x,y,a,o,t)\ + "movq "#y", "#t" \n\t"\ + "movq "#x", "#o" \n\t"\ + "psubusb "#x", "#t" \n\t"\ + "psubusb "#y", "#o" \n\t"\ + "por "#t", "#o" \n\t"\ + "psubusb "#a", "#o" \n\t" + +// out: o = |x-y|>a +// clobbers: t +#define DIFF_GT2_MMX(x,y,a,o,t)\ + "movq "#y", "#t" 
\n\t"\ + "movq "#x", "#o" \n\t"\ + "psubusb "#x", "#t" \n\t"\ + "psubusb "#y", "#o" \n\t"\ + "psubusb "#a", "#t" \n\t"\ + "psubusb "#a", "#o" \n\t"\ + "pcmpeqb "#t", "#o" \n\t"\ + +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 +// out: mm5=beta-1, mm7=mask +// clobbers: mm4,mm6 +#define H264_DEBLOCK_MASK(alpha1, beta1) \ + "pshufw $0, "#alpha1", %%mm4 \n\t"\ + "pshufw $0, "#beta1 ", %%mm5 \n\t"\ + "packuswb %%mm4, %%mm4 \n\t"\ + "packuswb %%mm5, %%mm5 \n\t"\ + DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\ + DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\ + "por %%mm4, %%mm7 \n\t"\ + DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\ + "por %%mm4, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "pcmpeqb %%mm6, %%mm7 \n\t" + +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) +// out: mm1=p0' mm2=q0' +// clobbers: mm0,3-6 +#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ + "movq %%mm1 , %%mm5 \n\t"\ + "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\ + "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\ + "pcmpeqb %%mm4 , %%mm4 \n\t"\ + "pxor %%mm4 , %%mm3 \n\t"\ + "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\ + "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ + "pxor %%mm1 , %%mm4 \n\t"\ + "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\ + "pavgb %%mm5 , %%mm3 \n\t"\ + "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\ + "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\ + "psubusb %%mm3 , %%mm6 \n\t"\ + "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\ + "pminub %%mm7 , %%mm6 \n\t"\ + "pminub %%mm7 , %%mm3 \n\t"\ + "psubusb %%mm6 , %%mm1 \n\t"\ + "psubusb %%mm3 , %%mm2 \n\t"\ + "paddusb %%mm3 , %%mm1 \n\t"\ + "paddusb %%mm6 , %%mm2 \n\t" + +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone +// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) +// clobbers: q2, tmp, tc0 +#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ + "movq %%mm1, "#tmp" \n\t"\ + "pavgb %%mm2, "#tmp" \n\t"\ + 
"pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ + "pxor "q2addr", "#tmp" \n\t"\ + "pand %9, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ + "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ + "movq "#p1", "#tmp" \n\t"\ + "psubusb "#tc0", "#tmp" \n\t"\ + "paddusb "#p1", "#tc0" \n\t"\ + "pmaxub "#tmp", "#q2" \n\t"\ + "pminub "#tc0", "#q2" \n\t"\ + "movq "#q2", "q1addr" \n\t" + +static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) +{ + DECLARE_ALIGNED(8, uint64_t, tmp0)[2]; + + __asm__ volatile( + "movq (%2,%4), %%mm0 \n\t" //p1 + "movq (%2,%4,2), %%mm1 \n\t" //p0 + "movq (%3), %%mm2 \n\t" //q0 + "movq (%3,%4), %%mm3 \n\t" //q1 + H264_DEBLOCK_MASK(%7, %8) + + "movd %6, %%mm4 \n\t" + "punpcklbw %%mm4, %%mm4 \n\t" + "punpcklwd %%mm4, %%mm4 \n\t" + "pcmpeqb %%mm3, %%mm3 \n\t" + "movq %%mm4, %%mm6 \n\t" + "pcmpgtb %%mm3, %%mm4 \n\t" + "movq %%mm6, %1 \n\t" + "pand %%mm4, %%mm7 \n\t" + "movq %%mm7, %0 \n\t" + + /* filter p1 */ + "movq (%2), %%mm3 \n\t" //p2 + DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 + "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|beta-1 + "pand %0, %%mm6 \n\t" + "movq %1, %%mm5 \n\t" // can be merged with the and below but is slower then + "pand %%mm6, %%mm5 \n\t" + "psubb %%mm6, %%mm7 \n\t" + "movq (%3,%4), %%mm3 \n\t" + H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6) + + /* filter p0, q0 */ + H264_DEBLOCK_P0_Q0(%9, unused) + "movq %%mm1, (%2,%4,2) \n\t" + "movq %%mm2, (%3) \n\t" + + : "=m"(tmp0[0]), "=m"(tmp0[1]) + : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), + "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), + "m"(ff_bone) + ); +} + +static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + if((tc0[0] & tc0[1]) >= 0) + h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); + if((tc0[2] & tc0[3]) >= 0) + h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); 
+} +static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + //FIXME: could cut some load/stores by merging transpose with filter + // also, it only needs to transpose 6x8 + DECLARE_ALIGNED(8, uint8_t, trans)[8*8]; + int i; + for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { + if((tc0[0] & tc0[1]) < 0) + continue; + transpose4x4(trans, pix-4, 8, stride); + transpose4x4(trans +4*8, pix, 8, stride); + transpose4x4(trans+4, pix-4+4*stride, 8, stride); + transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); + h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); + transpose4x4(pix-2, trans +2*8, stride, 8); + transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8); + } +} + +static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) +{ + __asm__ volatile( + "movq (%0), %%mm0 \n\t" //p1 + "movq (%0,%2), %%mm1 \n\t" //p0 + "movq (%1), %%mm2 \n\t" //q0 + "movq (%1,%2), %%mm3 \n\t" //q1 + H264_DEBLOCK_MASK(%4, %5) + "movd %3, %%mm6 \n\t" + "punpcklbw %%mm6, %%mm6 \n\t" + "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask + H264_DEBLOCK_P0_Q0(%6, %7) + "movq %%mm1, (%0,%2) \n\t" + "movq %%mm2, (%1) \n\t" + + :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), + "r"(*(uint32_t*)tc0), + "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F) + ); +} + +static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); +} + +static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + //FIXME: could cut some load/stores by merging transpose with filter + DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; + transpose4x4(trans, pix-2, 8, stride); + transpose4x4(trans+4, pix-2+4*stride, 8, stride); + h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); + transpose4x4(pix-2, trans, stride, 8); + transpose4x4(pix-2+4*stride, trans+4, 
stride, 8); +} + +// p0 = (p0 + q1 + 2*p1 + 2) >> 2 +#define H264_FILTER_CHROMA4(p0, p1, q1, one) \ + "movq "#p0", %%mm4 \n\t"\ + "pxor "#q1", %%mm4 \n\t"\ + "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\ + "pavgb "#q1", "#p0" \n\t"\ + "psubusb %%mm4, "#p0" \n\t"\ + "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\ + +static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) +{ + __asm__ volatile( + "movq (%0), %%mm0 \n\t" + "movq (%0,%2), %%mm1 \n\t" + "movq (%1), %%mm2 \n\t" + "movq (%1,%2), %%mm3 \n\t" + H264_DEBLOCK_MASK(%3, %4) + "movq %%mm1, %%mm5 \n\t" + "movq %%mm2, %%mm6 \n\t" + H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0' + H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0' + "psubb %%mm5, %%mm1 \n\t" + "psubb %%mm6, %%mm2 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm2 \n\t" + "paddb %%mm5, %%mm1 \n\t" + "paddb %%mm6, %%mm2 \n\t" + "movq %%mm1, (%0,%2) \n\t" + "movq %%mm2, (%1) \n\t" + :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), + "m"(alpha1), "m"(beta1), "m"(ff_bone) + ); +} + +static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) +{ + h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); +} + +static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) +{ + //FIXME: could cut some load/stores by merging transpose with filter + DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; + transpose4x4(trans, pix-2, 8, stride); + transpose4x4(trans+4, pix-2+4*stride, 8, stride); + h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); + transpose4x4(pix-2, trans, stride, 8); + transpose4x4(pix-2+4*stride, trans+4, stride, 8); +} + +static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], + int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { + int dir; + __asm__ volatile( + "movq %0, %%mm7 \n" + 
"movq %1, %%mm6 \n" + ::"m"(ff_pb_1), "m"(ff_pb_3) + ); + if(field) + __asm__ volatile( + "movq %0, %%mm6 \n" + ::"m"(ff_pb_3_1) + ); + __asm__ volatile( + "movq %%mm6, %%mm5 \n" + "paddb %%mm5, %%mm5 \n" + :); + + // could do a special case for dir==0 && edges==1, but it only reduces the + // average filter time by 1.2% + for( dir=1; dir>=0; dir-- ) { + const x86_reg d_idx = dir ? -8 : -1; + const int mask_mv = dir ? mask_mv1 : mask_mv0; + DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; + int b_idx, edge; + for( b_idx=12, edge=0; edge= limit + "psubusb %%mm5, %%mm3 \n" + "packsswb %%mm3, %%mm1 \n" + "add $40, %0 \n" + "cmp $40, %0 \n" + "jl 1b \n" + "sub $80, %0 \n" + "pshufw $0x4E, %%mm1, %%mm1 \n" + "por %%mm1, %%mm0 \n" + "pshufw $0x4E, %%mm0, %%mm1 \n" + "pminub %%mm1, %%mm0 \n" + ::"r"(d_idx), + "r"(ref[0]+b_idx), + "r"(mv[0]+b_idx) + ); + } else { + __asm__ volatile( + "movd (%1), %%mm0 \n" + "psubb (%1,%0), %%mm0 \n" // ref[b] != ref[bn] + "movq (%2), %%mm1 \n" + "movq 8(%2), %%mm2 \n" + "psubw (%2,%0,4), %%mm1 \n" + "psubw 8(%2,%0,4), %%mm2 \n" + "packsswb %%mm2, %%mm1 \n" + "paddb %%mm6, %%mm1 \n" + "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit + "packsswb %%mm1, %%mm1 \n" + "por %%mm1, %%mm0 \n" + ::"r"(d_idx), + "r"(ref[0]+b_idx), + "r"(mv[0]+b_idx) + ); + } + } + __asm__ volatile( + "movd %0, %%mm1 \n" + "por %1, %%mm1 \n" // nnz[b] || nnz[bn] + ::"m"(nnz[b_idx]), + "m"(nnz[b_idx+d_idx]) + ); + __asm__ volatile( + "pminub %%mm7, %%mm1 \n" + "pminub %%mm7, %%mm0 \n" + "psllw $1, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pmaxub %%mm0, %%mm1 \n" + "punpcklbw %%mm2, %%mm1 \n" + "movq %%mm1, %0 \n" + :"=m"(*bS[dir][edge]) + ::"memory" + ); + } + edges = 4; + step = 1; + } + __asm__ volatile( + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm1 \n\t" + "movq 16(%0), %%mm2 \n\t" + "movq 24(%0), %%mm3 \n\t" + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) + "movq %%mm0, (%0) \n\t" + "movq %%mm3, 8(%0) \n\t" + "movq %%mm4, 
16(%0) \n\t" + "movq %%mm2, 24(%0) \n\t" + ::"r"(bS[0]) + :"memory" + ); +} + +/***********************************/ +/* motion compensation */ + +#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ + "mov"#q" "#C", "#T" \n\t"\ + "mov"#d" (%0), "#F" \n\t"\ + "paddw "#D", "#T" \n\t"\ + "psllw $2, "#T" \n\t"\ + "psubw "#B", "#T" \n\t"\ + "psubw "#E", "#T" \n\t"\ + "punpcklbw "#Z", "#F" \n\t"\ + "pmullw %4, "#T" \n\t"\ + "paddw %5, "#A" \n\t"\ + "add %2, %0 \n\t"\ + "paddw "#F", "#A" \n\t"\ + "paddw "#A", "#T" \n\t"\ + "psraw $5, "#T" \n\t"\ + "packuswb "#T", "#T" \n\t"\ + OP(T, (%1), A, d)\ + "add %3, %1 \n\t" + +#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ + "mov"#q" "#C", "#T" \n\t"\ + "mov"#d" (%0), "#F" \n\t"\ + "paddw "#D", "#T" \n\t"\ + "psllw $2, "#T" \n\t"\ + "paddw %4, "#A" \n\t"\ + "psubw "#B", "#T" \n\t"\ + "psubw "#E", "#T" \n\t"\ + "punpcklbw "#Z", "#F" \n\t"\ + "pmullw %3, "#T" \n\t"\ + "paddw "#F", "#A" \n\t"\ + "add %2, %0 \n\t"\ + "paddw "#A", "#T" \n\t"\ + "mov"#q" "#T", "#OF"(%1) \n\t" + +#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) +#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) +#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) +#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) + + +#define QPEL_H264(OPNAME, OP, MMX)\ +\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %0, %%mm6 \n\t"\ + :: "m"(ff_pw_5)\ + );\ + do{\ + __asm__ volatile(\ + "movq (%0), %%mm0 \n\t"\ + "movq 1(%0), %%mm2 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + 
"psllw $2, %%mm0 \n\t"\ + "psllw $2, %%mm1 \n\t"\ + "movq -1(%0), %%mm2 \n\t"\ + "movq 2(%0), %%mm4 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "movq %%mm4, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm5, %%mm1 \n\t"\ + "pmullw %%mm6, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm1 \n\t"\ + "movd -2(%0), %%mm2 \n\t"\ + "movd 7(%0), %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "movq %5, %%mm5 \n\t"\ + "paddw %%mm5, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm4, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "movq (%2), %%mm4 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + PAVGB" %%mm4, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, q)\ + "add %4, %0 \n\t"\ + "add %4, %1 \n\t"\ + "add %3, %2 \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_16)\ + : "memory"\ + );\ + }while(--h);\ +}\ +\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ + int w = size>>4;\ + do{\ + int h = size;\ + __asm__ volatile(\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 8(%0), %%mm3 \n\t"\ + "movq 2(%0), %%mm1 \n\t"\ + "movq 10(%0), %%mm4 \n\t"\ + "paddw %%mm4, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "paddw 18(%0), %%mm3 \n\t"\ + "paddw 16(%0), %%mm4 \n\t"\ + "movq 4(%0), %%mm2 \n\t"\ + "movq 12(%0), %%mm5 \n\t"\ + "paddw 6(%0), %%mm2 \n\t"\ + "paddw 14(%0), %%mm5 \n\t"\ + "psubw %%mm1, %%mm0 \n\t"\ + "psubw %%mm4, %%mm3 \n\t"\ + "psraw $2, %%mm0 \n\t"\ + "psraw $2, %%mm3 \n\t"\ + "psubw %%mm1, %%mm0 \n\t"\ + "psubw %%mm4, %%mm3 \n\t"\ + "paddsw %%mm2, %%mm0 \n\t"\ + "paddsw %%mm5, %%mm3 \n\t"\ + "psraw $2, %%mm0 
\n\t"\ + "psraw $2, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm5, %%mm3 \n\t"\ + "psraw $6, %%mm0 \n\t"\ + "psraw $6, %%mm3 \n\t"\ + "packuswb %%mm3, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm7, q)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + tmp += 8 - size*24;\ + dst += 8 - size*dstStride;\ + }while(w--);\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + src += 8*dstStride;\ + dst += 8*dstStride;\ + src2 += 8*src2Stride;\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ +}\ +static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +{\ + do{\ + __asm__ volatile(\ + "movq (%1), %%mm0 \n\t"\ + "movq 8(%1), %%mm1 \n\t"\ + "movq 48(%1), %%mm2 \n\t"\ + "movq 8+48(%1), %%mm3 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "psraw $5, %%mm2 \n\t"\ + "psraw $5, %%mm3 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + "packuswb %%mm3, %%mm2 \n\t"\ + PAVGB" (%0), %%mm0 \n\t"\ + PAVGB" (%0,%3), %%mm2 \n\t"\ + OP(%%mm0, (%2), %%mm5, q)\ + OP(%%mm2, (%2,%4), %%mm5, q)\ + ::"a"(src8), "c"(src16), "d"(dst),\ + "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ + :"memory");\ + src8 += 2L*src8Stride;\ + src16 += 48;\ + dst += 2L*dstStride;\ + }while(h-=2);\ +}\ +static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +{\ + OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, 
src8Stride, h);\ + OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ +}\ + + +#if ARCH_X86_64 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=16;\ + __asm__ volatile(\ + "pxor %%xmm15, %%xmm15 \n\t"\ + "movdqa %6, %%xmm14 \n\t"\ + "movdqa %7, %%xmm13 \n\t"\ + "1: \n\t"\ + "lddqu 6(%0), %%xmm1 \n\t"\ + "lddqu -2(%0), %%xmm7 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm15, %%xmm1 \n\t"\ + "punpcklbw %%xmm15, %%xmm0 \n\t"\ + "punpcklbw %%xmm15, %%xmm7 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm0, %%xmm6 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm0, %%xmm8 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm0, %%xmm9 \n\t"\ + "movdqa %%xmm0, %%xmm12 \n\t"\ + "movdqa %%xmm1, %%xmm11 \n\t"\ + "palignr $10,%%xmm0, %%xmm11\n\t"\ + "palignr $10,%%xmm7, %%xmm12\n\t"\ + "palignr $2, %%xmm0, %%xmm4 \n\t"\ + "palignr $2, %%xmm7, %%xmm9 \n\t"\ + "palignr $4, %%xmm0, %%xmm3 \n\t"\ + "palignr $4, %%xmm7, %%xmm8 \n\t"\ + "palignr $6, %%xmm0, %%xmm2 \n\t"\ + "palignr $6, %%xmm7, %%xmm6 \n\t"\ + "paddw %%xmm0 ,%%xmm11 \n\t"\ + "palignr $8, %%xmm0, %%xmm1 \n\t"\ + "palignr $8, %%xmm7, %%xmm0 \n\t"\ + "paddw %%xmm12,%%xmm7 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm8, %%xmm6 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm9, %%xmm0 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "psllw $2, %%xmm6 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "psubw %%xmm0, %%xmm6 \n\t"\ + "paddw %%xmm13,%%xmm11 \n\t"\ + "paddw %%xmm13,%%xmm7 \n\t"\ + "pmullw %%xmm14,%%xmm2 \n\t"\ + "pmullw %%xmm14,%%xmm6 \n\t"\ + "lddqu (%2), %%xmm3 \n\t"\ + "paddw %%xmm11,%%xmm2 \n\t"\ + "paddw %%xmm7, %%xmm6 \n\t"\ + "psraw $5, %%xmm2 \n\t"\ + "psraw $5, %%xmm6 \n\t"\ + "packuswb %%xmm2,%%xmm6 \n\t"\ + "pavgb %%xmm3, %%xmm6 \n\t"\ + OP(%%xmm6, (%1), %%xmm4, dqa)\ + "add %5, %0 \n\t"\ + "add %5, %1 
\n\t"\ + "add %4, %2 \n\t"\ + "decl %3 \n\t"\ + "jg 1b \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +} +#else // ARCH_X86_64 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + src += 8*dstStride;\ + dst += 8*dstStride;\ + src2 += 8*src2Stride;\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ +} +#endif // ARCH_X86_64 + +#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movdqa %0, %%xmm6 \n\t"\ + :: "m"(ff_pw_5)\ + );\ + do{\ + __asm__ volatile(\ + "lddqu -2(%0), %%xmm1 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $2, %%xmm0, %%xmm4 \n\t"\ + "palignr $4, %%xmm0, %%xmm3 \n\t"\ + "palignr $6, %%xmm0, %%xmm2 \n\t"\ + "palignr $8, %%xmm0, %%xmm1 \n\t"\ + "palignr $10,%%xmm0, %%xmm5 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "movq (%2), %%xmm3 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "paddw %5, %%xmm0 \n\t"\ + "pmullw %%xmm6, %%xmm2 \n\t"\ + "paddw %%xmm0, %%xmm2 \n\t"\ + "psraw $5, %%xmm2 \n\t"\ + "packuswb %%xmm2, %%xmm2 \n\t"\ + 
"pavgb %%xmm3, %%xmm2 \n\t"\ + OP(%%xmm2, (%1), %%xmm4, q)\ + "add %4, %0 \n\t"\ + "add %4, %1 \n\t"\ + "add %3, %2 \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_16)\ + : "memory"\ + );\ + }while(--h);\ +}\ +QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movdqa %5, %%xmm6 \n\t"\ + "1: \n\t"\ + "lddqu -2(%0), %%xmm1 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $2, %%xmm0, %%xmm4 \n\t"\ + "palignr $4, %%xmm0, %%xmm3 \n\t"\ + "palignr $6, %%xmm0, %%xmm2 \n\t"\ + "palignr $8, %%xmm0, %%xmm1 \n\t"\ + "palignr $10,%%xmm0, %%xmm5 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "paddw %6, %%xmm0 \n\t"\ + "pmullw %%xmm6, %%xmm2 \n\t"\ + "paddw %%xmm0, %%xmm2 \n\t"\ + "psraw $5, %%xmm2 \n\t"\ + "packuswb %%xmm2, %%xmm2 \n\t"\ + OP(%%xmm2, (%1), %%xmm4, q)\ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+g"(h)\ + : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +}\ +static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + src += 8*srcStride;\ + dst += 8*dstStride;\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, 
dstStride, srcStride);\ +}\ + +#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + src -= 2*srcStride;\ + \ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movq (%0), %%xmm0 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm1 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm2 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm3 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm2 \n\t"\ + "punpcklbw %%xmm7, %%xmm3 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ + QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + if(h==16){\ + __asm__ volatile(\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ + QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + \ + 
: "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + }\ +}\ +static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +} + +static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ + int w = (size+8)>>3; + src -= 2*srcStride+2; + while(w--){ + __asm__ volatile( + "pxor %%xmm7, %%xmm7 \n\t" + "movq (%0), %%xmm0 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm1 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm2 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm3 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm4 \n\t" + "add %2, %0 \n\t" + "punpcklbw %%xmm7, %%xmm0 \n\t" + "punpcklbw %%xmm7, %%xmm1 \n\t" + "punpcklbw %%xmm7, %%xmm2 \n\t" + "punpcklbw %%xmm7, %%xmm3 \n\t" + "punpcklbw %%xmm7, %%xmm4 \n\t" + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) + QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) + QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) + : "+a"(src) + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) + : "memory" + ); + if(size==16){ + __asm__ volatile( 
+ QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) + QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) + QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) + : "+a"(src) + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) + : "memory" + ); + } + tmp += 8; + src += 8 - (size+5)*srcStride; + } +} + +#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ + int h = size;\ + if(size == 16){\ + __asm__ volatile(\ + "1: \n\t"\ + "movdqa 32(%0), %%xmm4 \n\t"\ + "movdqa 16(%0), %%xmm5 \n\t"\ + "movdqa (%0), %%xmm7 \n\t"\ + "movdqa %%xmm4, %%xmm3 \n\t"\ + "movdqa %%xmm4, %%xmm2 \n\t"\ + "movdqa %%xmm4, %%xmm1 \n\t"\ + "movdqa %%xmm4, %%xmm0 \n\t"\ + "palignr $10, %%xmm5, %%xmm0 \n\t"\ + "palignr $8, %%xmm5, %%xmm1 \n\t"\ + "palignr $6, %%xmm5, %%xmm2 \n\t"\ + "palignr $4, %%xmm5, %%xmm3 \n\t"\ + "palignr $2, %%xmm5, %%xmm4 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "movdqa %%xmm5, %%xmm6 \n\t"\ + "movdqa %%xmm5, %%xmm4 \n\t"\ + "movdqa %%xmm5, %%xmm3 \n\t"\ + "palignr $8, %%xmm7, %%xmm4 \n\t"\ + "palignr $2, %%xmm7, %%xmm6 \n\t"\ + "palignr $10, %%xmm7, %%xmm3 \n\t"\ + "paddw %%xmm6, %%xmm4 \n\t"\ + "movdqa %%xmm5, %%xmm6 \n\t"\ + "palignr $6, %%xmm7, %%xmm5 \n\t"\ + "palignr $4, %%xmm7, %%xmm6 \n\t"\ + "paddw %%xmm7, %%xmm3 \n\t"\ + "paddw %%xmm6, %%xmm5 \n\t"\ + \ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psubw %%xmm4, %%xmm3 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ 
+ "psraw $2, %%xmm3 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psubw %%xmm4, %%xmm3 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "paddw %%xmm5, %%xmm3 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "psraw $2, %%xmm3 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "paddw %%xmm5, %%xmm3 \n\t"\ + "psraw $6, %%xmm0 \n\t"\ + "psraw $6, %%xmm3 \n\t"\ + "packuswb %%xmm0, %%xmm3 \n\t"\ + OP(%%xmm3, (%1), %%xmm7, dqa)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + }else{\ + __asm__ volatile(\ + "1: \n\t"\ + "movdqa 16(%0), %%xmm1 \n\t"\ + "movdqa (%0), %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $10, %%xmm0, %%xmm5 \n\t"\ + "palignr $8, %%xmm0, %%xmm4 \n\t"\ + "palignr $6, %%xmm0, %%xmm3 \n\t"\ + "palignr $4, %%xmm0, %%xmm2 \n\t"\ + "palignr $2, %%xmm0, %%xmm1 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "psraw $6, %%xmm0 \n\t"\ + "packuswb %%xmm0, %%xmm0 \n\t"\ + OP(%%xmm0, (%1), %%xmm7, q)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + }\ +} + +#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ + put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ + OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ +}\ +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, 
int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ +}\ +static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ +}\ + +#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 +#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 +#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 +#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 +#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 +#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 +#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 +#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 + +#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 +#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 +#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 +#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2 +#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 +#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 +#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 +#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 + +#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 +#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 +#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2 +#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 + +#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 +#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 +#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 +#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 + +#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 +#define 
avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 + +#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ +H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ + +// static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ +// put_pixels16_sse2(dst, src, stride, 16); +// } +// static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ +// avg_pixels16_sse2(dst, src, stride, 16); +// } +#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 +#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 + +#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ +}\ + +#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ +}\ + +#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE 
## _v_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ +}\ + +#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ +}\ +\ +static 
void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ +}\ + +#define H264_MC_4816(MMX)\ +H264_MC(put_, 4, MMX, 8)\ 
+H264_MC(put_, 8, MMX, 8)\ +H264_MC(put_, 16,MMX, 8)\ +H264_MC(avg_, 4, MMX, 8)\ +H264_MC(avg_, 8, MMX, 8)\ +H264_MC(avg_, 16,MMX, 8)\ + +#define H264_MC_816(QPEL, XMM)\ +QPEL(put_, 8, XMM, 16)\ +QPEL(put_, 16,XMM, 16)\ +QPEL(avg_, 8, XMM, 16)\ +QPEL(avg_, 16,XMM, 16)\ + + +#define AVG_3DNOW_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgusb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" +#define AVG_MMX2_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" + +///this does not get detected correctly, uncomment on AMD machine +#ifdef HAVE_AMD3DNOW +#define PAVGB "pavgusb" +//QPEL_H264(put_, PUT_OP, 3dnow) +//QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) +#undef PAVGB +#endif + +#define PAVGB "pavgb" +QPEL_H264(put_, PUT_OP, mmx2) +QPEL_H264(avg_, AVG_MMX2_OP, mmx2) +QPEL_H264_V_XMM(put_, PUT_OP, sse2) +QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) +QPEL_H264_HV_XMM(put_, PUT_OP, sse2) +QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) +#if HAVE_SSSE3 +QPEL_H264_H_XMM(put_, PUT_OP, ssse3) +QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) +QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) +QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) +QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) +QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) +#endif +#undef PAVGB + +H264_MC_816(H264_MC_V, sse2) +H264_MC_816(H264_MC_HV, sse2) +#if HAVE_SSSE3 +H264_MC_816(H264_MC_H, ssse3) +H264_MC_816(H264_MC_HV, ssse3) +#endif + +/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */ +DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = { + 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL +}; + +#if HAVE_SSSE3 +#define AVG_OP(X) +#undef H264_CHROMA_MC8_TMPL +#undef H264_CHROMA_MC4_TMPL +#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3 +#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3 +#define H264_CHROMA_MC8_MV0 put_pixels8_mmx +#include "dsputil_h264_template_ssse3.c" 
+static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); +} + +#undef AVG_OP +#undef H264_CHROMA_MC8_TMPL +#undef H264_CHROMA_MC4_TMPL +#undef H264_CHROMA_MC8_MV0 +#define AVG_OP(X) X +#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3 +#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3 +#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 +#include "dsputil_h264_template_ssse3.c" +static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); +} +#undef AVG_OP +#undef H264_CHROMA_MC8_TMPL +#undef H264_CHROMA_MC4_TMPL +#undef H264_CHROMA_MC8_MV0 +#endif + +/***********************************/ +/* weighted prediction */ + +static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) +{ + int x, y; + offset <<= log2_denom; + offset += (1 << log2_denom) >> 1; + __asm__ volatile( + "movd %0, %%mm4 \n\t" + "movd %1, %%mm5 \n\t" + "movd %2, %%mm6 \n\t" + "pshufw $0, %%mm4, %%mm4 \n\t" + "pshufw $0, %%mm5, %%mm5 \n\t" + "pxor %%mm7, %%mm7 \n\t" + :: "g"(weight), "g"(offset), "g"(log2_denom) + ); + for(y=0; y et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_MATHOPS_H +#define AVCODEC_X86_MATHOPS_H + +#include "config.h" +#include "libavutil/common.h" + +#if ARCH_X86_32 +#define MULL(ra, rb, shift) \ + ({ int rt, dummy; __asm__ (\ + "imull %3 \n\t"\ + "shrdl %4, %%edx, %%eax \n\t"\ + : "=a"(rt), "=d"(dummy)\ + : "a" ((int)ra), "rm" ((int)rb), "i"(shift));\ + rt; }) + +#define MULH(ra, rb) \ + ({ int rt, dummy;\ + __asm__ ("imull %3\n\t" : "=d"(rt), "=a"(dummy): "a" ((int)ra), "rm" ((int)rb));\ + rt; }) + +#define MUL64(ra, rb) \ + ({ int64_t rt;\ + __asm__ ("imull %2\n\t" : "=A"(rt) : "a" ((int)ra), "g" ((int)rb));\ + rt; }) +#endif + +// avoid +32 for shift optimization (gcc should do that ...) +#define NEG_SSR32 NEG_SSR32 +static inline int32_t NEG_SSR32( int32_t a, int8_t s){ + __asm__ ("sarl %1, %0\n\t" + : "+r" (a) + : "ic" ((uint8_t)(-s)) + ); + return a; +} + +#define NEG_USR32 NEG_USR32 +static inline uint32_t NEG_USR32(uint32_t a, int8_t s){ + __asm__ ("shrl %1, %0\n\t" + : "+r" (a) + : "ic" ((uint8_t)(-s)) + ); + return a; +} + +#endif /* AVCODEC_X86_MATHOPS_H */ diff -r 11d15c47beaf -r 897f711a7157 libavcodec/x86/mmx.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavcodec/x86/mmx.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,267 @@ +/* + * mmx.h + * Copyright (C) 1997-2001 H. Dietz and R. Fisher + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef AVCODEC_X86_MMX_H +#define AVCODEC_X86_MMX_H + +#warning Everything in this header is deprecated, use plain __asm__()! New code using this header will be rejected. + + +#define mmx_i2r(op,imm,reg) \ + __asm__ volatile (#op " %0, %%" #reg \ + : /* nothing */ \ + : "i" (imm) ) + +#define mmx_m2r(op,mem,reg) \ + __asm__ volatile (#op " %0, %%" #reg \ + : /* nothing */ \ + : "m" (mem)) + +#define mmx_r2m(op,reg,mem) \ + __asm__ volatile (#op " %%" #reg ", %0" \ + : "=m" (mem) \ + : /* nothing */ ) + +#define mmx_r2r(op,regs,regd) \ + __asm__ volatile (#op " %" #regs ", %" #regd) + + +#define emms() __asm__ volatile ("emms") + +#define movd_m2r(var,reg) mmx_m2r (movd, var, reg) +#define movd_r2m(reg,var) mmx_r2m (movd, reg, var) +#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd) + +#define movq_m2r(var,reg) mmx_m2r (movq, var, reg) +#define movq_r2m(reg,var) mmx_r2m (movq, reg, var) +#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd) + +#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg) +#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd) +#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg) +#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd) + +#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg) +#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd) + +#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg) +#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd) +#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg) +#define 
paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd) +#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg) +#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd) + +#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg) +#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd) +#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg) +#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd) + +#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg) +#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd) +#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg) +#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd) + +#define pand_m2r(var,reg) mmx_m2r (pand, var, reg) +#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd) + +#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg) +#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd) + +#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg) +#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd) +#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg) +#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd) +#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg) +#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd) + +#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg) +#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd) +#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg) +#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd) +#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg) +#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd) + +#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg) +#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd) + +#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg) +#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd) + +#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg) +#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd) + +#define por_m2r(var,reg) mmx_m2r (por, var, reg) 
+#define por_r2r(regs,regd) mmx_r2r (por, regs, regd) + +#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg) +#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg) +#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd) +#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg) +#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg) +#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd) +#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg) +#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg) +#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd) + +#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg) +#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg) +#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd) +#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg) +#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg) +#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd) + +#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg) +#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg) +#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd) +#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg) +#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg) +#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd) +#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg) +#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg) +#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd) + +#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg) +#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd) +#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg) +#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd) +#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg) +#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd) + +#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg) +#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd) +#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg) +#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd) + +#define psubusb_m2r(var,reg) mmx_m2r 
(psubusb, var, reg) +#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd) +#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg) +#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd) + +#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg) +#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd) +#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg) +#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd) +#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg) +#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd) + +#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg) +#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd) +#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg) +#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd) +#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg) +#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd) + +#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg) +#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd) + + +/* 3DNOW extensions */ + +#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg) +#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd) + + +/* AMD MMX extensions - also available in intel SSE */ + + +#define mmx_m2ri(op,mem,reg,imm) \ + __asm__ volatile (#op " %1, %0, %%" #reg \ + : /* nothing */ \ + : "m" (mem), "i" (imm)) +#define mmx_r2ri(op,regs,regd,imm) \ + __asm__ volatile (#op " %0, %%" #regs ", %%" #regd \ + : /* nothing */ \ + : "i" (imm) ) + +#define mmx_fetch(mem,hint) \ + __asm__ volatile ("prefetch" #hint " %0" \ + : /* nothing */ \ + : "m" (mem)) + + +#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg) + +#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var) + +#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg) +#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd) +#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg) +#define pavgw_r2r(regs,regd) 
mmx_r2r (pavgw, regs, regd) + +#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm) + +#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm) + +#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg) +#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd) + +#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg) +#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd) + +#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg) +#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd) + +#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg) +#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd) + +#define pmovmskb(mmreg,reg) \ + __asm__ volatile ("movmskps %" #mmreg ", %" #reg) + +#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg) +#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd) + +#define prefetcht0(mem) mmx_fetch (mem, t0) +#define prefetcht1(mem) mmx_fetch (mem, t1) +#define prefetcht2(mem) mmx_fetch (mem, t2) +#define prefetchnta(mem) mmx_fetch (mem, nta) + +#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg) +#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd) + +#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm) +#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm) + +#define sfence() __asm__ volatile ("sfence\n\t") + +/* SSE2 */ +#define pshufhw_m2r(var,reg,imm) mmx_m2ri(pshufhw, var, reg, imm) +#define pshufhw_r2r(regs,regd,imm) mmx_r2ri(pshufhw, regs, regd, imm) +#define pshuflw_m2r(var,reg,imm) mmx_m2ri(pshuflw, var, reg, imm) +#define pshuflw_r2r(regs,regd,imm) mmx_r2ri(pshuflw, regs, regd, imm) + +#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm) + +#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg) +#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var) +#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd) +#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg) +#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var) +#define 
movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd) + +#define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var) + +#define pslldq_i2r(imm,reg) mmx_i2r (pslldq, imm, reg) +#define psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg) + +#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd) +#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd) + + +#endif /* AVCODEC_X86_MMX_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/arm/bswap.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/arm/bswap.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,72 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_ARM_BSWAP_H +#define AVUTIL_ARM_BSWAP_H + +#include <stdint.h> +#include "config.h" +#include "libavutil/attributes.h" + +#ifdef __ARMCC_VERSION /* this branch uses the __asm { } block form and the __rev() intrinsic */ + +#if HAVE_ARMV6 +#define bswap_16 bswap_16 +static av_always_inline av_const unsigned bswap_16(unsigned x) +{ + __asm { rev16 x, x } + return x; +} + +#define bswap_32 bswap_32 +static av_always_inline av_const uint32_t bswap_32(uint32_t x) +{ + return __rev(x); +} +#endif /* HAVE_ARMV6 */ + +#elif HAVE_INLINE_ASM /* GNU-style inline assembly */ + +#if HAVE_ARMV6 +#define bswap_16 bswap_16 +static av_always_inline av_const unsigned bswap_16(unsigned x) +{ + __asm__("rev16 %0, %0" : "+r"(x)); + return x; +} +#endif /* HAVE_ARMV6 */ + +#define bswap_32 bswap_32 +static av_always_inline av_const uint32_t bswap_32(uint32_t x) +{ +#if HAVE_ARMV6 + __asm__("rev %0, %0" : "+r"(x)); +#else /* !HAVE_ARMV6: byte swap built from eor/bic/ror with scratch register t */ + uint32_t t; + __asm__ ("eor %1, %0, %0, ror #16 \n\t" + "bic %1, %1, #0xFF0000 \n\t" + "mov %0, %0, ror #8 \n\t" + "eor %0, %0, %1, lsr #8 \n\t" + : "+r"(x), "=&r"(t)); +#endif /* HAVE_ARMV6 */ + return x; +} + +#endif /* __ARMCC_VERSION */ + +#endif /* AVUTIL_ARM_BSWAP_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/arm/intreadwrite.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/arm/intreadwrite.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,78 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_ARM_INTREADWRITE_H +#define AVUTIL_ARM_INTREADWRITE_H + +#include <stdint.h> +#include "config.h" + +#if HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM + +#define AV_RN16 AV_RN16 /* self-define so intreadwrite.h sees the symbol and skips its generic fallback */ +static av_always_inline uint16_t AV_RN16(const void *p) +{ + uint16_t v; + __asm__ ("ldrh %0, %1" : "=r"(v) : "m"(*(const uint16_t *)p)); + return v; +} + +#define AV_WN16 AV_WN16 +static av_always_inline void AV_WN16(void *p, uint16_t v) +{ + __asm__ ("strh %1, %0" : "=m"(*(uint16_t *)p) : "r"(v)); +} + +#define AV_RN32 AV_RN32 +static av_always_inline uint32_t AV_RN32(const void *p) +{ + uint32_t v; + __asm__ ("ldr %0, %1" : "=r"(v) : "m"(*(const uint32_t *)p)); + return v; +} + +#define AV_WN32 AV_WN32 +static av_always_inline void AV_WN32(void *p, uint32_t v) +{ + __asm__ ("str %1, %0" : "=m"(*(uint32_t *)p) : "r"(v)); +} + +#define AV_RN64 AV_RN64 +static av_always_inline uint64_t AV_RN64(const void *p) +{ + union { uint64_t v; uint32_t hl[2]; } v; /* two 32-bit loads fill the halves of one 64-bit value */ + __asm__ ("ldr %0, %2 \n\t" + "ldr %1, %3 \n\t" + : "=&r"(v.hl[0]), "=r"(v.hl[1]) /* '&' = early-clobber: %0 is written before the second load reads its address */ + : "m"(*(const uint32_t*)p), "m"(*((const uint32_t*)p+1))); + return v.v; +} + +#define AV_WN64 AV_WN64 +static av_always_inline void AV_WN64(void *p, uint64_t v) +{ + union { uint64_t v; uint32_t hl[2]; } vv = { v }; + __asm__ ("str %2, %0 \n\t" + "str %3, %1 \n\t" + : "=m"(*(uint32_t*)p), "=m"(*((uint32_t*)p+1)) + : "r"(vv.hl[0]), "r"(vv.hl[1])); +} + +#endif /* HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM */ + +#endif /* AVUTIL_ARM_INTREADWRITE_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/arm/timer.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/arm/timer.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of
FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_ARM_TIMER_H +#define AVUTIL_ARM_TIMER_H + +#include +#include "config.h" + +#if HAVE_INLINE_ASM && defined(__ARM_ARCH_7A__) + +#define AV_READ_TIME read_time + +static inline uint64_t read_time(void) +{ + unsigned cc; + __asm__ volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r"(cc)); + return cc; +} + +#endif /* HAVE_INLINE_ASM && __ARM_ARCH_7A__ */ + +#endif /* AVUTIL_ARM_TIMER_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/attributes.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/attributes.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,113 @@ +/* + * copyright (c) 2006 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Macro definitions for various function/variable attributes + */ + +#ifndef AVUTIL_ATTRIBUTES_H +#define AVUTIL_ATTRIBUTES_H + +#ifdef __GNUC__ +# define AV_GCC_VERSION_AT_LEAST(x,y) (__GNUC__ > x || __GNUC__ == x && __GNUC_MINOR__ >= y) +#else +# define AV_GCC_VERSION_AT_LEAST(x,y) 0 +#endif + +#ifndef av_always_inline +#if AV_GCC_VERSION_AT_LEAST(3,1) +# define av_always_inline __attribute__((always_inline)) inline +#else +# define av_always_inline inline +#endif +#endif + +#ifndef av_noinline +#if AV_GCC_VERSION_AT_LEAST(3,1) +# define av_noinline __attribute__((noinline)) +#else +# define av_noinline +#endif +#endif + +#ifndef av_pure +#if AV_GCC_VERSION_AT_LEAST(3,1) +# define av_pure __attribute__((pure)) +#else +# define av_pure +#endif +#endif + +#ifndef av_const +#if AV_GCC_VERSION_AT_LEAST(2,6) +# define av_const __attribute__((const)) +#else +# define av_const +#endif +#endif + +#ifndef av_cold +#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,3) +# define av_cold __attribute__((cold)) +#else +# define av_cold +#endif +#endif + +#ifndef av_flatten +#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,1) +# define av_flatten __attribute__((flatten)) +#else +# define av_flatten +#endif +#endif + +#ifndef attribute_deprecated +#if AV_GCC_VERSION_AT_LEAST(3,1) +# define attribute_deprecated __attribute__((deprecated)) +#else +# define attribute_deprecated +#endif +#endif + +#ifndef av_unused +#if defined(__GNUC__) +# define av_unused __attribute__((unused)) +#else +# define av_unused +#endif +#endif + +#ifndef av_uninit +#if defined(__GNUC__) && !defined(__ICC) +# define av_uninit(x) x=x +#else +# define av_uninit(x) x +#endif +#endif + +#ifdef __GNUC__ +# define av_builtin_constant_p 
__builtin_constant_p +#else +# define av_builtin_constant_p(x) 0 +#endif + +#endif /* AVUTIL_ATTRIBUTES_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/bswap.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/bswap.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,95 @@ +/* + * copyright (c) 2006 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * byte swapping routines + */ + +#ifndef AVUTIL_BSWAP_H +#define AVUTIL_BSWAP_H + +#include +#include "config.h" +#include "attributes.h" + +#if ARCH_ARM +# include "arm/bswap.h" +#elif ARCH_X86 +# include "x86/bswap.h" +#endif + +#ifndef bswap_16 +static av_always_inline av_const uint16_t bswap_16(uint16_t x) +{ + x= (x>>8) | (x<<8); + return x; +} +#endif + +#ifndef bswap_32 +static av_always_inline av_const uint32_t bswap_32(uint32_t x) +{ + x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF); + x= (x>>16) | (x<<16); + return x; +} +#endif + +#ifndef bswap_64 +static inline uint64_t av_const bswap_64(uint64_t x) +{ +#if 0 + x= ((x<< 8)&0xFF00FF00FF00FF00ULL) | ((x>> 8)&0x00FF00FF00FF00FFULL); + x= ((x<<16)&0xFFFF0000FFFF0000ULL) | ((x>>16)&0x0000FFFF0000FFFFULL); + return (x>>32) | (x<<32); +#else + union { + uint64_t ll; + uint32_t l[2]; + } w, r; + w.ll = x; + 
r.l[0] = bswap_32 (w.l[1]); + r.l[1] = bswap_32 (w.l[0]); + return r.ll; +#endif +} +#endif + +// be2me ... big-endian to machine-endian +// le2me ... little-endian to machine-endian + +#if HAVE_BIGENDIAN +#define be2me_16(x) (x) +#define be2me_32(x) (x) +#define be2me_64(x) (x) +#define le2me_16(x) bswap_16(x) +#define le2me_32(x) bswap_32(x) +#define le2me_64(x) bswap_64(x) +#else +#define be2me_16(x) bswap_16(x) +#define be2me_32(x) bswap_32(x) +#define be2me_64(x) bswap_64(x) +#define le2me_16(x) (x) +#define le2me_32(x) (x) +#define le2me_64(x) (x) +#endif + +#endif /* AVUTIL_BSWAP_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/common.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/common.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,298 @@ +/* + * copyright (c) 2006 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * common internal and external API header + */ + +#ifndef AVUTIL_COMMON_H +#define AVUTIL_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include "attributes.h" + +//rounded division & shift +#define RSHIFT(a,b) ((a) > 0 ? ((a) + ((1<<(b))>>1))>>(b) : ((a) + ((1<<(b))>>1)-1)>>(b)) +/* assume b>0 */ +#define ROUNDED_DIV(a,b) (((a)>0 ? 
(a) + ((b)>>1) : (a) - ((b)>>1))/(b)) +#define FFABS(a) ((a) >= 0 ? (a) : (-(a))) +#define FFSIGN(a) ((a) > 0 ? 1 : -1) + +#define FFMAX(a,b) ((a) > (b) ? (a) : (b)) +#define FFMAX3(a,b,c) FFMAX(FFMAX(a,b),c) +#define FFMIN(a,b) ((a) > (b) ? (b) : (a)) +#define FFMIN3(a,b,c) FFMIN(FFMIN(a,b),c) + +#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0) +#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0])) +#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1)) + +/* misc math functions */ +extern const uint8_t ff_log2_tab[256]; + +static inline av_const int av_log2_c(unsigned int v) +{ + int n = 0; + if (v & 0xffff0000) { + v >>= 16; + n += 16; + } + if (v & 0xff00) { + v >>= 8; + n += 8; + } + n += ff_log2_tab[v]; + + return n; +} + +static inline av_const int av_log2_16bit_c(unsigned int v) +{ + int n = 0; + if (v & 0xff00) { + v >>= 8; + n += 8; + } + n += ff_log2_tab[v]; + + return n; +} + +#ifdef HAVE_AV_CONFIG_H +# include "config.h" +#endif + +/** + * Clips a signed integer value into the amin-amax range. + * @param a value to clip + * @param amin minimum value of the clip range + * @param amax maximum value of the clip range + * @return clipped value + */ +static inline av_const int av_clip(int a, int amin, int amax) +{ + if (a < amin) return amin; + else if (a > amax) return amax; + else return a; +} + +/** + * Clips a signed integer value into the 0-255 range. + * @param a value to clip + * @return clipped value + */ +static inline av_const uint8_t av_clip_uint8(int a) +{ + if (a&(~0xFF)) return (-a)>>31; + else return a; +} + +/** + * Clips a signed integer value into the 0-65535 range. + * @param a value to clip + * @return clipped value + */ +static inline av_const uint16_t av_clip_uint16(int a) +{ + if (a&(~0xFFFF)) return (-a)>>31; + else return a; +} + +/** + * Clips a signed integer value into the -32768,32767 range. 
+ * @param a value to clip + * @return clipped value + */ +static inline av_const int16_t av_clip_int16(int a) +{ + if ((a+0x8000) & ~0xFFFF) return (a>>31) ^ 0x7FFF; + else return a; +} + +/** + * Clips a signed 64-bit integer value into the -2147483648,2147483647 range. + * @param a value to clip + * @return clipped value + */ +static inline av_const int32_t av_clipl_int32(int64_t a) +{ + if ((a+0x80000000u) & ~UINT64_C(0xFFFFFFFF)) return (a>>63) ^ 0x7FFFFFFF; + else return a; +} + +/** + * Clips a float value into the amin-amax range. + * @param a value to clip + * @param amin minimum value of the clip range + * @param amax maximum value of the clip range + * @return clipped value + */ +static inline av_const float av_clipf(float a, float amin, float amax) +{ + if (a < amin) return amin; + else if (a > amax) return amax; + else return a; +} + +/** Computes ceil(log2(x)). + * @param x value used to compute ceil(log2(x)) + * @return computed ceiling of log2(x) + */ +static inline av_const int av_ceil_log2(int x) +{ + return av_log2_c((x - 1) << 1); +} + +#define MKTAG(a,b,c,d) ((a) | ((b) << 8) | ((c) << 16) | ((unsigned)(d) << 24)) /* little-endian tag: a is the low byte; args parenthesized, top byte shifted as unsigned to avoid signed overflow */ +#define MKBETAG(a,b,c,d) ((d) | ((c) << 8) | ((b) << 16) | ((unsigned)(a) << 24)) /* big-endian tag: a is the high byte */ + +/*! + * \def GET_UTF8(val, GET_BYTE, ERROR) + * Converts a UTF-8 character (up to 4 bytes long) to its 32-bit UCS-4 encoded form + * \param val is the output and should be of type uint32_t. It holds the converted + * UCS-4 character and should be a left value. + * \param GET_BYTE gets UTF-8 encoded bytes from any proper source. It can be + * a function or a statement whose return value or evaluated value is of type + * uint8_t. It will be executed up to 4 times for values in the valid UTF-8 range, + * and up to 7 times in the general case. + * \param ERROR action that should be taken when an invalid UTF-8 byte is returned + * from GET_BYTE. It should be a statement that jumps out of the macro, + * like exit(), goto, return, break, or continue.
+ */ +#define GET_UTF8(val, GET_BYTE, ERROR)\ + val= GET_BYTE;\ + {\ + int ones= 7 - av_log2(val ^ 255);\ + if(ones==1)\ + ERROR\ + val&= 127>>ones;\ + while(--ones > 0){\ + int tmp= GET_BYTE - 128;\ + if(tmp>>6)\ + ERROR\ + val= (val<<6) + tmp;\ + }\ + } + +/*! + * \def GET_UTF16(val, GET_16BIT, ERROR) + * Converts a UTF-16 character (2 or 4 bytes) to its 32-bit UCS-4 encoded form + * \param val is the output and should be of type uint32_t. It holds the converted + * UCS-4 character and should be an lvalue. + * \param GET_16BIT gets two bytes of UTF-16 encoded data converted to native endianness. + * It can be a function or a statement whose return value or evaluated value is of type + * uint16_t. It will be executed up to 2 times. + * \param ERROR action that should be taken when an invalid UTF-16 surrogate is + * returned from GET_16BIT. It should be a statement that jumps out of the macro, + * like exit(), goto, return, break, or continue. + */ +#define GET_UTF16(val, GET_16BIT, ERROR)\ + val = GET_16BIT;\ + {\ + unsigned int hi = val - 0xD800;\ + if (hi < 0x800) {\ + val = GET_16BIT - 0xDC00;\ + if (val > 0x3FFU || hi > 0x3FFU)\ + ERROR\ + val += (hi<<10) + 0x10000;\ + }\ + }\ + +/*! + * \def PUT_UTF8(val, tmp, PUT_BYTE) + * Converts a 32-bit Unicode character to its UTF-8 encoded form (up to 4 bytes long). + * \param val is an input-only argument and should be of type uint32_t. It holds + * a UCS-4 encoded Unicode character that is to be converted to UTF-8. If + * val is given as a function it is executed only once. + * \param tmp is a temporary variable and should be of type uint8_t. It + * represents an intermediate value during conversion that is to be + * output by PUT_BYTE. + * \param PUT_BYTE writes the converted UTF-8 bytes to any proper destination. + * It could be a function or a statement, and uses tmp as the input byte.
+ * For example, PUT_BYTE could be "*output++ = tmp;" PUT_BYTE will be + * executed up to 4 times for values in the valid UTF-8 range and up to + * 7 times in the general case, depending on the length of the converted + * Unicode character. + */ +#define PUT_UTF8(val, tmp, PUT_BYTE)\ + {\ + int bytes, shift;\ + uint32_t in = val;\ + if (in < 0x80) {\ + tmp = in;\ + PUT_BYTE\ + } else {\ + bytes = (av_log2(in) + 4) / 5;\ + shift = (bytes - 1) * 6;\ + tmp = (256 - (256 >> bytes)) | (in >> shift);\ + PUT_BYTE\ + while (shift >= 6) {\ + shift -= 6;\ + tmp = 0x80 | ((in >> shift) & 0x3f);\ + PUT_BYTE\ + }\ + }\ + } + +/*! + * \def PUT_UTF16(val, tmp, PUT_16BIT) + * Converts a 32-bit Unicode character to its UTF-16 encoded form (2 or 4 bytes). + * \param val is an input-only argument and should be of type uint32_t. It holds + * a UCS-4 encoded Unicode character that is to be converted to UTF-16. If + * val is given as a function it is executed only once. + * \param tmp is a temporary variable and should be of type uint16_t. It + * represents an intermediate value during conversion that is to be + * output by PUT_16BIT. + * \param PUT_16BIT writes the converted UTF-16 data to any proper destination + * in desired endianness. It could be a function or a statement, and uses tmp + * as the input byte. For example, PUT_BYTE could be "*output++ = tmp;" + * PUT_BYTE will be executed 1 or 2 times depending on input character. 
+ */ +#define PUT_UTF16(val, tmp, PUT_16BIT)\ + {\ + uint32_t in = val;\ + if (in < 0x10000) {\ + tmp = in;\ + PUT_16BIT\ + } else {\ + tmp = 0xD800 | ((in - 0x10000) >> 10);\ + PUT_16BIT\ + tmp = 0xDC00 | ((in - 0x10000) & 0x3FF);\ + PUT_16BIT\ + }\ + }\ + + + +#include "mem.h" + +#ifdef HAVE_AV_CONFIG_H +# include "internal.h" +#endif /* HAVE_AV_CONFIG_H */ + +#endif /* AVUTIL_COMMON_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/error.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/error.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,53 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * error code definitions + */ + +#ifndef AVUTIL_ERROR_H +#define AVUTIL_ERROR_H + +#include +#include "common.h" + +/* error handling */ +#if EDOM > 0 +#define AVERROR(e) (-(e)) ///< Returns a negative error code from a POSIX error code, to return from library functions. +#define AVUNERROR(e) (-(e)) ///< Returns a POSIX error code from a library function error return value. +#else +/* Some platforms have E* and errno already negated. */ +#define AVERROR(e) (e) +#define AVUNERROR(e) (e) +#endif + +#define AVERROR_EOF AVERROR(EPIPE) ///< End of file + + +/** + * Puts a description of the AVERROR code errnum in errbuf. 
+ * In case of failure the global variable errno is set to indicate the + * error. + * + * @param errbuf_size the size in bytes of errbuf + * @return 0 on success, a negative value otherwise + */ +int av_strerror(int errnum, char *errbuf, size_t errbuf_size); + +#endif /* AVUTIL_ERROR_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/internal.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/internal.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,168 @@ +/* + * copyright (c) 2006 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * common internal API header + */ + +#ifndef AVUTIL_INTERNAL_H +#define AVUTIL_INTERNAL_H + +#if !defined(DEBUG) && !defined(NDEBUG) +# define NDEBUG +#endif + +#include +#include +#include +#include +#include "config.h" +#include "attributes.h" +#include "timer.h" + + + +#ifndef INT16_MIN +#define INT16_MIN (-0x7fff - 1) +#endif + +#ifndef INT16_MAX +#define INT16_MAX 0x7fff +#endif + +#ifndef INT32_MIN +#define INT32_MIN (-0x7fffffff - 1) +#endif + +#ifndef INT32_MAX +#define INT32_MAX 0x7fffffff +#endif + +#ifndef UINT32_MAX +#define UINT32_MAX 0xffffffff +#endif + +#ifndef INT64_MIN +#define INT64_MIN (-0x7fffffffffffffffLL - 1) +#endif + +#ifndef INT64_MAX +#define INT64_MAX INT64_C(9223372036854775807) +#endif + +#ifndef UINT64_MAX +#define UINT64_MAX UINT64_C(0xFFFFFFFFFFFFFFFF) +#endif + +#ifndef INT_BIT +# define INT_BIT (CHAR_BIT * sizeof(int)) +#endif + +#ifndef offsetof +# define offsetof(T, F) ((unsigned int)((char *)&((T *)0)->F)) +#endif + +/* Use to export labels from asm. */ +#define LABEL_MANGLE(a) #a +#define LOCAL_MANGLE(a) #a +#define MANGLE(a) #a + +// Use rip-relative addressing if compiling PIC code on x86-64. +// #if ARCH_X86_64 && defined(PIC) +// # define LOCAL_MANGLE(a) #a "(%%rip)" +// #else +// # define LOCAL_MANGLE(a) #a +// #endif +// +// #define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a) + +/* debug stuff */ + +/* dprintf macros */ +#ifdef DEBUG +# define dprintf(pctx, ...) av_log(pctx, AV_LOG_DEBUG, __VA_ARGS__) +#else +# define dprintf(pctx, ...) 
+#endif + +#define av_abort() do { av_log(NULL, AV_LOG_ERROR, "Abort at %s:%d\n", __FILE__, __LINE__); abort(); } while (0) + +/* math */ + + +/* avoid usage of dangerous/inappropriate system functions */ +// #undef malloc +// #define malloc please_use_av_malloc +// #undef free +// #define free please_use_av_free +#undef realloc +#define realloc please_use_av_realloc +#undef time +#define time time_is_forbidden_due_to_security_issues +#undef rand +#define rand rand_is_forbidden_due_to_state_trashing_use_av_lfg_get +#undef srand +#define srand srand_is_forbidden_due_to_state_trashing_use_av_lfg_init +#undef random +#define random random_is_forbidden_due_to_state_trashing_use_av_lfg_get +#undef sprintf +#define sprintf sprintf_is_forbidden_due_to_security_issues_use_snprintf +//#undef exit +//#define exit exit_is_forbidden +#ifndef LIBAVFORMAT_BUILD + +#undef puts +#define puts please_use_av_log_instead_of_puts +#undef perror +#define perror please_use_av_log_instead_of_perror +#endif + +#define FF_ALLOC_OR_GOTO(p, size, label)\ +{\ + p = av_malloc(size);\ + if (p == NULL && (size) != 0) {\ + av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\ + goto label;\ + }\ +} + +#define FF_ALLOCZ_OR_GOTO(p, size, label)\ +{\ + p = av_mallocz(size);\ + if (p == NULL && (size) != 0) {\ + av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\ + goto label;\ + }\ +} + + +/** + * Returns NULL if CONFIG_SMALL is true, otherwise the argument + * without modification. Used to disable the definition of strings + * (for example AVCodec long_names). + */ +#if CONFIG_SMALL +# define NULL_IF_CONFIG_SMALL(x) NULL +#else +# define NULL_IF_CONFIG_SMALL(x) x +#endif + +#endif /* AVUTIL_INTERNAL_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/intreadwrite.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/intreadwrite.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,498 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_INTREADWRITE_H +#define AVUTIL_INTREADWRITE_H + +#include +#include "config.h" +#include "bswap.h" +#include "common.h" + +typedef union { + uint64_t u64; + uint32_t u32[2]; + uint16_t u16[4]; + uint8_t u8 [8]; + double f64; + float f32[2]; +} __attribute__((__may_alias__)) av_alias64; + +typedef union { + uint32_t u32; + uint16_t u16[2]; + uint8_t u8 [4]; + float f32; +} __attribute__((__may_alias__)) av_alias32; + +typedef union { + uint16_t u16; + uint8_t u8 [2]; +} __attribute__((__may_alias__)) av_alias16 ; + +/* + * Arch-specific headers can provide any combination of + * AV_[RW][BLN](16|24|32|64) and AV_(COPY|SWAP|ZERO)(64|128) macros. + * Preprocessor symbols must be defined, even if these are implemented + * as inline functions. + */ + +#if ARCH_ARM +# include "arm/intreadwrite.h" +#elif ARCH_PPC +# include "ppc/intreadwrite.h" +#elif ARCH_X86 +# include "x86/intreadwrite.h" +#endif + +/* + * Map AV_RNXX <-> AV_R[BL]XX for all variants provided by per-arch headers. 
+ */ + +#if HAVE_BIGENDIAN + +# if defined(AV_RN16) && !defined(AV_RB16) +# define AV_RB16(p) AV_RN16(p) +# elif !defined(AV_RN16) && defined(AV_RB16) +# define AV_RN16(p) AV_RB16(p) +# endif + +# if defined(AV_WN16) && !defined(AV_WB16) +# define AV_WB16(p, v) AV_WN16(p, v) +# elif !defined(AV_WN16) && defined(AV_WB16) +# define AV_WN16(p, v) AV_WB16(p, v) +# endif + +# if defined(AV_RN24) && !defined(AV_RB24) +# define AV_RB24(p) AV_RN24(p) +# elif !defined(AV_RN24) && defined(AV_RB24) +# define AV_RN24(p) AV_RB24(p) +# endif + +# if defined(AV_WN24) && !defined(AV_WB24) +# define AV_WB24(p, v) AV_WN24(p, v) +# elif !defined(AV_WN24) && defined(AV_WB24) +# define AV_WN24(p, v) AV_WB24(p, v) +# endif + +# if defined(AV_RN32) && !defined(AV_RB32) +# define AV_RB32(p) AV_RN32(p) +# elif !defined(AV_RN32) && defined(AV_RB32) +# define AV_RN32(p) AV_RB32(p) +# endif + +# if defined(AV_WN32) && !defined(AV_WB32) +# define AV_WB32(p, v) AV_WN32(p, v) +# elif !defined(AV_WN32) && defined(AV_WB32) +# define AV_WN32(p, v) AV_WB32(p, v) +# endif + +# if defined(AV_RN64) && !defined(AV_RB64) +# define AV_RB64(p) AV_RN64(p) +# elif !defined(AV_RN64) && defined(AV_RB64) +# define AV_RN64(p) AV_RB64(p) +# endif + +# if defined(AV_WN64) && !defined(AV_WB64) +# define AV_WB64(p, v) AV_WN64(p, v) +# elif !defined(AV_WN64) && defined(AV_WB64) +# define AV_WN64(p, v) AV_WB64(p, v) +# endif + +#else /* HAVE_BIGENDIAN */ + +# if defined(AV_RN16) && !defined(AV_RL16) +# define AV_RL16(p) AV_RN16(p) +# elif !defined(AV_RN16) && defined(AV_RL16) +# define AV_RN16(p) AV_RL16(p) +# endif + +# if defined(AV_WN16) && !defined(AV_WL16) +# define AV_WL16(p, v) AV_WN16(p, v) +# elif !defined(AV_WN16) && defined(AV_WL16) +# define AV_WN16(p, v) AV_WL16(p, v) +# endif + +# if defined(AV_RN24) && !defined(AV_RL24) +# define AV_RL24(p) AV_RN24(p) +# elif !defined(AV_RN24) && defined(AV_RL24) +# define AV_RN24(p) AV_RL24(p) +# endif + +# if defined(AV_WN24) && !defined(AV_WL24) +# define AV_WL24(p, 
v) AV_WN24(p, v) +# elif !defined(AV_WN24) && defined(AV_WL24) +# define AV_WN24(p, v) AV_WL24(p, v) +# endif + +# if defined(AV_RN32) && !defined(AV_RL32) +# define AV_RL32(p) AV_RN32(p) +# elif !defined(AV_RN32) && defined(AV_RL32) +# define AV_RN32(p) AV_RL32(p) +# endif + +# if defined(AV_WN32) && !defined(AV_WL32) +# define AV_WL32(p, v) AV_WN32(p, v) +# elif !defined(AV_WN32) && defined(AV_WL32) +# define AV_WN32(p, v) AV_WL32(p, v) +# endif + +# if defined(AV_RN64) && !defined(AV_RL64) +# define AV_RL64(p) AV_RN64(p) +# elif !defined(AV_RN64) && defined(AV_RL64) +# define AV_RN64(p) AV_RL64(p) +# endif + +# if defined(AV_WN64) && !defined(AV_WL64) +# define AV_WL64(p, v) AV_WN64(p, v) +# elif !defined(AV_WN64) && defined(AV_WL64) +# define AV_WN64(p, v) AV_WL64(p, v) +# endif + +#endif /* !HAVE_BIGENDIAN */ + +/* + * Define AV_[RW]N helper macros to simplify definitions not provided + * by per-arch headers. + */ + + + +#if defined(__DECC) + +# define AV_RN(s, p) (*((const __unaligned uint##s##_t*)(p))) +# define AV_WN(s, p, v) (*((__unaligned uint##s##_t*)(p)) = (v)) + +#else + +#ifndef AV_RB16 +# define AV_RB16(x) \ + ((((const uint8_t*)(x))[0] << 8) | \ + ((const uint8_t*)(x))[1]) +#endif +#ifndef AV_WB16 +# define AV_WB16(p, d) do { \ + ((uint8_t*)(p))[1] = (d); \ + ((uint8_t*)(p))[0] = (d)>>8; \ + } while(0) +#endif + +#ifndef AV_RL16 +# define AV_RL16(x) \ + ((((const uint8_t*)(x))[1] << 8) | \ + ((const uint8_t*)(x))[0]) +#endif +#ifndef AV_WL16 +# define AV_WL16(p, d) do { \ + ((uint8_t*)(p))[0] = (d); \ + ((uint8_t*)(p))[1] = (d)>>8; \ + } while(0) +#endif + +#ifndef AV_RB32 /* reads 4 bytes big-endian; byte 0 is widened to uint32_t because shifting a promoted int left by 24 overflows for bytes >= 0x80 */ +# define AV_RB32(x) \ + (((uint32_t)((const uint8_t*)(x))[0] << 24) | \ + (((const uint8_t*)(x))[1] << 16) | \ + (((const uint8_t*)(x))[2] << 8) | \ + ((const uint8_t*)(x))[3]) +#endif +#ifndef AV_WB32 +# define AV_WB32(p, d) do { \ + ((uint8_t*)(p))[3] = (d); \ + ((uint8_t*)(p))[2] = (d)>>8; \ + ((uint8_t*)(p))[1] = (d)>>16; \ + ((uint8_t*)(p))[0] = (d)>>24; \ + } while(0) +#endif + 
+#ifndef AV_RL32 /* reads 4 bytes little-endian; byte 3 is widened to uint32_t because shifting a promoted int left by 24 overflows for bytes >= 0x80 */ +# define AV_RL32(x) \ + (((uint32_t)((const uint8_t*)(x))[3] << 24) | \ + (((const uint8_t*)(x))[2] << 16) | \ + (((const uint8_t*)(x))[1] << 8) | \ + ((const uint8_t*)(x))[0]) +#endif +#ifndef AV_WL32 +# define AV_WL32(p, d) do { \ + ((uint8_t*)(p))[0] = (d); \ + ((uint8_t*)(p))[1] = (d)>>8; \ + ((uint8_t*)(p))[2] = (d)>>16; \ + ((uint8_t*)(p))[3] = (d)>>24; \ + } while(0) +#endif + +#ifndef AV_RB64 +# define AV_RB64(x) \ + (((uint64_t)((const uint8_t*)(x))[0] << 56) | \ + ((uint64_t)((const uint8_t*)(x))[1] << 48) | \ + ((uint64_t)((const uint8_t*)(x))[2] << 40) | \ + ((uint64_t)((const uint8_t*)(x))[3] << 32) | \ + ((uint64_t)((const uint8_t*)(x))[4] << 24) | \ + ((uint64_t)((const uint8_t*)(x))[5] << 16) | \ + ((uint64_t)((const uint8_t*)(x))[6] << 8) | \ + (uint64_t)((const uint8_t*)(x))[7]) +#endif +#ifndef AV_WB64 +# define AV_WB64(p, d) do { \ + ((uint8_t*)(p))[7] = (d); \ + ((uint8_t*)(p))[6] = (d)>>8; \ + ((uint8_t*)(p))[5] = (d)>>16; \ + ((uint8_t*)(p))[4] = (d)>>24; \ + ((uint8_t*)(p))[3] = (d)>>32; \ + ((uint8_t*)(p))[2] = (d)>>40; \ + ((uint8_t*)(p))[1] = (d)>>48; \ + ((uint8_t*)(p))[0] = (d)>>56; \ + } while(0) +#endif + +#ifndef AV_RL64 +# define AV_RL64(x) \ + (((uint64_t)((const uint8_t*)(x))[7] << 56) | \ + ((uint64_t)((const uint8_t*)(x))[6] << 48) | \ + ((uint64_t)((const uint8_t*)(x))[5] << 40) | \ + ((uint64_t)((const uint8_t*)(x))[4] << 32) | \ + ((uint64_t)((const uint8_t*)(x))[3] << 24) | \ + ((uint64_t)((const uint8_t*)(x))[2] << 16) | \ + ((uint64_t)((const uint8_t*)(x))[1] << 8) | \ + (uint64_t)((const uint8_t*)(x))[0]) +#endif +#ifndef AV_WL64 +# define AV_WL64(p, d) do { \ + ((uint8_t*)(p))[0] = (d); \ + ((uint8_t*)(p))[1] = (d)>>8; \ + ((uint8_t*)(p))[2] = (d)>>16; \ + ((uint8_t*)(p))[3] = (d)>>24; \ + ((uint8_t*)(p))[4] = (d)>>32; \ + ((uint8_t*)(p))[5] = (d)>>40; \ + ((uint8_t*)(p))[6] = (d)>>48; \ + ((uint8_t*)(p))[7] = (d)>>56; \ + } while(0) +#endif + +#if HAVE_BIGENDIAN +# define AV_RN(s, p) AV_RB##s(p) +# define 
AV_WN(s, p, v) AV_WB##s(p, v) +#else +# define AV_RN(s, p) AV_RL##s(p) +# define AV_WN(s, p, v) AV_WL##s(p, v) +#endif + +#endif /* !defined(__DECC) */ + +#ifndef AV_RN16 +# define AV_RN16(p) AV_RN(16, p) +#endif + +#ifndef AV_RN32 +# define AV_RN32(p) AV_RN(32, p) +#endif + +#ifndef AV_RN64 +# define AV_RN64(p) AV_RN(64, p) +#endif + +#ifndef AV_WN16 +# define AV_WN16(p, v) AV_WN(16, p, v) +#endif + +#ifndef AV_WN32 +# define AV_WN32(p, v) AV_WN(32, p, v) +#endif + +#ifndef AV_WN64 +# define AV_WN64(p, v) AV_WN(64, p, v) +#endif + +#if HAVE_BIGENDIAN +# define AV_RB(s, p) AV_RN##s(p) +# define AV_WB(s, p, v) AV_WN##s(p, v) +# define AV_RL(s, p) bswap_##s(AV_RN##s(p)) +# define AV_WL(s, p, v) AV_WN##s(p, bswap_##s(v)) +#else +# define AV_RB(s, p) bswap_##s(AV_RN##s(p)) +# define AV_WB(s, p, v) AV_WN##s(p, bswap_##s(v)) +# define AV_RL(s, p) AV_RN##s(p) +# define AV_WL(s, p, v) AV_WN##s(p, v) +#endif + +#define AV_RB8(x) (((const uint8_t*)(x))[0]) +#define AV_WB8(p, d) do { ((uint8_t*)(p))[0] = (d); } while(0) + +#define AV_RL8(x) AV_RB8(x) +#define AV_WL8(p, d) AV_WB8(p, d) + +#ifndef AV_RB16 +# define AV_RB16(p) AV_RB(16, p) +#endif +#ifndef AV_WB16 +# define AV_WB16(p, v) AV_WB(16, p, v) +#endif + +#ifndef AV_RL16 +# define AV_RL16(p) AV_RL(16, p) +#endif +#ifndef AV_WL16 +# define AV_WL16(p, v) AV_WL(16, p, v) +#endif + +#ifndef AV_RB32 +# define AV_RB32(p) AV_RB(32, p) +#endif +#ifndef AV_WB32 +# define AV_WB32(p, v) AV_WB(32, p, v) +#endif + +#ifndef AV_RL32 +# define AV_RL32(p) AV_RL(32, p) +#endif +#ifndef AV_WL32 +# define AV_WL32(p, v) AV_WL(32, p, v) +#endif + +#ifndef AV_RB64 +# define AV_RB64(p) AV_RB(64, p) +#endif +#ifndef AV_WB64 +# define AV_WB64(p, v) AV_WB(64, p, v) +#endif + +#ifndef AV_RL64 +# define AV_RL64(p) AV_RL(64, p) +#endif +#ifndef AV_WL64 +# define AV_WL64(p, v) AV_WL(64, p, v) +#endif + +#ifndef AV_RB24 +# define AV_RB24(x) \ + ((((const uint8_t*)(x))[0] << 16) | \ + (((const uint8_t*)(x))[1] << 8) | \ + ((const uint8_t*)(x))[2]) 
+#endif +#ifndef AV_WB24 +# define AV_WB24(p, d) do { \ + ((uint8_t*)(p))[2] = (d); \ + ((uint8_t*)(p))[1] = (d)>>8; \ + ((uint8_t*)(p))[0] = (d)>>16; \ + } while(0) +#endif + +#ifndef AV_RL24 +# define AV_RL24(x) \ + ((((const uint8_t*)(x))[2] << 16) | \ + (((const uint8_t*)(x))[1] << 8) | \ + ((const uint8_t*)(x))[0]) +#endif +#ifndef AV_WL24 +# define AV_WL24(p, d) do { \ + ((uint8_t*)(p))[0] = (d); \ + ((uint8_t*)(p))[1] = (d)>>8; \ + ((uint8_t*)(p))[2] = (d)>>16; \ + } while(0) +#endif + +/* + * The AV_[RW]NA macros access naturally aligned data + * in a type-safe way. + */ + +#define AV_RNA(s, p) (((const av_alias##s*)(p))->u##s) +#define AV_WNA(s, p, v) (((av_alias##s*)(p))->u##s = (v)) + +#ifndef AV_RN16A +# define AV_RN16A(p) AV_RNA(16, p) +#endif + +#ifndef AV_RN32A +# define AV_RN32A(p) AV_RNA(32, p) +#endif + +#ifndef AV_RN64A +# define AV_RN64A(p) AV_RNA(64, p) +#endif + +#ifndef AV_WN16A +# define AV_WN16A(p, v) AV_WNA(16, p, v) +#endif + +#ifndef AV_WN32A +# define AV_WN32A(p, v) AV_WNA(32, p, v) +#endif + +#ifndef AV_WN64A +# define AV_WN64A(p, v) AV_WNA(64, p, v) +#endif + +/* Parameters for AV_COPY*, AV_SWAP*, AV_ZERO* must be + * naturally aligned. They may be implemented using MMX, + * so emms_c() must be called before using any float code + * afterwards. 
+ */ + +#define AV_COPY(n, d, s) \ + (((av_alias##n*)(d))->u##n = ((const av_alias##n*)(s))->u##n) + +#ifndef AV_COPY16 +# define AV_COPY16(d, s) AV_COPY(16, d, s) +#endif + +#ifndef AV_COPY32 +# define AV_COPY32(d, s) AV_COPY(32, d, s) +#endif + +#ifndef AV_COPY64 +# define AV_COPY64(d, s) AV_COPY(64, d, s) +#endif + +#ifndef AV_COPY128 +# define AV_COPY128(d, s) \ + do { \ + AV_COPY64(d, s); \ + AV_COPY64((char*)(d)+8, (char*)(s)+8); \ + } while(0) +#endif + +#define AV_SWAP(n, a, b) FFSWAP(av_alias##n, *(av_alias##n*)(a), *(av_alias##n*)(b)) + +#ifndef AV_SWAP64 +# define AV_SWAP64(a, b) AV_SWAP(64, a, b) +#endif + +#define AV_ZERO(n, d) (((av_alias##n*)(d))->u##n = 0) + +#ifndef AV_ZERO16 +# define AV_ZERO16(d) AV_ZERO(16, d) +#endif + +#ifndef AV_ZERO32 +# define AV_ZERO32(d) AV_ZERO(32, d) +#endif + +#ifndef AV_ZERO64 +# define AV_ZERO64(d) AV_ZERO(64, d) +#endif + +#ifndef AV_ZERO128 +# define AV_ZERO128(d) \ + do { \ + AV_ZERO64(d); \ + AV_ZERO64((char*)(d)+8); \ + } while(0) +#endif + +#endif /* AVUTIL_INTREADWRITE_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/log.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/log.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,111 @@ +/* + * log functions + * Copyright (c) 2003 Michel Bardiaux + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * logging functions + */ +#include "error.h" +#include +#include +#include "log.h" + + +static int av_log_level = AV_LOG_INFO; + +static int use_ansi_color=-1; /* -1: not probed yet; set to 0 or 1 on the first colored_fputs() call */ +/* Write str to stderr, wrapped in ANSI colour escape sequences when use_ansi_color is non-zero. */ +#undef fprintf +static void colored_fputs(int color, const char *str){ + if(use_ansi_color<0){ +#if HAVE_ISATTY && !defined(_WIN32) + use_ansi_color= getenv("TERM") && !getenv("NO_COLOR") && isatty(2); +#else + use_ansi_color= 0; +#endif + } + + if(use_ansi_color){ + fprintf(stderr, "\033[%d;3%dm", color>>4, color&15); + } + fputs(str, stderr); + if(use_ansi_color){ + fprintf(stderr, "\033[0m"); + } +} +/* Default log sink: drops messages above av_log_level, formats into a static 1024-byte buffer, counts instead of reprinting a newline-terminated message identical to the previous one, and writes everything else to stderr. Uses static buffers, so it is not reentrant. */ +void av_log_default_callback(int level, const char* fmt, va_list vl) +{ + static int print_prefix=1; + static int count; + static char line[1024], prev[1024]; + static const uint8_t color[]={0x41,0x41,0x11,0x03,9,9,9}; + + if(level>av_log_level) + return; +#undef fprintf + + line[0]=0; + + vsnprintf(line + strlen(line), sizeof(line) - strlen(line), fmt, vl); + + print_prefix= line[0] && line[strlen(line)-1] == '\n'; /* line[0] guard: an empty message must not index line[-1] */ + if(print_prefix && !strcmp(line, prev)){ + count++; + return; + } + if(count>0){ + fprintf(stderr, " Last message repeated %d times\n", count); + count=0; + } + colored_fputs(color[av_clip(level>>3, 0, 6)], line); + strcpy(prev, line); +} + +static void (*av_log_callback)(int, const char*, va_list) = av_log_default_callback; + +void av_log(int level, const char *fmt, ...) 
+{ + va_list vl; + va_start(vl, fmt); + av_vlog(level, fmt, vl); + va_end(vl); +} + +void av_vlog(int level, const char *fmt, va_list vl) +{ + av_log_callback(level, fmt, vl); +} + +int av_log_get_level(void) +{ + return av_log_level; +} + +void av_log_set_level(int level) +{ + av_log_level = level; +} + +void av_log_set_callback(void (*callback)(int, const char*, va_list)) +{ + av_log_callback = callback; +} diff -r 11d15c47beaf -r 897f711a7157 libavutil/log.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/log.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,120 @@ +/* + * copyright (c) 2006 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_LOG_H +#define AVUTIL_LOG_H + +#include +//#include "avutil.h" + +/** + * Describes the class of an AVClass context structure. That is an + * arbitrary struct of which the first field is a pointer to an + * AVClass struct (e.g. AVCodecContext, AVFormatContext etc.). + */ +typedef struct { + /** + * The name of the class; usually it is the same name as the + * context structure type to which the AVClass is associated. + */ + const char* class_name; + + /** + * A pointer to a function which returns the name of a context + * instance ctx associated with the class. 
+ */ + const char* (*item_name)(void* ctx); + + /** + * a pointer to the first option specified in the class if any or NULL + * + * @see av_set_default_options() + */ + const struct AVOption *option; + + /** + * LIBAVUTIL_VERSION with which this structure was created. + * This is used to allow fields to be added without requiring major + * version bumps everywhere. + */ + + int version; +} AVClass; + +/* av_log API */ + +#define AV_LOG_QUIET -8 + +/** + * Something went really wrong and we will crash now. + */ +#define AV_LOG_PANIC 0 + +/** + * Something went wrong and recovery is not possible. + * For example, no header was found for a format which depends + * on headers or an illegal combination of parameters is used. + */ +#define AV_LOG_FATAL 8 + +/** + * Something went wrong and cannot losslessly be recovered. + * However, not all future data is affected. + */ +#define AV_LOG_ERROR 16 + +/** + * Something somehow does not look correct. This may or may not + * lead to problems. An example would be the use of '-vstrict -2'. + */ +#define AV_LOG_WARNING 24 + +#define AV_LOG_INFO 32 +#define AV_LOG_VERBOSE 40 + +/** + * Stuff which is only useful for libav* developers. + */ +#define AV_LOG_DEBUG 48 + +/** + * Sends the specified message to the log if the level is less than or equal + * to the current av_log_level. By default, all logging messages are sent to + * stderr. This behavior can be altered by installing a different callback + * with av_log_set_callback(). + * + * @note This av_log() takes no AVClass context argument; only the level, the + * format string and its arguments are passed on to the log callback. + * @param level The importance level of the message, lower values signifying + * higher importance. + * @param fmt The format string (printf-compatible) that specifies how + * subsequent arguments are converted to output. 
+ * @see av_vlog + */ + +void av_log(int level, const char *fmt, ...); + +void av_vlog(int level, const char *fmt, va_list); +int av_log_get_level(void); +void av_log_set_level(int); +void av_log_set_callback(void (*)(int, const char*, va_list)); +void av_log_default_callback(int level, const char* fmt, va_list vl); + +#endif /* AVUTIL_LOG_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/mem.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/mem.c Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,127 @@ +/* + * default memory allocator for libavutil + * Copyright (c) 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * default memory allocator for libavutil + */ + +#include "config.h" + +#include +#include +#include +#include +#if HAVE_MALLOC_H +#include +#endif + +#include "mem.h" + +/* here we can use OS-dependent allocation functions */ +#undef free +#undef malloc +#undef realloc + +#ifdef MALLOC_PREFIX + +#define malloc AV_JOIN(MALLOC_PREFIX, malloc) +#define memalign AV_JOIN(MALLOC_PREFIX, memalign) +#define posix_memalign AV_JOIN(MALLOC_PREFIX, posix_memalign) +#define realloc AV_JOIN(MALLOC_PREFIX, realloc) +#define free AV_JOIN(MALLOC_PREFIX, free) + +void *malloc(size_t size); +void *memalign(size_t align, size_t size); +int posix_memalign(void **ptr, size_t align, size_t size); +void *realloc(void *ptr, size_t size); +void free(void *ptr); + +#endif /* MALLOC_PREFIX */ + + +/* You can redefine av_malloc and av_free in your project to use your + memory allocator. You do not need to suppress this file because the + linker will do it automatically. */ + +void *av_malloc(unsigned int size) +{ + void *ptr = NULL; + /* let's disallow possible ambiguous cases */ + if(size > (INT_MAX-16) ) + return NULL; + +//FIXME: when no aligned mallocs vector code should be disabled. 
+#if HAVE_POSIX_MEMALIGN + if (posix_memalign(&ptr,16,size)) + ptr = NULL; +#elif HAVE_MEMALIGN + ptr = memalign(16,size); +#else + ptr = malloc(size); +#endif + return ptr; +} + +void *av_realloc(void *ptr, unsigned int size) +{ + /* let's disallow possible ambiguous cases */ + if(size > (INT_MAX-16) ) + return NULL; + + return realloc(ptr, size); + +} + +void av_free(void *ptr) +{ + /* XXX: this test should not be needed on most libcs */ + if (ptr) + free(ptr); + +} + +void av_freep(void *arg) +{ + void **ptr= (void**)arg; + av_free(*ptr); + *ptr = NULL; +} + +void *av_mallocz(unsigned int size) +{ + void *ptr = av_malloc(size); + if (ptr) + memset(ptr, 0, size); + return ptr; +} +/* Returns an av_malloc()ed copy of s including its terminator, or NULL when s is NULL, when the copy would exceed INT_MAX-16 bytes, or when allocation fails. */ +char *av_strdup(const char *s) +{ + char *ptr= NULL; + if(s){ + size_t len = strlen(s) + 1; /* size_t: an int here could wrap or truncate for very long strings */ + ptr = len > INT_MAX-16 ? NULL : av_malloc(len); /* same limit av_malloc() enforces, checked before len narrows to unsigned int */ + if (ptr) + memcpy(ptr, s, len); + } + return ptr; +} diff -r 11d15c47beaf -r 897f711a7157 libavutil/mem.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/mem.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,143 @@ +/* + * copyright (c) 2006 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * memory handling functions + */ + +#ifndef AVUTIL_MEM_H +#define AVUTIL_MEM_H + +#include "attributes.h" +#include "config.h" + +#define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v +#define DECLARE_ALIGNED_16(t,v) t __attribute__ ((aligned (16))) v +#define DECLARE_ASM_CONST(n,t,v) static const t __attribute__((used)) __attribute__ ((aligned (n))) v + +#if AV_GCC_VERSION_AT_LEAST(3,1) + #define av_malloc_attrib __attribute__((__malloc__)) +#else + #define av_malloc_attrib +#endif + +/** + * Allocates a block of size bytes with alignment suitable for all + * memory accesses (including vectors if available on the CPU). + * @param size Size in bytes for the memory block to be allocated. + * @return Pointer to the allocated block, NULL if the block cannot + * be allocated. + * @see av_mallocz() + */ +void *av_malloc(unsigned int size) av_malloc_attrib; + +/** + * Allocates or reallocates a block of memory. + * If ptr is NULL and size > 0, allocates a new block. If + * size is zero, frees the memory block pointed to by ptr. + * @param size Size in bytes for the memory block to be allocated or + * reallocated. + * @param ptr Pointer to a memory block already allocated with + * av_malloc(z)() or av_realloc() or NULL. + * @return Pointer to a newly reallocated block or NULL if the block + * cannot be reallocated or the function is used to free the memory block. + * @see av_fast_realloc() + */ +void *av_realloc(void *ptr, unsigned int size); + +/** + * Reallocates the given block if it is not large enough, otherwise it + * does nothing. + * + * @see av_realloc + */ +void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size); + +/** + * Allocates a buffer, reusing the given one if large enough. 
+ * + * Contrary to av_fast_realloc the current buffer contents might not be + * preserved and on error the old buffer is freed, thus no special + * handling to avoid memleaks is necessary. + * + * @param ptr pointer to pointer to already allocated buffer, overwritten with pointer to new buffer + * @param size size of the buffer *ptr points to + * @param min_size minimum size of *ptr buffer after returning, *ptr will be NULL and + * *size 0 if an error occurred. + */ +void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size); + +/** + * Frees a memory block which has been allocated with av_malloc(z)() or + * av_realloc(). + * @param ptr Pointer to the memory block which should be freed. + * @note ptr = NULL is explicitly allowed. + * @note It is recommended that you use av_freep() instead. + * @see av_freep() + */ + +void av_free(void *ptr); + +/** + * Allocates a block of size bytes with alignment suitable for all + * memory accesses (including vectors if available on the CPU) and + * zeroes all the bytes of the block. + * @param size Size in bytes for the memory block to be allocated. + * @return Pointer to the allocated block, NULL if it cannot be allocated. + * @see av_malloc() + */ +void *av_mallocz(unsigned int size) av_malloc_attrib; + +/** + * Duplicates the string s. + * @param s string to be duplicated + * @return Pointer to a newly allocated string containing a + * copy of s or NULL if the string cannot be allocated. + */ +char *av_strdup(const char *s) av_malloc_attrib; + +/** + * Frees a memory block which has been allocated with av_malloc(z)() or + * av_realloc() and set the pointer pointing to it to NULL. + * @param ptr Pointer to the pointer to the memory block which should + * be freed. 
+ * @see av_free() + */ +void av_freep(void *ptr); + + +static av_always_inline uint32_t pack16to32(int a, int b){ +#if HAVE_BIGENDIAN + return (b&0xFFFF) + (a<<16); +#else + return (a&0xFFFF) + (b<<16); +#endif +} + +static av_always_inline uint16_t pack8to16(int a, int b){ +#if HAVE_BIGENDIAN + return (b&0xFF) + (a<<8); +#else + return (a&0xFF) + (b<<8); +#endif +} + +#endif /* AVUTIL_MEM_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/pixfmt.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/pixfmt.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,161 @@ +/* + * copyright (c) 2006 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_PIXFMT_H +#define AVUTIL_PIXFMT_H + +/** + * @file + * pixel format definitions + * + * @warning This file has to be considered an internal but installed + * header, so it should not be directly included in your projects. + */ + +/** + * Pixel format. Notes: + * + * PIX_FMT_RGB32 is handled in an endian-specific manner. An RGBA + * color is put together as: + * (A << 24) | (R << 16) | (G << 8) | B + * This is stored as BGRA on little-endian CPU architectures and ARGB on + * big-endian CPUs. 
+ * + * When the pixel format is palettized RGB (PIX_FMT_PAL8), the palettized + * image data is stored in AVFrame.data[0]. The palette is transported in + * AVFrame.data[1], is 1024 bytes long (256 4-byte entries) and is + * formatted the same as in PIX_FMT_RGB32 described above (i.e., it is + * also endian-specific). Note also that the individual RGB palette + * components stored in AVFrame.data[1] should be in the range 0..255. + * This is important as many custom PAL8 video codecs that were designed + * to run on the IBM VGA graphics adapter use 6-bit palette components. + * + * For all the 8bit per pixel formats, an RGB32 palette is in data[1] like + * for pal8. This palette is filled in automatically by the function + * allocating the picture. + * + * Note, make sure that all newly added big endian formats have pix_fmt&1==1 + * and that all newly added little endian formats have pix_fmt&1==0 + * this allows simpler detection of big vs little endian. + */ +enum PixelFormat { + PIX_FMT_NONE= -1, + PIX_FMT_YUV420P, ///< planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples) + PIX_FMT_YUYV422, ///< packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr + PIX_FMT_RGB24, ///< packed RGB 8:8:8, 24bpp, RGBRGB... + PIX_FMT_BGR24, ///< packed RGB 8:8:8, 24bpp, BGRBGR... 
+ PIX_FMT_YUV422P, ///< planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples) + PIX_FMT_YUV444P, ///< planar YUV 4:4:4, 24bpp, (1 Cr & Cb sample per 1x1 Y samples) + PIX_FMT_YUV410P, ///< planar YUV 4:1:0, 9bpp, (1 Cr & Cb sample per 4x4 Y samples) + PIX_FMT_YUV411P, ///< planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples) + PIX_FMT_GRAY8, ///< Y , 8bpp + PIX_FMT_MONOWHITE, ///< Y , 1bpp, 0 is white, 1 is black + PIX_FMT_MONOBLACK, ///< Y , 1bpp, 0 is black, 1 is white + PIX_FMT_PAL8, ///< 8 bit with PIX_FMT_RGB32 palette + PIX_FMT_YUVJ420P, ///< planar YUV 4:2:0, 12bpp, full scale (JPEG) + PIX_FMT_YUVJ422P, ///< planar YUV 4:2:2, 16bpp, full scale (JPEG) + PIX_FMT_YUVJ444P, ///< planar YUV 4:4:4, 24bpp, full scale (JPEG) + PIX_FMT_XVMC_MPEG2_MC,///< XVideo Motion Acceleration via common packet passing + PIX_FMT_XVMC_MPEG2_IDCT, + PIX_FMT_UYVY422, ///< packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1 + PIX_FMT_UYYVYY411, ///< packed YUV 4:1:1, 12bpp, Cb Y0 Y1 Cr Y2 Y3 + PIX_FMT_BGR8, ///< packed RGB 3:3:2, 8bpp, (msb)2B 3G 3R(lsb) + PIX_FMT_BGR4, ///< packed RGB 1:2:1, 4bpp, (msb)1B 2G 1R(lsb) + PIX_FMT_BGR4_BYTE, ///< packed RGB 1:2:1, 8bpp, (msb)1B 2G 1R(lsb) + PIX_FMT_RGB8, ///< packed RGB 3:3:2, 8bpp, (msb)2R 3G 3B(lsb) + PIX_FMT_RGB4, ///< packed RGB 1:2:1, 4bpp, (msb)1R 2G 1B(lsb) + PIX_FMT_RGB4_BYTE, ///< packed RGB 1:2:1, 8bpp, (msb)1R 2G 1B(lsb) + PIX_FMT_NV12, ///< planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 for UV + PIX_FMT_NV21, ///< as above, but U and V bytes are swapped + + PIX_FMT_ARGB, ///< packed ARGB 8:8:8:8, 32bpp, ARGBARGB... + PIX_FMT_RGBA, ///< packed RGBA 8:8:8:8, 32bpp, RGBARGBA... + PIX_FMT_ABGR, ///< packed ABGR 8:8:8:8, 32bpp, ABGRABGR... + PIX_FMT_BGRA, ///< packed BGRA 8:8:8:8, 32bpp, BGRABGRA... 
+ + PIX_FMT_GRAY16BE, ///< Y , 16bpp, big-endian + PIX_FMT_GRAY16LE, ///< Y , 16bpp, little-endian + PIX_FMT_YUV440P, ///< planar YUV 4:4:0 (1 Cr & Cb sample per 1x2 Y samples) + PIX_FMT_YUVJ440P, ///< planar YUV 4:4:0 full scale (JPEG) + PIX_FMT_YUVA420P, ///< planar YUV 4:2:0, 20bpp, (1 Cr & Cb sample per 2x2 Y & A samples) + PIX_FMT_VDPAU_H264,///< H.264 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers + PIX_FMT_VDPAU_MPEG1,///< MPEG-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers + PIX_FMT_VDPAU_MPEG2,///< MPEG-2 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers + PIX_FMT_VDPAU_WMV3,///< WMV3 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers + PIX_FMT_VDPAU_VC1, ///< VC-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers + PIX_FMT_RGB48BE, ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, big-endian + PIX_FMT_RGB48LE, ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, little-endian + + PIX_FMT_RGB565BE, ///< packed RGB 5:6:5, 16bpp, (msb) 5R 6G 5B(lsb), big-endian + PIX_FMT_RGB565LE, ///< packed RGB 5:6:5, 16bpp, (msb) 5R 6G 5B(lsb), little-endian + PIX_FMT_RGB555BE, ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), big-endian, most significant bit to 0 + PIX_FMT_RGB555LE, ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), little-endian, most significant bit to 0 + + PIX_FMT_BGR565BE, ///< packed BGR 5:6:5, 16bpp, (msb) 5B 6G 5R(lsb), big-endian + PIX_FMT_BGR565LE, ///< packed BGR 5:6:5, 16bpp, (msb) 5B 6G 
5R(lsb), little-endian + PIX_FMT_BGR555BE, ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), big-endian, most significant bit to 1 + PIX_FMT_BGR555LE, ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), little-endian, most significant bit to 1 + + PIX_FMT_VAAPI_MOCO, ///< HW acceleration through VA API at motion compensation entry-point, Picture.data[3] contains a vaapi_render_state struct which contains macroblocks as well as various fields extracted from headers + PIX_FMT_VAAPI_IDCT, ///< HW acceleration through VA API at IDCT entry-point, Picture.data[3] contains a vaapi_render_state struct which contains fields extracted from headers + PIX_FMT_VAAPI_VLD, ///< HW decoding through VA API, Picture.data[3] contains a vaapi_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers + + PIX_FMT_YUV420P16LE, ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian + PIX_FMT_YUV420P16BE, ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian + PIX_FMT_YUV422P16LE, ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian + PIX_FMT_YUV422P16BE, ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian + PIX_FMT_YUV444P16LE, ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian + PIX_FMT_YUV444P16BE, ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian + PIX_FMT_VDPAU_MPEG4, ///< MPEG4 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers + PIX_FMT_DXVA2_VLD, ///< HW decoding through DXVA2, Picture.data[3] contains a LPDIRECT3DSURFACE9 pointer + + PIX_FMT_RGB444BE, ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), big-endian, most significant bits to 0 + PIX_FMT_RGB444LE, ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), little-endian, most significant bits 
to 0 + PIX_FMT_BGR444BE, ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), big-endian, most significant bits to 1 + PIX_FMT_BGR444LE, ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), little-endian, most significant bits to 1 + PIX_FMT_Y400A, ///< 8bit gray, 8bit alpha + PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions +}; + +#if HAVE_BIGENDIAN +# define PIX_FMT_NE(be, le) PIX_FMT_##be +#else +# define PIX_FMT_NE(be, le) PIX_FMT_##le +#endif + +#define PIX_FMT_RGB32 PIX_FMT_NE(ARGB, BGRA) +#define PIX_FMT_RGB32_1 PIX_FMT_NE(RGBA, ABGR) +#define PIX_FMT_BGR32 PIX_FMT_NE(ABGR, RGBA) +#define PIX_FMT_BGR32_1 PIX_FMT_NE(BGRA, ARGB) + +#define PIX_FMT_GRAY16 PIX_FMT_NE(GRAY16BE, GRAY16LE) +#define PIX_FMT_RGB48 PIX_FMT_NE(RGB48BE, RGB48LE) +#define PIX_FMT_RGB565 PIX_FMT_NE(RGB565BE, RGB565LE) +#define PIX_FMT_RGB555 PIX_FMT_NE(RGB555BE, RGB555LE) +#define PIX_FMT_RGB444 PIX_FMT_NE(RGB444BE, RGB444LE) +#define PIX_FMT_BGR565 PIX_FMT_NE(BGR565BE, BGR565LE) +#define PIX_FMT_BGR555 PIX_FMT_NE(BGR555BE, BGR555LE) +#define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE) + +#define PIX_FMT_YUV420P16 PIX_FMT_NE(YUV420P16BE, YUV420P16LE) +#define PIX_FMT_YUV422P16 PIX_FMT_NE(YUV422P16BE, YUV422P16LE) +#define PIX_FMT_YUV444P16 PIX_FMT_NE(YUV444P16BE, YUV444P16LE) + +#endif /* AVUTIL_PIXFMT_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/ppc/intreadwrite.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/ppc/intreadwrite.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_PPC_INTREADWRITE_H +#define AVUTIL_PPC_INTREADWRITE_H + +#include +#include "config.h" + +#if HAVE_XFORM_ASM + +#define AV_RL16 AV_RL16 +static av_always_inline uint16_t AV_RL16(const void *p) +{ + uint16_t v; + __asm__ ("lhbrx %0, %y1" : "=r"(v) : "Z"(*(const uint16_t*)p)); + return v; +} + +#define AV_WL16 AV_WL16 +static av_always_inline void AV_WL16(void *p, uint16_t v) +{ + __asm__ ("sthbrx %1, %y0" : "=Z"(*(uint16_t*)p) : "r"(v)); +} + +#define AV_RL32 AV_RL32 +static av_always_inline uint32_t AV_RL32(const void *p) +{ + uint32_t v; + __asm__ ("lwbrx %0, %y1" : "=r"(v) : "Z"(*(const uint32_t*)p)); + return v; +} + +#define AV_WL32 AV_WL32 +static av_always_inline void AV_WL32(void *p, uint32_t v) +{ + __asm__ ("stwbrx %1, %y0" : "=Z"(*(uint32_t*)p) : "r"(v)); +} + +#if HAVE_LDBRX + +#define AV_RL64 AV_RL64 +static av_always_inline uint64_t AV_RL64(const void *p) +{ + uint64_t v; + __asm__ ("ldbrx %0, %y1" : "=r"(v) : "Z"(*(const uint64_t*)p)); + return v; +} + +#define AV_WL64 AV_WL64 +static av_always_inline void AV_WL64(void *p, uint64_t v) +{ + __asm__ ("stdbrx %1, %y0" : "=Z"(*(uint64_t*)p) : "r"(v)); +} + +#else + +#define AV_RL64 AV_RL64 +static av_always_inline uint64_t AV_RL64(const void *p) +{ + union { uint64_t v; uint32_t hl[2]; } v; + __asm__ ("lwbrx %0, %y2 \n\t" + "lwbrx %1, %y3 \n\t" + : "=&r"(v.hl[1]), "=r"(v.hl[0]) + : "Z"(*(const uint32_t*)p), "Z"(*((const uint32_t*)p+1))); + return v.v; +} + +#define AV_WL64 AV_WL64 +static av_always_inline void 
AV_WL64(void *p, uint64_t v) +{ + union { uint64_t v; uint32_t hl[2]; } vv = { v }; + __asm__ ("stwbrx %2, %y0 \n\t" + "stwbrx %3, %y1 \n\t" + : "=Z"(*(uint32_t*)p), "=Z"(*((uint32_t*)p+1)) + : "r"(vv.hl[1]), "r"(vv.hl[0])); +} + +#endif /* HAVE_LDBRX */ + +#endif /* HAVE_XFORM_ASM */ + +/* + * GCC fails miserably on the packed struct version which is used by + * default, so we override it here. + */ + +#define AV_RB64(p) (*(const uint64_t *)(p)) +#define AV_WB64(p, v) (*(uint64_t *)(p) = (v)) + +#endif /* AVUTIL_PPC_INTREADWRITE_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/ppc/timer.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/ppc/timer.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2005 Luca Barbato + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_PPC_TIMER_H +#define AVUTIL_PPC_TIMER_H + +#include <stdint.h> + +#define AV_READ_TIME read_time + +static inline uint64_t read_time(void) +{ + uint32_t tbu, tbl, temp; + + /* from section 2.2.1 of the 32-bit PowerPC PEM */ + __asm__ volatile( + "1:\n" + "mftbu %2\n" + "mftb %0\n" + "mftbu %1\n" + "cmpw %2,%1\n" + "bne 1b\n" + : "=r"(tbl), "=r"(tbu), "=r"(temp) + : + : "cc"); + + return (((uint64_t)tbu)<<32) | (uint64_t)tbl; +} + +#endif /* AVUTIL_PPC_TIMER_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/timer.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/timer.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,69 @@ +/** + * @file + * high precision timer, useful to profile code + * + * copyright (c) 2006 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_TIMER_H +#define AVUTIL_TIMER_H + +#include <stdlib.h> +#include <stdint.h> +#include "config.h" + +#if ARCH_ARM +# include "arm/timer.h" +#elif ARCH_PPC +# include "ppc/timer.h" +#elif ARCH_X86 +# include "x86/timer.h" +#endif + +#if !defined(AV_READ_TIME) && HAVE_GETHRTIME +# define AV_READ_TIME gethrtime +#endif + +#ifdef AV_READ_TIME +#define START_TIMER \ +uint64_t tend;\ +uint64_t tstart= AV_READ_TIME();\ + +#define STOP_TIMER(id) \ +tend= AV_READ_TIME();\ +{\ + static uint64_t tsum=0;\ + static int tcount=0;\ + static int tskip_count=0;\ + if(tcount<2 || tend - tstart < 8*tsum/tcount || tend - tstart < 2000){\ + tsum+= tend - tstart;\ + tcount++;\ + }else\ + tskip_count++;\ + if(((tcount+tskip_count)&(tcount+tskip_count-1))==0){\ + av_log(NULL, AV_LOG_ERROR, "%"PRIu64" dezicycles in %s, %d runs, %d skips\n",\ + tsum*10/tcount, id, tcount, tskip_count);\ + }\ +} +#else +#define START_TIMER +#define STOP_TIMER(id) {} +#endif + +#endif /* AVUTIL_TIMER_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/x86/bswap.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/x86/bswap.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,61 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * byte swapping routines + */ + +#ifndef AVUTIL_X86_BSWAP_H +#define AVUTIL_X86_BSWAP_H + +#include <stdint.h> +#include "config.h" +#include "libavutil/attributes.h" + +#define bswap_16 bswap_16 +static av_always_inline av_const uint16_t bswap_16(uint16_t x) +{ + __asm__("rorw $8, %0" : "+r"(x)); + return x; +} + +#define bswap_32 bswap_32 +static av_always_inline av_const uint32_t bswap_32(uint32_t x) +{ +// #if HAVE_BSWAP + __asm__("bswap %0" : "+r" (x)); +// #else +// __asm__("rorw $8, %w0 \n\t" +// "rorl $16, %0 \n\t" +// "rorw $8, %w0" +// : "+r"(x)); +// #endif + return x; +} + +#if ARCH_X86_64 +#define bswap_64 bswap_64 +static inline uint64_t av_const bswap_64(uint64_t x) +{ + __asm__("bswap %0": "=r" (x) : "0" (x)); + return x; +} +#endif + +#endif /* AVUTIL_X86_BSWAP_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/x86/intreadwrite.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/x86/intreadwrite.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2010 Alexander Strange + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_INTREADWRITE_H +#define AVUTIL_X86_INTREADWRITE_H + +#include <stdint.h> +#include "config.h" +#include "libavutil/attributes.h" + +#if HAVE_MMX + +#if defined(__MMX__) + +#define AV_COPY64 AV_COPY64 +static av_always_inline void AV_COPY64(void *d, const void *s) +{ + __asm__("movq %1, %%mm0 \n\t" + "movq %%mm0, %0 \n\t" + : "=m"(*(uint64_t*)d) + : "m" (*(const uint64_t*)s) + : "mm0"); +} + +#define AV_SWAP64 AV_SWAP64 +static av_always_inline void AV_SWAP64(void *a, void *b) +{ + __asm__("movq %1, %%mm0 \n\t" + "movq %0, %%mm1 \n\t" + "movq %%mm0, %0 \n\t" + "movq %%mm1, %1 \n\t" + : "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b) + ::"mm0", "mm1"); +} + +#define AV_ZERO64 AV_ZERO64 +static av_always_inline void AV_ZERO64(void *d) +{ + __asm__("pxor %%mm0, %%mm0 \n\t" + "movq %%mm0, %0 \n\t" + : "=m"(*(uint64_t*)d) + :: "mm0"); +} + +#endif /* defined(__MMX__) */ + +#ifdef __SSE__ + +#define AV_COPY128 AV_COPY128 +static av_always_inline void AV_COPY128(void *d, const void *s) +{ + struct v {uint64_t v[2];}; + + __asm__("movaps %1, %%xmm0 \n\t" + "movaps %%xmm0, %0 \n\t" + : "=m"(*(struct v*)d) + : "m" (*(const struct v*)s) + : "xmm0"); +} + +#endif /* __SSE__ */ + +#ifdef __SSE2__ + +#define AV_ZERO128 AV_ZERO128 +static av_always_inline void AV_ZERO128(void *d) +{ + struct v {uint64_t v[2];}; + + __asm__("pxor %%xmm0, %%xmm0 \n\t" + "movdqa %%xmm0, %0 \n\t" + : "=m"(*(struct v*)d) + :: "xmm0"); +} + +#endif /* __SSE2__ */ + +#endif /* HAVE_MMX */ + +#endif /* AVUTIL_X86_INTREADWRITE_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/x86/timer.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/x86/timer.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,35 @@ +/* + * copyright (c) 2006 Michael Niedermayer + * + *
This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_TIMER_H +#define AVUTIL_X86_TIMER_H + +#include <stdint.h> + +#define AV_READ_TIME read_time + +static inline uint64_t read_time(void) +{ + uint32_t a, d; + __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); + return ((uint64_t)d << 32) + a; +} + +#endif /* AVUTIL_X86_TIMER_H */ diff -r 11d15c47beaf -r 897f711a7157 libavutil/x86_cpu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libavutil/x86_cpu.h Tue Sep 25 15:55:33 2012 +0200 @@ -0,0 +1,73 @@ +/* + * copyright (c) 2006 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_CPU_H +#define AVUTIL_X86_CPU_H + +#include <stdint.h> +#include "config.h" + +#if ARCH_X86_64 +# define REG_a "rax" +# define REG_b "rbx" +# define REG_c "rcx" +# define REG_d "rdx" +# define REG_D "rdi" +# define REG_S "rsi" +# define PTR_SIZE "8" +typedef int64_t x86_reg; + +# define REG_SP "rsp" +# define REG_BP "rbp" +# define REGBP rbp +# define REGa rax +# define REGb rbx +# define REGc rcx +# define REGd rdx +# define REGSP rsp + +#elif ARCH_X86_32 + +# define REG_a "eax" +# define REG_b "ebx" +# define REG_c "ecx" +# define REG_d "edx" +# define REG_D "edi" +# define REG_S "esi" +# define PTR_SIZE "4" +typedef int32_t x86_reg; + +# define REG_SP "esp" +# define REG_BP "ebp" +# define REGBP ebp +# define REGa eax +# define REGb ebx +# define REGc ecx +# define REGd edx +# define REGSP esp +#else +typedef int x86_reg; +#endif + +// #if ARCH_X86_64 && defined(PIC) +// # define BROKEN_RELOCATIONS 1 +// #endif + +#endif /* AVUTIL_X86_CPU_H */